// Parallel prefix sum of `in` (length `size`).
// Returns a newly allocated array `out` of length size+1 with out[0] == 0 and
// out[i] == in[0] + ... + in[i-1]; the caller owns the result (delete []).
// `nthreads` sizes the per-thread partial-sum table; with THREADED undefined
// the function degenerates to a serial scan.
template <typename T>
T * prefixsum(T *in, int size, int nthreads)
{
    std::vector<T> tsum(nthreads+1);    // tsum[t+1] = total of thread t's chunk
    tsum[0] = 0;
    T* out = new T[size+1];
    out[0] = 0;
    T* psum = &out[1];                  // inclusive scan written here, shifted by one
#ifdef THREADED
#pragma omp parallel
#endif
    {
        int ithread = 0;
#ifdef THREADED
        ithread = omp_get_thread_num();
#endif
        T sum = 0;
        // Pass 1: each thread scans its own static chunk and records its total.
#pragma omp for schedule(static)
        for (int i=0; i<size; i++)
        {
            sum += in[i];
            psum[i] = sum;
        }
        tsum[ithread+1] = sum;
#pragma omp barrier
        // Every thread accumulates the totals of all lower-numbered threads.
        T offset = 0;
        for(int i=0; i<(ithread+1); i++)
        {
            offset += tsum[i];
        }
        // Pass 2: shift the chunk by that offset.  The identical static
        // schedule guarantees each thread revisits exactly the chunk it
        // scanned in pass 1.
#pragma omp for schedule(static)
        for (int i=0; i<size; i++)
        {
            psum[i] += offset;
        }
    }
    return out;
}
76 template <
typename SR,
typename NTO,
typename IT,
typename NT1,
typename NT2>
78 (
const SpDCCols<IT, NT1> &
A,
79 const SpDCCols<IT, NT2> &
B,
80 bool clearA,
bool clearB)
82 IT mdim = A.getnrow();
83 IT ndim = B.getncol();
85 if(A.isZero() || B.isZero())
87 return new SpTuples<IT, NTO>(0, mdim, ndim);
91 Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
92 Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();
94 float cf =
static_cast<float>(nA+1) / static_cast<float>(Adcsc->nzc);
95 IT csize =
static_cast<IT
>(ceil(cf));
97 Adcsc->ConstructAux(nA, aux);
104 numThreads = omp_get_num_threads();
109 IT* colptrC = prefixsum<IT>(colnnzC, Bdcsc->nzc, numThreads);
111 IT nnzc = colptrC[Bdcsc->nzc];
112 std::tuple<IT,IT,NTO> * tuplesC =
static_cast<std::tuple<IT,IT,NTO> *
> (::operator
new (
sizeof(std::tuple<IT,IT,NTO>[nnzc])));
118 std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
119 std::vector<std::vector<HeapEntry<IT,NT1>>> globalheapVec(numThreads);
121 for(
int i=0; i<numThreads; i++)
123 colindsVec[i].resize(nnzA/numThreads);
124 globalheapVec[i].resize(nnzA/numThreads);
129 #pragma omp parallel for 131 for(
int i=0; i < Bdcsc->nzc; ++i)
133 size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];
136 myThread = omp_get_thread_num();
138 if(colindsVec[myThread].
size() < nnzcolB)
140 colindsVec[myThread].resize(nnzcolB);
141 globalheapVec[myThread].resize(nnzcolB);
147 Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
148 std::pair<IT,IT> * colinds = colindsVec[myThread].data();
149 HeapEntry<IT,NT1> * wset = globalheapVec[myThread].data();
153 for(IT j = 0; (unsigned)j < nnzcolB; ++j)
155 if(colinds[j].first != colinds[j].second)
157 wset[hsize++] = HeapEntry< IT,NT1 > (Adcsc->ir[colinds[j].first], j, Adcsc->numx[colinds[j].first]);
160 std::make_heap(wset, wset+hsize);
162 IT curptr = colptrC[i];
165 std::pop_heap(wset, wset + hsize);
166 IT locb = wset[hsize-1].runr;
168 NTO mrhs =
SR::multiply(wset[hsize-1].num, Bdcsc->numx[Bdcsc->cp[i]+locb]);
169 if (!SR::returnedSAID())
171 if( (curptr > colptrC[i]) && std::get<0>(tuplesC[curptr-1]) == wset[hsize-1].key)
173 std::get<2>(tuplesC[curptr-1]) = SR::add(std::get<2>(tuplesC[curptr-1]), mrhs);
177 tuplesC[curptr++]= std::make_tuple(wset[hsize-1].key, Bdcsc->jc[i], mrhs) ;
182 if( (++(colinds[locb].first)) != colinds[locb].second)
185 wset[hsize-1].key = Adcsc->ir[colinds[locb].first];
186 wset[hsize-1].num = Adcsc->numx[colinds[locb].first];
187 std::push_heap(wset, wset+hsize);
197 delete const_cast<SpDCCols<IT, NT1> *
>(&
A);
199 delete const_cast<SpDCCols<IT, NT2> *
>(&
B);
204 SpTuples<IT, NTO>* spTuplesC =
new SpTuples<IT, NTO> (nnzc, mdim, ndim, tuplesC,
true,
true);
210 template <
typename IT,
typename NT1,
typename NT2>
211 IT*
estimateNNZ(
const SpDCCols<IT, NT1> & A,
const SpDCCols<IT, NT2> & B)
213 IT nnzA = A.getnnz();
214 if(A.isZero() || B.isZero())
219 Dcsc<IT,NT1>* Adcsc = A.GetDCSC();
220 Dcsc<IT,NT2>* Bdcsc = B.GetDCSC();
222 float cf =
static_cast<float>(A.getncol()+1) / static_cast<float>(Adcsc->nzc);
223 IT csize =
static_cast<IT
>(ceil(cf));
225 Adcsc->ConstructAux(A.getncol(), aux);
232 numThreads = omp_get_num_threads();
237 IT* colnnzC =
new IT[Bdcsc->nzc];
240 #pragma omp parallel for 242 for(IT i=0; i< Bdcsc->nzc; ++i)
248 std::vector<std::vector< std::pair<IT,IT>>> colindsVec(numThreads);
249 std::vector<std::vector<std::pair<IT,IT>>> globalheapVec(numThreads);
252 for(
int i=0; i<numThreads; i++)
254 colindsVec[i].resize(nnzA/numThreads);
255 globalheapVec[i].resize(nnzA/numThreads);
259 #pragma omp parallel for 261 for(
int i=0; i < Bdcsc->nzc; ++i)
263 size_t nnzcolB = Bdcsc->cp[i+1] - Bdcsc->cp[i];
266 myThread = omp_get_thread_num();
268 if(colindsVec[myThread].
size() < nnzcolB)
270 colindsVec[myThread].resize(nnzcolB);
271 globalheapVec[myThread].resize(nnzcolB);
276 Adcsc->FillColInds(Bdcsc->ir + Bdcsc->cp[i], nnzcolB, colindsVec[myThread], aux, csize);
277 std::pair<IT,IT> * colinds = colindsVec[myThread].data();
278 std::pair<IT,IT> * curheap = globalheapVec[myThread].data();
282 for(IT j = 0; (unsigned)j < nnzcolB; ++j)
284 if(colinds[j].first != colinds[j].second)
286 curheap[hsize++] = std::make_pair(Adcsc->ir[colinds[j].first], j);
289 std::make_heap(curheap, curheap+hsize, std::greater<std::pair<IT,IT>>());
295 std::pop_heap(curheap, curheap + hsize, std::greater<std::pair<IT,IT>>());
296 IT locb = curheap[hsize-1].second;
298 if( curheap[hsize-1].first != prevRow)
300 prevRow = curheap[hsize-1].first;
304 if( (++(colinds[locb].first)) != colinds[locb].second)
306 curheap[hsize-1].first = Adcsc->ir[colinds[locb].first];
307 std::push_heap(curheap, curheap+hsize, std::greater<std::pair<IT,IT>>());
// Function index (residue from generated documentation; kept for reference):
//   T *                    prefixsum(T *in, int size, int nthreads)
//   IT *                   estimateNNZ(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B)
//   SpTuples< IT, NTO > *  LocalSpGEMM(const SpDCCols< IT, NT1 > &A, const SpDCCols< IT, NT2 > &B, bool clearA, bool clearB)
//   SpDCCols< IT, NT > *   multiply(SpDCCols< IT, NT > &splitA, SpDCCols< IT, NT > &splitB, CCGrid &CMG, bool isBT, bool threaded)