IsoSpec  1.95
marginalTrek++.cpp
1 /*
2  * Copyright (C) 2015-2018 Mateusz Łącki and Michał Startek.
3  *
4  * This file is part of IsoSpec.
5  *
6  * IsoSpec is free software: you can redistribute it and/or modify
7  * it under the terms of the Simplified ("2-clause") BSD licence.
8  *
9  * IsoSpec is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12  *
13  * You should have received a copy of the Simplified BSD Licence
14  * along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
15  */
16 
17 
18 #include <cmath>
19 #include <algorithm>
20 #include <vector>
21 #include <stdlib.h>
22 #include <tuple>
23 #include <unordered_map>
24 #include <unordered_set>
25 #include <queue>
26 #include <utility>
27 #include <iostream>
28 #include <string.h>
29 #include <string>
30 #include <limits>
31 #include <cstdlib>
32 #include <fenv.h>
33 #include "platform.h"
34 #include "marginalTrek++.h"
35 #include "conf.h"
36 #include "allocator.h"
37 #include "operators.h"
38 #include "summator.h"
39 #include "element_tables.h"
40 #include "misc.h"
41 
42 
43 namespace IsoSpec
44 {
45 
47 
55 Conf initialConfigure(const int atomCnt, const int isotopeNo, const double* probs, const double* lprobs)
56 {
61  Conf res = new int[isotopeNo];
62 
63  // This approximates the mode (heuristics: the mean is close to the mode).
64  for(int i = 0; i < isotopeNo; ++i )
65  res[i] = int( atomCnt * probs[i] ) + 1;
66 
67  // The number of assigned atoms above.
68  int s = 0;
69 
70  for(int i = 0; i < isotopeNo; ++i) s += res[i];
71 
72  int diff = atomCnt - s;
73 
74  // Too little: enlarging fist index.
75  if( diff > 0 ){
76  res[0] += diff;
77  }
78  // Too much: trying to redistribute the difference: hopefully the first element is the largest.
79  if( diff < 0 ){
80  diff = abs(diff);
81  int i = 0, coordDiff = 0;
82 
83  while( diff > 0){
84  coordDiff = res[i] - diff;
85 
86  if( coordDiff >= 0 ){
87  res[i] -= diff;
88  diff = 0;
89  } else {
90  res[i] = 0;
91  i++;
92  diff = abs(coordDiff);
93  }
94  }
95  }
96 
97  // What we computed so far will be very close to the mode: hillclimb the rest of the way
98 
99  bool modified = true;
100  double LP = unnormalized_logProb(res, lprobs, isotopeNo);
101  double NLP;
102 
103  while(modified)
104  {
105  modified = false;
106  for(int ii = 0; ii<isotopeNo; ii++)
107  for(int jj = 0; jj<isotopeNo; jj++)
108  if(ii != jj && res[ii] > 0)
109  {
110  res[ii]--;
111  res[jj]++;
112  NLP = unnormalized_logProb(res, lprobs, isotopeNo);
113  if(NLP>LP || (NLP==LP && ii>jj))
114  {
115  modified = true;
116  LP = NLP;
117  }
118  else
119  {
120  res[ii]++;
121  res[jj]--;
122  }
123  }
124 
125 
126  }
127  return res;
128 }
129 
130 
131 
132 #if !ISOSPEC_BUILDING_R
133 void printMarginal( const std::tuple<double*,double*,int*,int>& results, int dim)
134 {
135  for(int i=0; i<std::get<3>(results); i++){
136 
137  std::cout << "Mass = " << std::get<0>(results)[i] <<
138  " log-prob =\t" << std::get<1>(results)[i] <<
139  " prob =\t" << exp(std::get<1>(results)[i]) <<
140  "\tand configuration =\t";
141 
142  for(int j=0; j<dim; j++) std::cout << std::get<2>(results)[i*dim + j] << " ";
143 
144  std::cout << std::endl;
145  }
146 }
147 #endif
148 
149 
150 double* getMLogProbs(const double* probs, int isoNo)
151 {
157  int curr_method = fegetround();
158  fesetround(FE_UPWARD);
159  double* ret = new double[isoNo];
160 
161  // here we change the table of probabilities and log it.
162  for(int i = 0; i < isoNo; i++)
163  {
164  ret[i] = log(probs[i]);
165  for(int j=0; j<ISOSPEC_NUMBER_OF_ISOTOPIC_ENTRIES; j++)
166  if(elem_table_probability[j] == probs[i])
167  {
168  ret[i] = elem_table_log_probability[j];
169  break;
170  }
171  }
172  fesetround(curr_method);
173  return ret;
174 }
175 
176 double get_loggamma_nominator(int x)
177 {
178  // calculate log gamma of the nominator calculated in the binomial exression.
179  int curr_method = fegetround();
180  fesetround(FE_UPWARD);
181  double ret = lgamma(x+1);
182  fesetround(curr_method);
183  return ret;
184 }
185 
186 
188  const double* _masses,
189  const double* _probs,
190  int _isotopeNo,
191  int _atomCnt
192 ) :
193 disowned(false),
194 isotopeNo(_isotopeNo),
195 atomCnt(_atomCnt),
196 atom_masses(array_copy<double>(_masses, _isotopeNo)),
197 atom_lProbs(getMLogProbs(_probs, isotopeNo)),
198 loggamma_nominator(get_loggamma_nominator(_atomCnt)),
199 mode_conf(initialConfigure(atomCnt, isotopeNo, _probs, atom_lProbs)),
200 mode_lprob(loggamma_nominator+unnormalized_logProb(mode_conf, atom_lProbs, isotopeNo)),
201 mode_mass(mass(mode_conf, atom_masses, isotopeNo)),
202 mode_prob(exp(mode_lprob)),
203 smallest_lprob(atomCnt * *std::min_element(atom_lProbs, atom_lProbs+isotopeNo))
204 {
205  try
206  {
207  if(ISOSPEC_G_FACT_TABLE_SIZE-1 <= atomCnt)
208  throw std::length_error("Subisotopologue too large, size limit (that is, the maximum number of atoms of a single element in a molecule) is: " + std::to_string(ISOSPEC_G_FACT_TABLE_SIZE-1));
209  for(size_t ii = 0; ii < isotopeNo; ii++)
210  if(_probs[ii] <= 0.0 || _probs[ii] > 1.0)
211  throw std::invalid_argument("All isotope probabilities p must fulfill: 0.0 < p <= 1.0");
212  }
213  catch(...)
214  {
215  delete[] atom_masses;
216  delete[] atom_lProbs;
217  delete[] mode_conf;
218  throw;
219  }
220 }
221 
222 // the move-constructor: used in the specialization of the marginal.
224 disowned(other.disowned),
225 isotopeNo(other.isotopeNo),
226 atomCnt(other.atomCnt),
227 atom_masses(other.atom_masses),
228 atom_lProbs(other.atom_lProbs),
230 mode_conf(other.mode_conf),
231 mode_lprob(other.mode_lprob),
232 mode_mass(other.mode_mass),
233 mode_prob(other.mode_prob),
235 {
236  other.disowned = true;
237 }
238 
240 {
241  if(!disowned)
242  {
243  delete[] atom_masses;
244  delete[] atom_lProbs;
245  delete[] mode_conf;
246  }
247 }
248 
249 
251 {
252  double ret_mass = std::numeric_limits<double>::infinity();
253  for(unsigned int ii=0; ii < isotopeNo; ii++)
254  if( ret_mass > atom_masses[ii] )
255  ret_mass = atom_masses[ii];
256  return ret_mass*atomCnt;
257 }
258 
260 {
261  double ret_mass = 0.0;
262  for(unsigned int ii=0; ii < isotopeNo; ii++)
263  if( ret_mass < atom_masses[ii] )
264  ret_mass = atom_masses[ii];
265  return ret_mass*atomCnt;
266 }
267 
268 // this is roughly an equivalent of IsoSpec-Threshold-Generator
270  Marginal&& m,
271  int tabSize,
272  int hashSize
273 ) :
274 Marginal(std::move(m)),
275 current_count(0),
276 keyHasher(isotopeNo),
277 equalizer(isotopeNo),
278 orderMarginal(atom_lProbs, isotopeNo),
279 visited(hashSize,keyHasher,equalizer),
280 pq(orderMarginal),
281 totalProb(),
282 candidate(new int[isotopeNo]),
283 allocator(isotopeNo, tabSize)
284 {
285  int* initialConf = allocator.makeCopy(mode_conf);
286 
287  pq.push(initialConf);
288  visited[initialConf] = 0;
289 
290  totalProb = Summator();
291 
292  current_count = 0;
293 
294  add_next_conf();
295 }
296 
297 
298 bool MarginalTrek::add_next_conf()
299 {
304  if(pq.size() < 1) return false;
305 
306  Conf topConf = pq.top();
307  pq.pop();
308  ++current_count;
309  visited[topConf] = current_count;
310 
311  _confs.push_back(topConf);
312  _conf_masses.push_back(mass(topConf, atom_masses, isotopeNo));
313  double logprob = logProb(topConf);
314  _conf_lprobs.push_back(logprob);
315 
316 
317  totalProb.add( exp( logprob ) );
318 
319  for( unsigned int i = 0; i < isotopeNo; ++i )
320  {
321  for( unsigned int j = 0; j < isotopeNo; ++j )
322  {
323  // Growing index different than decreasing one AND
324  // Remain on simplex condition.
325  if( i != j && topConf[j] > 0 ){
326  copyConf(topConf, candidate, isotopeNo);
327 
328  ++candidate[i];
329  --candidate[j];
330 
331  // candidate should not have been already visited.
332  if( visited.count( candidate ) == 0 )
333  {
334  Conf acceptedCandidate = allocator.makeCopy(candidate);
335  pq.push(acceptedCandidate);
336 
337  visited[acceptedCandidate] = 0;
338  }
339  }
340  }
341  }
342 
343  return true;
344 }
345 
347 {
348  Summator s;
349  int last_idx = -1;
350  for(unsigned int i=0; i<_conf_lprobs.size(); i++)
351  {
352  s.add(_conf_lprobs[i]);
353  if(s.get() >= cutoff)
354  {
355  last_idx = i;
356  break;
357  }
358  }
359  if(last_idx > -1)
360  return last_idx;
361 
362  while(totalProb.get() < cutoff && add_next_conf()) {}
363  return _conf_lprobs.size();
364 }
365 
366 
367 MarginalTrek::~MarginalTrek()
368 {
369  delete[] candidate;
370 }
371 
372 
373 
374 
376  double lCutOff,
377  bool sort,
378  int tabSize,
379  int hashSize
380 ) : Marginal(std::move(m)),
381 allocator(isotopeNo, tabSize)
382 {
383  const ConfEqual equalizer(isotopeNo);
384  const KeyHasher keyHasher(isotopeNo);
385  const ConfOrderMarginalDescending orderMarginal(atom_lProbs, isotopeNo);
386 
387  std::unordered_set<Conf,KeyHasher,ConfEqual> visited(hashSize,keyHasher,equalizer);
388 
389  Conf currentConf = allocator.makeCopy(mode_conf);
390  if(logProb(currentConf) >= lCutOff)
391  {
392  // create a copy and store a ptr to the *same* copy in both structures
393  // (save some space and time)
394  auto tmp = allocator.makeCopy(currentConf);
395  configurations.push_back(tmp);
396  visited.insert(tmp);
397  }
398 
399  unsigned int idx = 0;
400 
401  while(idx < configurations.size())
402  {
403  memcpy(currentConf, configurations[idx], sizeof(int)*isotopeNo);
404  idx++;
405  for(unsigned int ii = 0; ii < isotopeNo; ii++ )
406  for(unsigned int jj = 0; jj < isotopeNo; jj++ )
407  if( ii != jj && currentConf[jj] > 0)
408  {
409  currentConf[ii]++;
410  currentConf[jj]--;
411 
412  if (visited.count(currentConf) == 0 && logProb(currentConf) >= lCutOff)
413  {
414  // create a copy and store a ptr to the *same* copy in
415  // both structures (save some space and time)
416  auto tmp = allocator.makeCopy(currentConf);
417  visited.insert(tmp);
418  configurations.push_back(tmp);
419  // std::cout << " V: "; for (auto it : visited) std::cout << it << " "; std::cout << std::endl;
420  }
421 
422  currentConf[ii]--;
423  currentConf[jj]++;
424 
425  }
426  }
427 
428  // orderMarginal defines the order of configurations (compares their logprobs)
429  // akin to key in Python sort.
430  if(sort)
431  std::sort(configurations.begin(), configurations.end(), orderMarginal);
432 
433 
434  confs = &configurations[0];
435  no_confs = configurations.size();
436  lProbs = new double[no_confs+1];
437  probs = new double[no_confs];
438  masses = new double[no_confs];
439 
440 
441  for(unsigned int ii=0; ii < no_confs; ii++)
442  {
443  lProbs[ii] = logProb(confs[ii]);
444  probs[ii] = exp(lProbs[ii]);
445  masses[ii] = mass(confs[ii], atom_masses, isotopeNo);
446  }
447  lProbs[no_confs] = -std::numeric_limits<double>::infinity();
448 }
449 
450 
452 {
453  if(lProbs != nullptr)
454  delete[] lProbs;
455  if(masses != nullptr)
456  delete[] masses;
457  if(probs != nullptr)
458  delete[] probs;
459 }
460 
461 
462 
463 
464 } // namespace IsoSpec
465 
double getLightestConfMass() const
Get the mass of the lightest subisotopologue.
const double mode_prob
virtual ~PrecalculatedMarginal()
Destructor.
The marginal distribution class (a subisotopologue).
int processUntilCutoff(double cutoff)
Calculate subisotopologues with probability above or equal to the cut-off.
MarginalTrek(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
const unsigned int isotopeNo
const Conf mode_conf
const unsigned int atomCnt
virtual ~Marginal()
Destructor.
const double *const atom_lProbs
double getHeaviestConfMass() const
Get the mass of the heaviest subisotopologue.
const double loggamma_nominator
Conf initialConfigure(const int atomCnt, const int isotopeNo, const double *probs, const double *lprobs)
Find one of the most probable subisotopologues.
const double smallest_lprob
Marginal(const double *_masses, const double *_probs, int _isotopeNo, int _atomCnt)
Class constructor.
const double mode_lprob
PrecalculatedMarginal(Marginal &&m, double lCutOff, bool sort=true, int tabSize=1000, int hashSize=1000)
The move constructor (disowns the Marginal).
const double *const atom_masses
const double mode_mass
double * getMLogProbs(const double *probs, int isoNo)
double logProb(Conf conf) const
Calculate the log-probability of a given subisotopologue.