libStatGen Software  1
Cigar.cpp
1 /*
2  * Copyright (C) 2010-2011 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include "Cigar.h"
21 #include "STLUtilities.h"
22 
23 // Initialize INDEX_NA.
24 const int32_t Cigar::INDEX_NA = -1;
25 
26 
27 ////////////////////////////////////////////////////////////////////////
28 //
29 // Cigar Class
30 //
31 
32 //
33 // Set the passed in string to the string reprentation of the Cigar operations
34 // in this object.
35 //
36 void Cigar::getCigarString(std::string& cigarString) const
37 {
38  using namespace STLUtilities;
39 
40  std::vector<CigarOperator>::const_iterator i;
41 
42  cigarString.clear(); // clear result string
43 
44  // Progressively append the character representations of the operations to
45  // the cigar string.
46  for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
47  {
48  cigarString << (*i).count << (*i).getChar();
49  }
50 }
51 
52 void Cigar::getCigarString(String& cigarString) const
53 {
54  std::string cigar;
55 
56  getCigarString(cigar);
57 
58  cigarString = cigar.c_str();
59 
60  return;
61 }
62 
63 void Cigar::getExpandedString(std::string &s) const
64 {
65  s = "";
66 
67  std::vector<CigarOperator>::const_iterator i;
68 
69  // Progressively append the character representations of the operations to
70  // the string passed in
71 
72  for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
73  {
74  for (uint32_t j = 0; j<(*i).count; j++) s += (*i).getChar();
75  }
76  return;
77 }
78 
79 
80 bool Cigar::operator == (Cigar &rhs) const
81 {
82 
83  if (this->size() != rhs.size()) return false;
84 
85  for (int i = 0; i < this->size(); i++)
86  {
87  if (cigarOperations[i]!=rhs.cigarOperations[i]) return false;
88  }
89  return true;
90 }
91 
92 
93 // return the length of the read that corresponds to
94 // the current CIGAR string.
96 {
97  int matchCount = 0;
98  std::vector<CigarOperator>::const_iterator i;
99  for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
100  {
101  switch (i->operation)
102  {
103  case match:
104  case mismatch:
105  case softClip:
106  case insert:
107  matchCount += i->count;
108  break;
109  default:
110  // we only care about operations that are in the query sequence.
111  break;
112  }
113  }
114  return matchCount;
115 }
116 
117 
118 // return the number of bases in the reference that
119 // this read "spans"
121 {
122  int matchCount = 0;
123  std::vector<CigarOperator>::const_iterator i;
124  for (i = cigarOperations.begin(); i != cigarOperations.end(); i++)
125  {
126  switch (i->operation)
127  {
128  case match:
129  case mismatch:
130  case del:
131  case skip:
132  matchCount += i->count;
133  break;
134  default:
135  // we only care about operations that are in the reference sequence.
136  break;
137  }
138  }
139  return matchCount;
140 }
141 
142 
143 // Return the number of clips that are at the beginning of the cigar.
145 {
146  int numBeginClips = 0;
147  for (unsigned int i = 0; i != cigarOperations.size(); i++)
148  {
149  if ((cigarOperations[i].operation == softClip) ||
150  (cigarOperations[i].operation == hardClip))
151  {
152  // Clipping operator, increment the counter.
153  numBeginClips += cigarOperations[i].count;
154  }
155  else
156  {
157  // Break out of the loop since a non-clipping operator was found.
158  break;
159  }
160  }
161  return(numBeginClips);
162 }
163 
164 
165 // Return the number of clips that are at the end of the cigar.
167 {
168  int numEndClips = 0;
169  for (int i = (cigarOperations.size() - 1); i >= 0; i--)
170  {
171  if ((cigarOperations[i].operation == softClip) ||
172  (cigarOperations[i].operation == hardClip))
173  {
174  // Clipping operator, increment the counter.
175  numEndClips += cigarOperations[i].count;
176  }
177  else
178  {
179  // Break out of the loop since a non-clipping operator was found.
180  break;
181  }
182  }
183  return(numEndClips);
184 }
185 
186 
187 int32_t Cigar::getRefOffset(int32_t queryIndex)
188 {
189  // If the vectors aren't set, set them.
190  if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
191  {
192  setQueryAndReferenceIndexes();
193  }
194  if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
195  {
196  return(INDEX_NA);
197  }
198  return(queryToRef[queryIndex]);
199 }
200 
201 
202 int32_t Cigar::getQueryIndex(int32_t refOffset)
203 {
204  // If the vectors aren't set, set them.
205  if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
206  {
207  setQueryAndReferenceIndexes();
208  }
209  if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
210  {
211  return(INDEX_NA);
212  }
213  return(refToQuery[refOffset]);
214 }
215 
216 
217 int32_t Cigar::getRefPosition(int32_t queryIndex, int32_t queryStartPos)
218 {
219  // If the vectors aren't set, set them.
220  if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
221  {
222  setQueryAndReferenceIndexes();
223  }
224  if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToRef.size()))
225  {
226  return(INDEX_NA);
227  }
228 
229  if (queryToRef[queryIndex] != INDEX_NA)
230  {
231  return(queryToRef[queryIndex] + queryStartPos);
232  }
233  return(INDEX_NA);
234 }
235 
236 
237 // Return the query index associated with the specified reference position
238 // when the query starts at the specified reference position based on
239 // this cigar.
240 int32_t Cigar::getQueryIndex(int32_t refPosition, int32_t queryStartPos)
241 {
242  // If the vectors aren't set, set them.
243  if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
244  {
245  setQueryAndReferenceIndexes();
246  }
247 
248  int32_t refOffset = refPosition - queryStartPos;
249  if ((refOffset < 0) || ((uint32_t)refOffset >= refToQuery.size()))
250  {
251  return(INDEX_NA);
252  }
253 
254  return(refToQuery[refOffset]);
255 }
256 
257 
259 {
260  // If the vectors aren't set, set them.
261  if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
262  {
263  setQueryAndReferenceIndexes();
264  }
265  if ((queryIndex < 0) || ((uint32_t)queryIndex >= queryToCigar.size()))
266  {
267  return(INDEX_NA);
268  }
269  return(queryToCigar[queryIndex]);
270 }
271 
272 
274 {
275  // If the vectors aren't set, set them.
276  if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
277  {
278  setQueryAndReferenceIndexes();
279  }
280  if ((refOffset < 0) || ((uint32_t)refOffset >= refToCigar.size()))
281  {
282  return(INDEX_NA);
283  }
284  return(refToCigar[refOffset]);
285 }
286 
287 
288 int32_t Cigar::getExpandedCigarIndexFromRefPos(int32_t refPosition,
289  int32_t queryStartPos)
290 {
291  return(getExpandedCigarIndexFromRefOffset(refPosition - queryStartPos));
292 }
293 
294 
295 char Cigar::getCigarCharOp(int32_t expandedCigarIndex)
296 {
297  // Check if the expanded cigar has been set yet
298  if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
299  {
300  // Set the expanded cigar.
301  setQueryAndReferenceIndexes();
302  }
303 
304  // Check to see if the index is in range.
305  if((expandedCigarIndex < 0) ||
306  ((uint32_t)expandedCigarIndex >= myExpandedCigar.length()))
307  {
308  return('?');
309  }
310  return(myExpandedCigar[expandedCigarIndex]);
311 }
312 
313 
314 char Cigar::getCigarCharOpFromQueryIndex(int32_t queryIndex)
315 {
317 }
318 
319 
320 char Cigar::getCigarCharOpFromRefOffset(int32_t refOffset)
321 {
323 }
324 
325 
326 char Cigar::getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos)
327 {
328  return(getCigarCharOp(getExpandedCigarIndexFromRefPos(refPosition, queryStartPos)));
329 }
330 
331 
332 // Return the number of bases that overlap the reference and the
333 // read associated with this cigar that falls within the specified region.
334 uint32_t Cigar::getNumOverlaps(int32_t start, int32_t end,
335  int32_t queryStartPos)
336 {
337  // Get the overlap info.
338  if ((queryToRef.size() == 0) || (refToQuery.size() == 0))
339  {
340  setQueryAndReferenceIndexes();
341  }
342 
343  // Get the start and end offsets.
344  int32_t startRefOffset = 0;
345  // If the specified start is more than the queryStartPos, set
346  // the startRefOffset to the appropriate non-zero value.
347  // (if start is <= queryStartPos, than startRefOffset is 0 - it should
348  // not be set to a negative value.)
349  if (start > queryStartPos)
350  {
351  startRefOffset = start - queryStartPos;
352  }
353 
354  int32_t endRefOffset = end - queryStartPos;
355  if (end == -1)
356  {
357  // -1 means that the region goes to the end of the refrerence.
358  // So set endRefOffset to the max refOffset + 1 which is the
359  // size of the refToQuery vector.
360  endRefOffset = refToQuery.size();
361  }
362 
363 
364  // if endRefOffset is less than 0, then this read does not fall within
365  // the specified region, so return 0.
366  if (endRefOffset < 0)
367  {
368  return(0);
369  }
370 
371  // Get the overlaps for these offsets.
372  // Loop through the read counting positions that match the reference
373  // within this region.
374  int32_t refOffset = 0;
375  int32_t numOverlaps = 0;
376  for (unsigned int queryIndex = 0; queryIndex < queryToRef.size();
377  queryIndex++)
378  {
379  refOffset = getRefOffset(queryIndex);
380  if (refOffset > endRefOffset)
381  {
382  // Past the end of the specified region, so stop checking
383  // for overlaps since there will be no more.
384  break;
385  }
386  else if ((refOffset >= startRefOffset) && (refOffset < endRefOffset))
387  {
388  // within the region, increment the counter.
389  ++numOverlaps;
390  }
391  }
392 
393  return(numOverlaps);
394 }
395 
396 
397 // Return whether or not the cigar has an indel
399 {
400  for(unsigned int i = 0; i < cigarOperations.size(); i++)
401  {
402  if((cigarOperations[i].operation == insert) ||
403  (cigarOperations[i].operation == del))
404  {
405  // Found an indel, so return true.
406  return(true);
407  }
408  }
409  // Went through all the operations, and found no indel, so return false.
410  return(false);
411 }
412 
413 
414 // Clear the query index/reference offset index vectors.
415 void Cigar::clearQueryAndReferenceIndexes()
416 {
417  queryToRef.clear();
418  refToQuery.clear();
419  refToCigar.clear();
420  queryToCigar.clear();
421  myExpandedCigar.clear();
422 }
423 
424 
425 ///////////////////////////////////////////////////////
426 // Set the query index/reference offset index vectors.
427 //
428 // For Cigar: 3M2I2M1D1M
429 // That total count of cigar elements is 9 (3+2+2+1+1)
430 //
431 // The entries that are valid in the query/reference contain the index/offset
432 // where they are found in the query/reference. N/A are marked by 'x':
433 // query indexes: 0123456x7
434 // ---------
435 // reference offsets: 012xx3456
436 //
437 // This shows what query index is associated with which reference offset and
438 // vice versa.
439 // For ones where an x appears, -1 would be returned.
440 //
441 void Cigar::setQueryAndReferenceIndexes()
442 {
443  // First ensure that the vectors are clear by clearing them.
444  clearQueryAndReferenceIndexes();
445 
446  int extPos = 0;
447 
448  // Process each cigar index.
449  for (uint32_t cigarIndex = 0; cigarIndex < cigarOperations.size(); cigarIndex++)
450  {
451  // Process the cigar operation.
452  switch (cigarOperations[cigarIndex].operation)
453  {
454  case match:
455  case mismatch:
456  // For match/mismatch, update the maps between query
457  // and reference for the number of matches/mismatches.
458  for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
459  {
460  // The associated indexes are the next location in
461  // each array, which is equal to the current size.
462  int32_t queryToRefLen = queryToRef.size();
463  int32_t refToQueryLen = refToQuery.size();
464  queryToRef.push_back(refToQueryLen);
465  refToQuery.push_back(queryToRefLen);
466  refToCigar.push_back(extPos);
467  queryToCigar.push_back(extPos++);
468  myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
469  }
470  break;
471  case insert:
472  case softClip:
473  // Add N/A reference offset for each query index that this
474  // insert covers.
475  for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
476  {
477  queryToRef.push_back(INDEX_NA);
478  queryToCigar.push_back(extPos++);
479  myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
480  }
481  break;
482  case del:
483  case skip:
484  // Add N/A query index for each reference offset that this
485  // deletion/skip covers.
486  for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
487  {
488  refToQuery.push_back(INDEX_NA);
489  refToCigar.push_back(extPos++);
490  myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
491  }
492  break;
493  case hardClip:
494  case pad:
495  case none:
496  for (uint32_t i = 0; i < cigarOperations[cigarIndex].count; i++)
497  {
498  myExpandedCigar.push_back(cigarOperations[cigarIndex].getChar());
499  ++extPos;
500  }
501  break;
502  };
503  }
504 }
505 
int32_t getExpandedCigarIndexFromQueryIndex(int32_t queryIndex)
Returns the index into the expanded cigar for the cigar associated with the specified queryIndex...
Definition: Cigar.cpp:258
int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos)
Return the reference position associated with the specified query index or INDEX_NA based on this cig...
Definition: Cigar.cpp:217
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
Padding (not in reference or query). Associated with CIGAR Operation "P".
Definition: Cigar.h:96
int getNumEndClips() const
Return the number of clips that are at the end of the cigar.
Definition: Cigar.cpp:166
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition: Cigar.h:93
no operation has been set.
Definition: Cigar.h:88
int size() const
Return the number of cigar operations.
Definition: Cigar.h:364
int getNumBeginClips() const
Return the number of clips that are at the beginning of the cigar.
Definition: Cigar.cpp:144
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
uint32_t getNumOverlaps(int32_t start, int32_t end, int32_t queryStartPos)
Return the number of bases that overlap the reference and the read associated with this cigar that fa...
Definition: Cigar.cpp:334
This file is inspired by the poor quality of string support in STL for what should be trivial capabil...
char getCigarCharOp(int32_t expandedCigarIndex)
Return the character code of the cigar operator associated with the specified expanded CIGAR index...
Definition: Cigar.cpp:295
Hard clip on the read (clipped sequence not present in the query sequence or reference). Associated with CIGAR Operation "H".
Definition: Cigar.h:95
bool operator==(Cigar &rhs) const
Return true if the 2 Cigars are the same (the same operations of the same sizes). ...
Definition: Cigar.cpp:80
void getCigarString(String &cigarString) const
Set the passed in String to the string reprentation of the Cigar operations in this object...
Definition: Cigar.cpp:52
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that)...
Definition: Cigar.h:83
int32_t getRefOffset(int32_t queryIndex)
Return the reference offset associated with the specified query index or INDEX_NA based on this cigar...
Definition: Cigar.cpp:187
char getCigarCharOpFromRefPos(int32_t refPosition, int32_t queryStartPos)
Return the character code of the cigar operator associated with the specified reference position...
Definition: Cigar.cpp:326
mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:90
bool hasIndel()
Return whether or not the cigar has indels (insertions or delections)
Definition: Cigar.cpp:398
void getExpandedString(std::string &s) const
Sets the specified string to a valid CIGAR string of characters that represent the cigar with no digi...
Definition: Cigar.cpp:63
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
Definition: Cigar.cpp:120
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)...
Definition: Cigar.h:94
int32_t getExpandedCigarIndexFromRefPos(int32_t refPosition, int32_t queryStartPos)
Returns the index into the expanded cigar for the cigar associated with the specified reference posit...
Definition: Cigar.cpp:288
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
int32_t getQueryIndex(int32_t refOffset)
Return the query index associated with the specified reference offset or INDEX_NA based on this cigar...
Definition: Cigar.cpp:202
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition: Cigar.cpp:95
char getCigarCharOpFromQueryIndex(int32_t queryIndex)
Return the character code of the cigar operator associated with the specified queryIndex.
Definition: Cigar.cpp:314
char getCigarCharOpFromRefOffset(int32_t refOffset)
Return the character code of the cigar operator associated with the specified reference offset...
Definition: Cigar.cpp:320
int32_t getExpandedCigarIndexFromRefOffset(int32_t refOffset)
Returns the index into the expanded cigar for the cigar associated with the specified reference offse...
Definition: Cigar.cpp:273