libStatGen Software  1
SamQuerySeqWithRefHelper.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdexcept>
19 
20 #include "SamQuerySeqWithRefHelper.h"
21 #include "BaseUtilities.h"
22 #include "SamFlag.h"
23 
24 SamQuerySeqWithRefIter::SamQuerySeqWithRefIter(SamRecord& record,
25  GenomeSequence& refSequence,
26  bool forward)
27  : myRecord(record),
28  myRefSequence(refSequence),
29  myCigar(NULL),
30  myStartOfReadOnRefIndex(INVALID_GENOME_INDEX),
31  myQueryIndex(0),
32  myForward(forward)
33 {
34  myCigar = myRecord.getCigarInfo();
35  myStartOfReadOnRefIndex =
36  refSequence.getGenomePosition(myRecord.getReferenceName());
37 
38  if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
39  {
40  // This reference name was found in the reference file, so
41  // add the start position.
42  myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
43  }
44 
45  if(!forward)
46  {
47  myQueryIndex = myRecord.getReadLength() - 1;
48  }
49 }
50 
51 
52 SamQuerySeqWithRefIter::~SamQuerySeqWithRefIter()
53 {
54 }
55 
56 
57 
59 {
60  myCigar = myRecord.getCigarInfo();
61  if(myCigar == NULL)
62  {
63  // Failed to get Cigar.
64  return(false);
65  }
66 
67  // Get where the position of where this read starts as mapped to the
68  // reference.
69  myStartOfReadOnRefIndex =
70  myRefSequence.getGenomePosition(myRecord.getReferenceName());
71 
72  if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
73  {
74  // This reference name was found in the reference file, so
75  // add the start position.
76  myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
77  }
78 
79  myForward = forward;
80 
81  if(myForward)
82  {
83  myQueryIndex = 0;
84  }
85  else
86  {
87  // reverse, so start at the last entry.
88  myQueryIndex = myRecord.getReadLength() - 1;
89  }
90  return(true);
91 }
92 
93 
94 // Returns information for the next position where the query and the
95 // reference match or mismatch. To be a match or mismatch, both the query
96 // and reference must have a base that is not 'N'.
97 // This means:
98 // insertions and deletions are not mismatches or matches.
99 // 'N' bases are not matches or mismatches
100 // Returns true if an entry was found, false if there are no more matches or
101 // mismatches.
103 {
104  // Check whether or not this read is mapped.
105  // If the read is not mapped, return no matches.
106  if(!SamFlag::isMapped(myRecord.getFlag()))
107  {
108  // Not mapped.
109  return(false);
110  }
111 
112  // Check that the Cigar is set.
113  if(myCigar == NULL)
114  {
115  // Error.
116  throw(std::runtime_error("Cannot determine matches/mismatches since failed to retrieve the cigar"));
117  return(false);
118  }
119 
120  // If myStartOfReadOnRefIndex is the default (unset) value, then
121  // the reference was not found, so return false, no matches/mismatches.
122  if(myStartOfReadOnRefIndex == INVALID_GENOME_INDEX)
123  {
124  // This reference name was not found in the reference file, so just
125  // return no matches/mismatches.
126  return(false);
127  }
128 
129 
130  // Repull the read length from the record to check just in case the
131  // record has changed length.
132  // Loop until a match or mismatch is found as long as query index
133  // is still within the read (Loop is broken by a return).
134  while((myQueryIndex < myRecord.getReadLength()) &&
135  (myQueryIndex >= 0))
136  {
137  // Still more bases, look for a match/mismatch.
138 
139  // Get the reference offset for this read position.
140  int32_t refOffset = myCigar->getRefOffset(myQueryIndex);
141  if(refOffset == Cigar::INDEX_NA)
142  {
143  // This is either a softclip or an insertion
144  // which do not count as a match or a mismatch, so
145  // go to the next index.
146  nextIndex();
147  continue;
148  }
149 
150  // Both the reference and the read have a base, so get the bases.
151  char readBase = myRecord.getSequence(myQueryIndex, SamRecord::NONE);
152  char refBase = myRefSequence[myStartOfReadOnRefIndex + refOffset];
153 
154  // If either the read or the reference bases are unknown, then
155  // it does not count as a match or a mismatch.
156  if(BaseUtilities::isAmbiguous(readBase) ||
158  {
159  // Either the reference base or the read base are unknown,
160  // so skip this position.
161  nextIndex();
162  continue;
163  }
164 
165  // Both the read & the reference have a known base, so it is either
166  // a match or a mismatch.
167  matchMismatchInfo.setQueryIndex(myQueryIndex);
168 
169  // Check if they are equal.
170  if(BaseUtilities::areEqual(readBase, refBase))
171  {
172  // Match.
173  matchMismatchInfo.setType(SamSingleBaseMatchInfo::MATCH);
174  // Increment the query index to the next position.
175  nextIndex();
176  return(true);
177  }
178  else
179  {
180  // Mismatch
181  matchMismatchInfo.setType(SamSingleBaseMatchInfo::MISMATCH);
182  // Increment the query index to the next position.
183  nextIndex();
184  return(true);
185  }
186  }
187 
188  // No matches or mismatches were found, so return false.
189  return(false);
190 }
191 
192 
193 void SamQuerySeqWithRefIter::nextIndex()
194 {
195  if(myForward)
196  {
197  ++myQueryIndex;
198  }
199  else
200  {
201  --myQueryIndex;
202  }
203 }
204 
205 
206 SamSingleBaseMatchInfo::SamSingleBaseMatchInfo()
207  : myType(UNKNOWN),
208  myQueryIndex(0)
209 {
210 }
211 
212 
213 SamSingleBaseMatchInfo::~SamSingleBaseMatchInfo()
214 {
215 }
216 
217 
219 {
220  return(myType);
221 }
222 
223 
225 {
226  return(myQueryIndex);
227 }
228 
229 
231 {
232  myType = newType;
233 }
234 
235 
237 {
238  myQueryIndex = queryIndex;
239 }
240 
241 
242 ///////////////////////////////////////////////////////////////////////////
243 void SamQuerySeqWithRef::seqWithEquals(const char* currentSeq,
244  int32_t seq0BasedPos,
245  Cigar& cigar,
246  const char* referenceName,
247  const GenomeSequence& refSequence,
248  std::string& updatedSeq)
249 {
250  updatedSeq = currentSeq;
251 
252  int32_t seqLength = updatedSeq.length();
253  int32_t queryIndex = 0;
254 
255  uint32_t startOfReadOnRefIndex =
256  refSequence.getGenomePosition(referenceName);
257 
258  if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
259  {
260  // This reference name was not found in the reference file, so just
261  // return.
262  return;
263  }
264  startOfReadOnRefIndex += seq0BasedPos;
265 
266  // Loop until the entire sequence has been updated.
267  while(queryIndex < seqLength)
268  {
269  // Still more bases, look for matches.
270 
271  // Get the reference offset for this read position.
272  int32_t refOffset = cigar.getRefOffset(queryIndex);
273  if(refOffset != Cigar::INDEX_NA)
274  {
275  // Both the reference and the read have a base, so get the bases.
276  char readBase = currentSeq[queryIndex];
277  char refBase = refSequence[startOfReadOnRefIndex + refOffset];
278 
279  // If neither base is unknown and they are the same, count it
280  // as a match.
281  if(!BaseUtilities::isAmbiguous(readBase) &&
282  !BaseUtilities::isAmbiguous(refBase) &&
283  (BaseUtilities::areEqual(readBase, refBase)))
284  {
285  // Match.
286  updatedSeq[queryIndex] = '=';
287  }
288  }
289  // Increment the query index to the next position.
290  ++queryIndex;
291  continue;
292  }
293 }
294 
295 
296 void SamQuerySeqWithRef::seqWithoutEquals(const char* currentSeq,
297  int32_t seq0BasedPos,
298  Cigar& cigar,
299  const char* referenceName,
300  const GenomeSequence& refSequence,
301  std::string& updatedSeq)
302 {
303  updatedSeq = currentSeq;
304 
305  int32_t seqLength = updatedSeq.length();
306  int32_t queryIndex = 0;
307 
308  uint32_t startOfReadOnRefIndex =
309  refSequence.getGenomePosition(referenceName);
310 
311  if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
312  {
313  // This reference name was not found in the reference file, so just
314  // return.
315  return;
316  }
317  startOfReadOnRefIndex += seq0BasedPos;
318 
319  // Loop until the entire sequence has been updated.
320  while(queryIndex < seqLength)
321  {
322  // Still more bases, look for matches.
323 
324  // Get the reference offset for this read position.
325  int32_t refOffset = cigar.getRefOffset(queryIndex);
326  if(refOffset != Cigar::INDEX_NA)
327  {
328  // Both the reference and the read have a base, so get the bases.
329  char readBase = currentSeq[queryIndex];
330  char refBase = refSequence[startOfReadOnRefIndex + refOffset];
331 
332  // If the bases are equal, set the sequence to the reference
333  // base. (Skips the check for ambiguous to catch a case where
334  // ambiguous had been converted to a '=', and if both are ambiguous,
335  // it will still be set to ambiguous.)
336  if(BaseUtilities::areEqual(readBase, refBase))
337  {
338  // Match.
339  updatedSeq[queryIndex] = refBase;
340  }
341  }
342 
343  // Increment the query index to the next position.
344  ++queryIndex;
345  continue;
346  }
347 }
static void seqWithEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence with &#39;=&#39; in any position where the sequence matches the reference.
bool reset(bool forward=true)
Reset to start at the beginning of the record.
void setType(Type newType)
Set the type (match/mismatch/unkown) for this object.
void setQueryIndex(int32_t queryIndex)
Set the query index for this object.
static bool areEqual(char base1, char base2)
Returns whether or not two bases are equal (case insensitive), if one of the bases is &#39;=&#39;...
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
Type getType()
Get the type (match/mismatch/unknown) for this object.
bool getNextMatchMismatch(SamSingleBaseMatchInfo &matchMismatchInfo)
Returns information for the next position where the query and the reference match or mismatch...
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that)...
Definition: Cigar.h:83
int32_t getRefOffset(int32_t queryIndex)
Return the reference offset associated with the specified query index or INDEX_NA based on this cigar...
Definition: Cigar.cpp:187
Leave the sequence as is.
Definition: SamRecord.h:58
int32_t getQueryIndex()
Get the query index for this object.
Type
More types can be added later as needed.
static bool isAmbiguous(char base)
Returns whether or not the specified bases is an indicator for ambiguity.
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record...
Definition: SamRecord.h:51
static void seqWithoutEquals(const char *currentSeq, int32_t seq0BasedPos, Cigar &cigar, const char *referenceName, const GenomeSequence &refSequence, std::string &updatedSeq)
Gets the sequence converting &#39;=&#39; to the appropriate base using the reference.
This class contains the match/mismatch information between the reference and a read for a single base...
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position