libStatGen Software  1
SamTags.cpp
1 /*
2  * Copyright (C) 2011 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "SamTags.h"
19 #include "BaseUtilities.h"
20 
21 const char* SamTags::BQ_TAG = "BQ";
22 const char SamTags::BQ_TAG_TYPE = 'Z';
23 const char* SamTags::MD_TAG = "MD";
24 const char SamTags::MD_TAG_TYPE = 'Z';
25 const char* SamTags::ORIG_POS_TAG = "OP";
26 const char SamTags::ORIG_POS_TAG_TYPE = 'i';
27 const char* SamTags::ORIG_CIGAR_TAG = "OC";
28 const char SamTags::ORIG_CIGAR_TAG_TYPE = 'Z';
29 const char* SamTags::ORIG_QUAL_TAG = "OQ";
30 const char SamTags::ORIG_QUAL_TAG_TYPE = 'Z';
31 
32 
33 // Create the MD tag for the specified input record and the genome.
34 bool SamTags::createMDTag(String& outputMDtag, SamRecord& inputRec,
35  GenomeSequence& genome)
36 {
37  outputMDtag.Clear();
38  // Get the cigar to use for determing alignment.
39  Cigar* cigarInfo = inputRec.getCigarInfo();
40  if(cigarInfo == NULL)
41  {
42  throw(std::runtime_error("Cannot createMDTag - failed to get the cigar"));
43  return(false);
44  }
45  int32_t queryIndex = Cigar::INDEX_NA;
46 
47  // get where this read starts on the reference.
48  uint32_t startOfReadOnRefIndex =
49  genome.getGenomePosition(inputRec.getReferenceName());
50  if(startOfReadOnRefIndex == (uint32_t)INVALID_CHROMOSOME_INDEX)
51  {
52  // Failed to find the reference for this chromosome, so return false.
53  return(false);
54  }
55  startOfReadOnRefIndex += inputRec.get0BasedPosition();
56 
57  // Track the number of consecutive matches.
58  int32_t matchCount = 0;
59  // Track if it is currently in a deletion so it knows when not to add
60  // a '^'.
61  bool currentDeletion = false;
62 
63  // Loop through the Reference indices (ignores insertions/pads/clips).
64  for(int refOffset = 0;
65  refOffset < cigarInfo->getExpectedReferenceBaseCount();
66  ++refOffset)
67  {
68  // Get the query index for this reference position..
69  queryIndex = cigarInfo->getQueryIndex(refOffset);
70 
71  char refBase = genome[startOfReadOnRefIndex + refOffset];
72 
73  if(queryIndex != Cigar::INDEX_NA)
74  {
75  // Both the reference and the read have a base, so get the bases.
76  char readBase = inputRec.getSequence(queryIndex);
77  currentDeletion = false;
78 
79  // If neither base is unknown and they are the same, count it
80  // as a match.
81  if(!BaseUtilities::isAmbiguous(readBase) &&
82  !BaseUtilities::isAmbiguous(refBase) &&
83  (BaseUtilities::areEqual(readBase, refBase)))
84  {
85  // Match, so update counter.
86  ++matchCount;
87  }
88  else
89  {
90  // Mismatch, so output the number of matches if any.
91  if(matchCount != 0)
92  {
93  outputMDtag += matchCount;
94  matchCount = 0;
95  }
96  outputMDtag += refBase;
97  }
98  }
99  else
100  {
101  // This reference position is not in the query, so it is a deletion.
102  // Deletion, so output the number of matches if any.
103  if(matchCount != 0)
104  {
105  outputMDtag += matchCount;
106  matchCount = 0;
107  }
108 
109  if(!currentDeletion)
110  {
111  // Not currently in a deletion, so add the ^
112  outputMDtag += '^';
113  }
114  // Add the deleted base.
115  outputMDtag += refBase;
116  currentDeletion = true;
117  }
118  }
119 
120  // output the match count at the end.
121  outputMDtag += matchCount;
122  return(true);
123 }
124 
125 // Check to see if the MD tag in the record is accurate.
127 {
128  String calcMDtag;
129  if(!createMDTag(calcMDtag, inputRec, genome))
130  {
131  // Could not generate the MD tag, so just return that it is incorrect.
132  return(false);
133  }
134 
135  const String* origMDtag = inputRec.getStringTag(MD_TAG);
136 
137  if(origMDtag == NULL)
138  {
139  // There was no tag.
140  // if there is not a new tag, then they are the same and true
141  // should be returned. If there is a new tag, then the old one was
142  // wrong so false should be returned. So just return the result of
143  // IsEmpty.
144  return(calcMDtag.IsEmpty());
145  }
146  else
147  {
148  // origMDtag is not NULL, so just compare the two tags.
149  return(calcMDtag == *origMDtag);
150  }
151 }
152 
153 
154 // Update/Add the MD tag in the inputRec.
155 bool SamTags::updateMDTag(SamRecord& inputRec, GenomeSequence& genome)
156 {
157  // Calculate the new MD tag.
158  String calcMDtag;
159  createMDTag(calcMDtag, inputRec, genome);
160 
161  // Add the MD tag. If it is already there and is different it will
162  // replace it. If it is already there and it is the same, it won't
163  // do anything.
164  return(inputRec.addTag(MD_TAG, MD_TAG_TYPE, calcMDtag.c_str()));
165 }
static bool createMDTag(String &outputMDtag, SamRecord &inputRec, GenomeSequence &genome)
Create the MD tag for the specified input record and the genome.
Definition: SamTags.cpp:34
const String * getStringTag(const char *tag)
Get the string value for the specified tag.
Definition: SamRecord.cpp:2168
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1824
static bool areEqual(char base1, char base2)
Returns whether or not two bases are equal (case insensitive), if one of the bases is &#39;=&#39;...
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
const char * getSequence()
Returns the SAM formatted sequence string (SEQ), translating the base as specified by setSequenceTran...
Definition: SamRecord.cpp:1556
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that)...
Definition: Cigar.h:83
int getExpectedReferenceBaseCount() const
Return the number of bases in the reference that this CIGAR "spans".
Definition: Cigar.cpp:120
int32_t getQueryIndex(int32_t refOffset)
Return the query index associated with the specified reference offset or INDEX_NA based on this cigar...
Definition: Cigar.cpp:202
static bool isMDTagCorrect(SamRecord &inputRec, GenomeSequence &genome)
Check to see if the MD tag in the record is accurate.
Definition: SamTags.cpp:126
static bool isAmbiguous(char base)
Returns whether or not the specified bases is an indicator for ambiguity.
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
bool addTag(const char *tag, char vtype, const char *value)
Add the specified tag,vtype,value to the record.
Definition: SamRecord.cpp:779
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record...
Definition: SamRecord.h:51
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1307
genomeIndex_t getGenomePosition(const char *chromosomeName, unsigned int chromosomeIndex) const
given a chromosome name and position, return the genome position
const char * getReferenceName()
Get the reference sequence name (RNAME) of the record.
Definition: SamRecord.cpp:1286