libStatGen Software  1
SamFilter.cpp
1 /*
2  * Copyright (C) 2010 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 //////////////////////////////////////////////////////////////////////////
19 
20 
21 #include "SamFilter.h"
22 
23 #include "SamQuerySeqWithRefHelper.h"
24 #include "BaseUtilities.h"
25 #include "SamFlag.h"
26 
28  GenomeSequence& refSequence,
29  double mismatchThreshold)
30 {
31  // Read & clip from the left & right.
32  SamQuerySeqWithRefIter iterFromFront(record, refSequence, true);
33  SamQuerySeqWithRefIter iterFromBack(record, refSequence, false);
34 
35  SamSingleBaseMatchInfo baseMatchInfo;
36 
37  int32_t readLength = record.getReadLength();
38  // Init last front clip to be prior to the lastFront index (0).
39  const int32_t initialLastFrontClipPos = -1;
40  int32_t lastFrontClipPos = initialLastFrontClipPos;
41  // Init first back clip to be past the last index (readLength).
42  int32_t firstBackClipPos = readLength;
43 
44  bool fromFrontComplete = false;
45  bool fromBackComplete = false;
46  int32_t numBasesFromFront = 0;
47  int32_t numBasesFromBack = 0;
48  int32_t numMismatchFromFront = 0;
49  int32_t numMismatchFromBack = 0;
50 
51  //////////////////////////////////////////////////////////
52  // Determining the clip positions.
53  while(!fromFrontComplete || !fromBackComplete)
54  {
55  // Read from the front (left to right) of the read until
56  // more have been read from that direction than the opposite direction.
57  while(!fromFrontComplete &&
58  ((numBasesFromFront <= numBasesFromBack) ||
59  (fromBackComplete)))
60  {
61  if(iterFromFront.getNextMatchMismatch(baseMatchInfo) == false)
62  {
63  // Nothing more to read in this direction.
64  fromFrontComplete = true;
65  break;
66  }
67  // Got a read. Check to see if it is to or past the last clip.
68  if(baseMatchInfo.getQueryIndex() >= firstBackClipPos)
69  {
70  // This base is past where we are clipping, so we
71  // are done reading in this direction.
72  fromFrontComplete = true;
73  break;
74  }
75  // This is an actual base read from the left to the
76  // right, so up the counter and determine if it was a mismatch.
77  ++numBasesFromFront;
78 
79  if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
80  {
81  // Mismatch
82  ++numMismatchFromFront;
83  // Check to see if it is over the threshold.
84  double mismatchPercent =
85  (double)numMismatchFromFront / numBasesFromFront;
86  if(mismatchPercent > mismatchThreshold)
87  {
88  // Need to clip.
89  lastFrontClipPos = baseMatchInfo.getQueryIndex();
90  // Reset the counters.
91  numBasesFromFront = 0;
92  numMismatchFromFront = 0;
93  }
94  }
95  }
96 
97  // Now, read from right to left until more have been read
98  // from the back than from the front.
99  while(!fromBackComplete &&
100  ((numBasesFromBack <= numBasesFromFront) ||
101  (fromFrontComplete)))
102  {
103  if(iterFromBack.getNextMatchMismatch(baseMatchInfo) == false)
104  {
105  // Nothing more to read in this direction.
106  fromBackComplete = true;
107  break;
108  }
109  // Got a read. Check to see if it is to or past the first clip.
110  if(baseMatchInfo.getQueryIndex() <= lastFrontClipPos)
111  {
112  // This base is past where we are clipping, so we
113  // are done reading in this direction.
114  fromBackComplete = true;
115  break;
116  }
117  // This is an actual base read from the right to the
118  // left, so up the counter and determine if it was a mismatch.
119  ++numBasesFromBack;
120 
121  if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
122  {
123  // Mismatch
124  ++numMismatchFromBack;
125  // Check to see if it is over the threshold.
126  double mismatchPercent =
127  (double)numMismatchFromBack / numBasesFromBack;
128  if(mismatchPercent > mismatchThreshold)
129  {
130  // Need to clip.
131  firstBackClipPos = baseMatchInfo.getQueryIndex();
132  // Reset the counters.
133  numBasesFromBack = 0;
134  numMismatchFromBack = 0;
135  }
136  }
137  }
138  }
139 
140  //////////////////////////////////////////////////////////
141  // Done determining the clip positions, so clip.
142  // To determine the number of clips from the front, add 1 to the
143  // lastFrontClipPos since the index starts at 0.
144  // To determine the number of clips from the back, subtract the
145  // firstBackClipPos from the readLength.
146  // Example:
147  // Pos: 012345
148  // Read: AAAAAA
149  // Read Length = 6. If lastFrontClipPos = 2 and firstBackClipPos = 4, numFrontClips = 3 & numBack = 2.
150  return(softClip(record, lastFrontClipPos + 1, readLength - firstBackClipPos));
151 }
152 
153 
154 // Soft clip the record from the front and/or the back.
156  int32_t numFrontClips,
157  int32_t numBackClips)
158 {
159  //////////////////////////////////////////////////////////
160  Cigar* cigar = record.getCigarInfo();
161  FilterStatus status = NONE;
162  int32_t startPos = record.get0BasedPosition();
163  CigarRoller updatedCigar;
164 
165  status = softClip(*cigar, numFrontClips, numBackClips,
166  startPos, updatedCigar);
167 
168  if(status == FILTERED)
169  {
170  /////////////////////////////
171  // The entire read is clipped, so rather than clipping it,
172  // filter it out.
173  filterRead(record);
174  return(FILTERED);
175  }
176  else if(status == CLIPPED)
177  {
178  // Part of the read was clipped, and now that we have
179  // an updated cigar, update the read.
180  record.setCigar(updatedCigar);
181 
182  // Update the starting position.
183  record.set0BasedPosition(startPos);
184  }
185  return(status);
186 }
187 
188 
189 // Soft clip the cigar from the front and/or the back, writing the value
190 // into the new cigar.
192  int32_t numFrontClips,
193  int32_t numBackClips,
194  int32_t& startPos,
195  CigarRoller& updatedCigar)
196 {
197  int32_t readLength = oldCigar.getExpectedQueryBaseCount();
198  int32_t endClipPos = readLength - numBackClips;
199  FilterStatus status = NONE;
200 
201  if((numFrontClips != 0) || (numBackClips != 0))
202  {
203  // Clipping from front and/or from the back.
204 
205  // Check to see if the entire read was clipped.
206  int32_t totalClips = numFrontClips + numBackClips;
207  if(totalClips >= readLength)
208  {
209  /////////////////////////////
210  // The entire read is clipped, so rather than clipping it,
211  // filter it out.
212  return(FILTERED);
213  }
214 
215  // Part of the read was clipped.
216  status = CLIPPED;
217 
218  // Loop through, creating an updated cigar.
219  int origCigarOpIndex = 0;
220 
221  // Track how many read positions are covered up to this
222  // point by the cigar to determine up to up to what
223  // point in the cigar is affected by this clipping.
224  int32_t numPositions = 0;
225 
226  // Track if any non-clips are in the new cigar.
227  bool onlyClips = true;
228 
229  const Cigar::CigarOperator* op = NULL;
230 
231  //////////////////
232  // Clip from front
233  while((origCigarOpIndex < oldCigar.size()) &&
234  (numPositions < numFrontClips))
235  {
236  op = &(oldCigar.getOperator(origCigarOpIndex));
237  switch(op->operation)
238  {
239  case Cigar::hardClip:
240  // Keep this operation as the new clips do not
241  // affect other clips.
242  updatedCigar += *op;
243  break;
244  case Cigar::del:
245  case Cigar::skip:
246  // Skip and delete are going to be dropped, and
247  // are not in the read, so the read index doesn't
248  // need to be updated
249  break;
250  case Cigar::insert:
251  case Cigar::match:
252  case Cigar::mismatch:
253  case Cigar::softClip:
254  // Update the read index as these types
255  // are found in the read.
256  numPositions += op->count;
257  break;
258  case Cigar::none:
259  default:
260  // Nothing to do for none.
261  break;
262  };
263  ++origCigarOpIndex;
264  }
265 
266  // If bases were clipped from the front, add the clip and
267  // any partial cigar operation as necessary.
268  if(numFrontClips != 0)
269  {
270  // Add the softclip to the front of the read.
271  updatedCigar.Add(Cigar::softClip, numFrontClips);
272 
273  // Add the rest of the last Cigar operation if
274  // it is not entirely clipped.
275  int32_t newCount = numPositions - numFrontClips;
276  if(newCount > 0)
277  {
278  // Before adding it, check to see if the same
279  // operation is clipped from the end.
280  // numPositions greater than the endClipPos
281  // means that it is equal or past that position,
282  // so shorten the number of positions.
283  if(numPositions > endClipPos)
284  {
285  newCount -= (numPositions - endClipPos);
286  }
287  if(newCount > 0)
288  {
289  updatedCigar.Add(op->operation, newCount);
290  if(!Cigar::isClip(op->operation))
291  {
292  onlyClips = false;
293  }
294  }
295  }
296  }
297 
298  // Add operations until the point of the end clip is reached.
299  // For example...
300  // 2M1D3M = MMDMMM readLength = 5
301  // readIndex: 01 234
302  // at cigarOpIndex 0 (2M), numPositions = 2.
303  // at cigarOpIndex 1 (1D), numPositions = 2.
304  // at cigarOpIndex 2 (3M), numPositions = 5.
305  // if endClipPos = 2, we still want to consume the 1D, so
306  // need to keep looping until numPositions > endClipPos
307  while((origCigarOpIndex < oldCigar.size()) &&
308  (numPositions <= endClipPos))
309  {
310  op = &(oldCigar.getOperator(origCigarOpIndex));
311 
312  // Update the numPositions count if the operations indicates
313  // bases within the read.
314  if(!Cigar::foundInQuery(op->operation))
315  {
316  // This operation is not in the query read sequence,
317  // so it is not yet to the endClipPos, just add the
318  // operation do not increment the number of positions.
319  updatedCigar += *op;
320  if(!Cigar::isClip(op->operation))
321  {
322  onlyClips = false;
323  }
324  }
325  else
326  {
327  // This operation appears in the query sequence, so
328  // check to see if the clip occurs in this operation.
329 
330  // endClipPos is 0 based & numPositions is a count.
331  // If endClipPos is 4, then it is the 5th position.
332  // If 4 positions are covered so far (numPositions = 4),
333  // then we are right at endCLipPos: 4-4 = 0, none of
334  // this operation should be kept.
335  // If only 3 positions were covered, then we are at offset
336  // 3, so offset 3 should be added: 4-3 = 1.
337  uint32_t numPosTilClip = endClipPos - numPositions;
338 
339  if(numPosTilClip < op->count)
340  {
341  // this operation is partially clipped, write the part
342  // that was not clipped if it is not all clipped.
343  if(numPosTilClip != 0)
344  {
345  updatedCigar.Add(op->operation,
346  numPosTilClip);
347  if(!Cigar::isClip(op->operation))
348  {
349  onlyClips = false;
350  }
351  }
352  }
353  else
354  {
355  // This operation is not clipped, so add it
356  updatedCigar += *op;
357  if(!Cigar::isClip(op->operation))
358  {
359  onlyClips = false;
360  }
361  }
362  // This operation occurs in the query sequence, so
363  // increment the number of positions covered.
364  numPositions += op->count;
365  }
366 
367  // Move to the next cigar position.
368  ++origCigarOpIndex;
369  }
370 
371  //////////////////
372  // Add the softclip to the back.
373  if(numBackClips != 0)
374  {
375  // Add the softclip to the end
376  updatedCigar.Add(Cigar::softClip, numBackClips);
377  }
378 
379  //////////////////
380  // Add any hardclips remaining in the original cigar to the back.
381  while(origCigarOpIndex < oldCigar.size())
382  {
383  op = &(oldCigar.getOperator(origCigarOpIndex));
384  if(op->operation == Cigar::hardClip)
385  {
386  // Keep this operation as the new clips do not
387  // affect other clips.
388  updatedCigar += *op;
389  }
390  ++origCigarOpIndex;
391  }
392 
393  // Check to see if the new cigar is only clips.
394  if(onlyClips)
395  {
396  // Only clips in the new cigar, so mark the read as filtered
397  // instead of updating the cigar.
398  /////////////////////////////
399  // The entire read was clipped.
400  status = FILTERED;
401  }
402  else
403  {
404  // Part of the read was clipped.
405  // Update the starting position if a clip was added to
406  // the front.
407  if(numFrontClips > 0)
408  {
409  // Convert from query index to reference position (from the
410  // old cigar)
411  // Get the position for the last front clipped position by
412  // getting the position associated with the clipped base on
413  // the reference. Then add one to get to the first
414  // non-clipped position.
415  int32_t lastFrontClipPos = numFrontClips - 1;
416  int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos,
417  startPos);
418  if(newStartPos != Cigar::INDEX_NA)
419  {
420  // Add one to get first non-clipped position.
421  startPos = newStartPos + 1;
422  }
423  }
424  }
425  }
426  return(status);
427 }
428 
429 
431  GenomeSequence& refSequence,
432  uint32_t qualityThreshold,
433  uint8_t defaultQualityInt)
434 {
435  uint32_t totalMismatchQuality =
436  sumMismatchQuality(record, refSequence, defaultQualityInt);
437 
438  // If the total mismatch quality is over the threshold,
439  // filter the read.
440  if(totalMismatchQuality > qualityThreshold)
441  {
442  filterRead(record);
443  return(FILTERED);
444  }
445  return(NONE);
446 }
447 
448 
449 // NOTE: Only positions where the reference and read both have bases that
450 // are different and not 'N' are considered mismatches.
452  GenomeSequence& refSequence,
453  uint8_t defaultQualityInt)
454 {
455  // Track the mismatch info.
456  int mismatchQual = 0;
457  int numMismatch = 0;
458 
459  SamQuerySeqWithRefIter sequenceIter(record, refSequence);
460 
461  SamSingleBaseMatchInfo baseMatchInfo;
462  while(sequenceIter.getNextMatchMismatch(baseMatchInfo))
463  {
464  if(baseMatchInfo.getType() == SamSingleBaseMatchInfo::MISMATCH)
465  {
466  // Got a mismatch, get the associated quality.
467  char readQualityChar =
468  record.getQuality(baseMatchInfo.getQueryIndex());
469  uint8_t readQualityInt =
470  BaseUtilities::getPhredBaseQuality(readQualityChar);
471 
472  if(readQualityInt == BaseUtilities::UNKNOWN_QUALITY_INT)
473  {
474  // Quality was not specified, so use the configured setting.
475  readQualityInt = defaultQualityInt;
476  }
477  mismatchQual += readQualityInt;
478  ++numMismatch;
479  }
480  }
481 
482  return(mismatchQual);
483 }
484 
485 
487 {
488  // Filter the read by marking it as unmapped.
489  uint16_t flag = record.getFlag();
490  SamFlag::setUnmapped(flag);
491  // Clear N/A flags.
492  flag &= ~SamFlag::PROPER_PAIR;
493  flag &= ~SamFlag::SECONDARY_ALIGNMENT;
494  flag &= ~SamFlag::SUPPLEMENTARY_ALIGNMENT;
495  record.setFlag(flag);
496  // Clear Cigar
497  record.setCigar("*");
498  // Clear mapping quality
499  record.setMapQuality(0);
500 }
static const uint8_t UNKNOWN_QUALITY_INT
Int value used when the quality is unknown.
Definition: BaseUtilities.h:51
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object....
Definition: CigarRoller.h:67
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that).
Definition: Cigar.h:84
int size() const
Return the number of cigar operations.
Definition: Cigar.h:364
int getExpectedQueryBaseCount() const
Return the length of the read that corresponds to the current CIGAR string.
Definition: Cigar.cpp:95
static const int32_t INDEX_NA
Value associated with an index that is not applicable/does not exist, used for converting between que...
Definition: Cigar.h:492
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:219
int32_t getRefPosition(int32_t queryIndex, int32_t queryStartPos)
Return the reference position associated with the specified query index or INDEX_NA based on this cig...
Definition: Cigar.cpp:217
@ del
deletion from the reference (the reference contains bases that have no corresponding base in the quer...
Definition: Cigar.h:92
@ mismatch
mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:90
@ hardClip
Hard clip on the read (clipped sequence not present in the query sequence or reference)....
Definition: Cigar.h:95
@ match
match/mismatch operation. Associated with CIGAR Operation "M"
Definition: Cigar.h:89
@ insert
insertion to the reference (the query sequence contains bases that have no corresponding base in the ...
Definition: Cigar.h:91
@ skip
skipped region from the reference (the reference contains bases that have no corresponding base in th...
Definition: Cigar.h:93
@ softClip
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)....
Definition: Cigar.h:94
@ none
no operation has been set.
Definition: Cigar.h:88
const CigarOperator & getOperator(int i) const
Return the Cigar Operation at the specified index (starting at 0).
Definition: Cigar.h:354
static bool isClip(Operation op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:261
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
FilterStatus
Enum describing what sort of filtering was done.
Definition: SamFilter.h:29
@ NONE
The filter did not affect the read.
Definition: SamFilter.h:30
@ FILTERED
Filtering caused the read to be modified to unmapped.
Definition: SamFilter.h:32
@ CLIPPED
Filtering clipped the read.
Definition: SamFilter.h:31
static FilterStatus softClip(SamRecord &record, int32_t numFrontClips, int32_t numBackClips)
Soft clip the record from the front and/or the back.
Definition: SamFilter.cpp:155
static uint32_t sumMismatchQuality(SamRecord &record, GenomeSequence &refSequence, uint8_t defaultQualityInt)
Get the sum of the qualities of all mismatches in the record.
Definition: SamFilter.cpp:451
static FilterStatus clipOnMismatchThreshold(SamRecord &record, GenomeSequence &refSequence, double mismatchThreshold)
Clip the read based on the specified mismatch threshold.
Definition: SamFilter.cpp:27
static void filterRead(SamRecord &record)
Filter the read by marking it as unmapped.
Definition: SamFilter.cpp:486
static FilterStatus filterOnMismatchQuality(SamRecord &record, GenomeSequence &refSequence, uint32_t qualityThreshold, uint8_t defaultQualityInt)
Filter the read based on the specified quality threshold.
Definition: SamFilter.cpp:430
Class for extracting information from a SAM Flag.
Definition: SamFlag.h:29
static void setUnmapped(uint16_t &flag)
Mark the passed in flag as unmapped.
Definition: SamFlag.h:104
Iterates through the query and compare with reference.
bool getNextMatchMismatch(SamSingleBaseMatchInfo &matchMismatchInfo)
Returns information for the next position where the query and the reference match or mismatch.
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record.
Definition: SamRecord.h:52
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1836
bool setMapQuality(uint8_t mapQuality)
Set the mapping quality (MAPQ).
Definition: SamRecord.cpp:251
bool setFlag(uint16_t flag)
Set the bitwise FLAG to the specified value.
Definition: SamRecord.cpp:215
uint16_t getFlag()
Get the flag (FLAG).
Definition: SamRecord.cpp:1384
bool setCigar(const char *cigar)
Set the CIGAR to the specified SAM formatted cigar string.
Definition: SamRecord.cpp:259
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1391
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1319
bool set0BasedPosition(int32_t position)
Set the leftmost position using the specified 0-based (BAM format) value.
Definition: SamRecord.cpp:242
const char * getQuality()
Returns the SAM formatted quality string (QUAL).
Definition: SamRecord.cpp:1638
This class contains the match/mismatch information between the reference and a read for a single base...
int32_t getQueryIndex()
Get the query index for this object.
Type getType()
Get the type (match/mismatch/unknown) for this object.