eXpress “1.5”

src/mapparser.cpp

00001 //
00002 //  mapparser.cpp
00003 //  express
00004 //
00005 //  Created by Adam Roberts on 8/19/11.
00006 //  Copyright 2011 Adam Roberts. All rights reserved.
00007 //
00008 
00009 #include "mapparser.h"
00010 #include "main.h"
00011 #include "fragments.h"
00012 #include "targets.h"
00013 #include "threadsafety.h"
00014 #include "library.h"
00015 #include <boost/algorithm/string/predicate.hpp>
00016 
00017 using namespace std;
00018 
00019 const size_t BUFF_SIZE = 9999;
00020 
00028 size_t cigar_length(const char* cigar_str, vector<Indel>& inserts,
00029                     vector<Indel>& deletes) {
00030   inserts.clear();
00031   deletes.clear();
00032   const char* p_cig = cigar_str;
00033   size_t i = 0; // read index
00034   size_t j = 0; // genomic index
00035   while (*p_cig) {
00036     char* t;
00037     size_t op_len = (size_t)strtol(p_cig, &t, 10);
00038     char op_char = toupper(*t);
00039     switch(op_char) {
00040       case 'I':
00041       case 'S':
00042         inserts.push_back(Indel(i, op_len));
00043         i += op_len;
00044         break;
00045       case 'D':
00046         deletes.push_back(Indel(i, op_len));
00047         j += op_len;
00048         break;
00049       case 'M':
00050       case 'N':
00051         i += op_len;
00052         j += op_len;
00053         break;
00054     }
00055     p_cig = t + 1;
00056   }
00057   return j;
00058 }
00059 
00067 size_t cigar_length(vector<BamTools::CigarOp>& cigar_vec,
00068                     vector<Indel>& inserts, vector<Indel>& deletes) {
00069   inserts.clear();
00070   deletes.clear();
00071   size_t i = 0; // read index
00072   size_t j = 0; // genomic index
00073   for (size_t k = 0; k < cigar_vec.size(); ++k) {
00074     char op_char = cigar_vec[k].Type;
00075     size_t op_len = cigar_vec[k].Length;
00076     switch(op_char) {
00077       case 'I':
00078       case 'S':
00079         inserts.push_back(Indel(i, op_len));
00080         i += op_len;
00081         break;
00082       case 'D':
00083         deletes.push_back(Indel(i, op_len));
00084         j += op_len;
00085         break;
00086       case 'M':
00087       case 'N':
00088         i += op_len;
00089         j += op_len;
00090         break;
00091     }
00092   }
00093   return j;
00094 }
00095 
00096 MapParser::MapParser(Library* lib, bool write_active)
00097     : _lib(lib), _write_active(write_active) {
00098 
00099   string in_file = lib->in_file_name;
00100   string out_file = lib->out_file_name;
00101   bool is_sam = false;
00102   if (in_file.size() == 0) {
00103     logger.info("No alignment file specified. Expecting streaming input on "
00104                 "stdin...\n");
00105     _parser.reset(new SAMParser(&cin));
00106     is_sam = true;
00107   } else {
00108     logger.info("Attempting to read '%s' in BAM format...", in_file.c_str());
00109     BamTools::BamReader* reader = new BamTools::BamReader();
00110     if (reader->Open(in_file)) {
00111       logger.info("Parsing BAM header...");
00112       _parser.reset(new BAMParser(reader));
00113       if (out_file.size()) {
00114         out_file += ".bam";
00115         BamTools::BamWriter* writer = new BamTools::BamWriter();
00116         if (writer->Open(out_file, reader->GetHeader(),
00117                          reader->GetReferenceData())) {
00118           bool sample = out_file.substr(out_file.length()-8,4) == "samp";
00119           _writer.reset(new BAMWriter(writer, sample));
00120         } else {
00121           logger.severe("Unable to open output BAM file '%s'.",
00122                         out_file.c_str());
00123         }
00124       }
00125     } else {
00126       delete reader;
00127       logger.info("Input is not in BAM format. Trying SAM...");
00128       ifstream* ifs = new ifstream(in_file.c_str());
00129       if (!ifs->is_open()) {
00130         logger.severe("Unable to open input SAM file '%s'.", in_file.c_str());
00131       }
00132       _parser.reset(new SAMParser(ifs));
00133       is_sam = true;
00134     }
00135   }
00136 
00137   if (is_sam && out_file.size()) {
00138     out_file += ".sam";
00139     ofstream* ofs = new ofstream(out_file.c_str());
00140     if (!ofs->is_open()) {
00141       logger.severe("Unable to open output SAM file '%s'.", out_file.c_str());
00142     }
00143     *ofs << _parser->header();
00144     bool sample = out_file.substr(out_file.length()-8,4) == "samp";
00145     _writer.reset(new SAMWriter(ofs, sample));
00146   }
00147 }
00148 
00149 void MapParser::threaded_parse(ParseThreadSafety* thread_safety_p,
00150                                size_t stop_at,
00151                                size_t num_neighbors) {
00152   ParseThreadSafety& pts = *thread_safety_p;
00153   bool fragments_remain = true;
00154   size_t n = 0;
00155   size_t still_out = 0;
00156 
00157   TargetTable& targ_table = *(_lib->targ_table);
00158 
00159   while (!stop_at || n < stop_at) {
00160     Fragment* frag = NULL;
00161     while (fragments_remain) {
00162       frag = new Fragment(_lib);
00163       fragments_remain = _parser->next_fragment(*frag);
00164       if (frag->num_hits()) {
00165         break;
00166       }
00167       delete frag;
00168       frag = NULL;
00169     }
00170     for (size_t i = 0; frag && i < frag->hits().size(); ++i) {
00171       FragHit& m = *(frag->hits()[i]);
00172       
00173       if (m.first_read() && m.first_read()->seq.length() > max_read_len) {
00174         logger.severe("Length of first read for fragment '%s' is longer than "
00175                       "maximum allowed read length (%d vs. %d). Increase the "
00176                       "limit using the '--max-read-len,L' option.",
00177                       m.frag_name().c_str(),  m.first_read()->seq.length(),
00178                       max_read_len);
00179       }
00180       if (m.second_read() && m.second_read()->seq.length() > max_read_len) {
00181         logger.severe("Length of second read for fragment '%s' is longer than "
00182                       "maximum allowed read length (%d vs. %d). Increase the "
00183                       "limit using the '--max-read-len,L' option.",
00184                       m.frag_name().c_str(),  m.second_read()->seq.length(),
00185                       max_read_len);
00186       }
00187       
00188       Target* t = targ_table.get_targ(m.target_id());
00189       if (!t) {
00190         logger.severe("Target sequence at index '%s' not found. Verify that it "
00191                       "is in the SAM/BAM header and FASTA file.",
00192                       m.target_id());
00193       }
00194       m.target(t);
00195       assert(t->id() == m.target_id());
00196       
00197       // Add num_neighbors targets on either side to the neighbors list.
00198       // Used for experimental feature.
00199       vector<const Target*> neighbors;
00200       for (TargID j = 1; j <= num_neighbors;  j++) {
00201         if (j <= m.target_id()) {
00202           neighbors.push_back(targ_table.get_targ(m.target_id() - j));
00203         }
00204         if (j + m.target_id() < targ_table.size()) {
00205           neighbors.push_back(targ_table.get_targ(m.target_id() + j));
00206         }
00207       }
00208       m.neighbors(neighbors);
00209     }
00210     boost::scoped_ptr<Fragment> done_frag(pts.proc_out.pop(false));
00211     while (done_frag) {
00212       if (_writer && _write_active) {
00213         _writer->write_fragment(*done_frag);
00214       }
00215       still_out--;
00216       done_frag.reset(pts.proc_out.pop(false));
00217     }
00218 
00219     if (!frag) {
00220       break;
00221     }
00222 
00223     pts.proc_in.push(frag);
00224     n++;
00225     still_out++;
00226   }
00227 
00228   pts.proc_in.push(NULL);
00229 
00230   while (still_out) {
00231     boost::scoped_ptr<Fragment> done_frag(pts.proc_out.pop(true));
00232     if (_writer && _write_active) {
00233       _writer->write_fragment(*done_frag);
00234     }
00235     still_out--;
00236   }
00237 }
00238 
00239 BAMParser::BAMParser(BamTools::BamReader* reader) : _reader(reader) {
00240   BamTools::BamAlignment a;
00241 
00242   size_t index = 0;
00243   foreach(const BamTools::RefData& ref, _reader->GetReferenceData()) {
00244     if (_targ_index.count(ref.RefName)) {
00245       logger.severe("Target '%s' appears multiple times in BAM header.",
00246                     ref.RefName.c_str());
00247     }
00248     _targ_index[ref.RefName] = index++;
00249     _targ_lengths[ref.RefName] = ref.RefLength;
00250   }
00251 
00252   // Get first valid ReadHit
00253   _read_buff = new ReadHit();
00254   do {
00255     if (!_reader->GetNextAlignment(a)) {
00256       logger.severe("Input BAM file contains no valid alignments.");
00257     }
00258   } while(!map_end_from_alignment(a));
00259 }
00260 
00261 bool BAMParser::next_fragment(Fragment& nf) {
00262   nf.add_map_end(_read_buff);
00263 
00264   BamTools::BamAlignment a;
00265   _read_buff = new ReadHit();
00266 
00267   while(true) {
00268     if (!_reader->GetNextAlignment(a)) {
00269       return false;
00270     } else if (!map_end_from_alignment(a)) {
00271       continue;
00272     } else if (!nf.add_map_end(_read_buff)) {
00273       return true;
00274     }
00275     _read_buff = new ReadHit();
00276   }
00277 }
00278 
00279 bool BAMParser::map_end_from_alignment(BamTools::BamAlignment& a) {
00280   ReadHit& r = *_read_buff;
00281 
00282   if (!a.IsMapped()) {
00283     return false;
00284   }
00285 
00286   bool is_paired = a.IsPaired();
00287 
00288   if (is_paired && !a.IsProperPair()) {
00289     return false;
00290   }
00291 
00292   if (is_paired && (direction == F || direction == R)) {
00293     return false;
00294   }
00295   
00296   bool is_reversed = a.IsReverseStrand();
00297   bool is_mate_reversed = a.IsMateReverseStrand();
00298   if (is_paired && (!a.IsMateMapped() || a.RefID != a.MateRefID ||
00299                     is_reversed == is_mate_reversed ||
00300                     (is_reversed && a.MatePosition > a.Position) ||
00301                     (is_mate_reversed && a.MatePosition < a.Position))) {
00302     return false;
00303   }
00304   
00305   bool left_first = (!is_paired && !is_reversed) ||
00306                     (a.IsFirstMate() && !is_reversed)||
00307                     (a.IsSecondMate() && is_reversed);
00308 
00309   if (((direction == RF || direction == R) && left_first) ||
00310       ((direction == FR || direction == F) && !left_first)) {
00311     return false;
00312   }
00313     
00314   r.name = a.Name;
00315   if (boost::algorithm::ends_with(r.name, "\1") ||
00316       boost::algorithm::ends_with(r.name, "\2")) {
00317     r.name = r.name.substr(r.name.size()-2);
00318   }
00319 
00320   r.reversed = is_reversed;
00321   r.first = !is_paired || a.IsFirstMate();
00322   r.targ_id = a.RefID;
00323   r.left = a.Position;
00324   r.mate_l = a.MatePosition;
00325   r.seq.set(a.QueryBases, is_reversed);
00326   r.bam = a;
00327   r.right = r.left + cigar_length(a.CigarData, r.inserts, r.deletes);
00328   
00329   foreach (Indel& indel, r.inserts) {
00330     if (indel.len > max_indel_size) {
00331       return false;
00332     }
00333   }
00334   foreach (Indel& indel, r.deletes) {
00335     if (indel.len > max_indel_size) {
00336       return false;
00337     }
00338   }
00339   
00340   return true;
00341 }
00342 
00343 void BAMParser::reset() {
00344   _reader->Rewind();
00345 
00346   // Get first valid FragHit
00347   BamTools::BamAlignment a;
00348   delete _read_buff;
00349   _read_buff = new ReadHit();
00350   do {
00351     _reader->GetNextAlignment(a);
00352   } while(!map_end_from_alignment(a));
00353 }
00354 
00355 SAMParser::SAMParser(istream* in) {
00356   _in = in;
00357 
00358   char line_buff[BUFF_SIZE];
00359   _read_buff = new ReadHit();
00360   _header = "";
00361 
00362   // Parse header
00363   size_t index = 0;
00364   while(_in->good()) {
00365     _in->getline(line_buff, BUFF_SIZE-1, '\n');
00366     if (line_buff[0] != '@') {
00367       break;
00368     }
00369     _header += line_buff;
00370     _header += "\n";
00371 
00372     string str(line_buff);
00373     size_t idx = str.find("SN:");
00374     if (idx!=string::npos) {
00375       string name = str.substr(idx+3);
00376       name = name.substr(0,name.find_first_of("\n\t "));
00377       if (_targ_index.count(name)) {
00378         logger.severe("Target '%s' appears multiple times in SAM header.",
00379                       str.c_str());
00380       }
00381       _targ_index[name] = index++;
00382       idx = str.find("LN:");
00383       if (idx != string::npos) {
00384         string len = str.substr(idx+3);
00385         len = len.substr(0,len.find_first_of("\n\t "));
00386         _targ_lengths[name] = atoi(len.c_str());
00387       }
00388     }
00389   }
00390 
00391   // Load first aligned read
00392   while(!map_end_from_line(line_buff)) {
00393     if (!_in->good()) {
00394       logger.severe("Input SAM file contains no valid alignments.");
00395     }
00396     _in->getline(line_buff, BUFF_SIZE-1, '\n');
00397   }
00398 }
00399 
00400 bool SAMParser::next_fragment(Fragment& nf) {
00401   nf.add_map_end(_read_buff);
00402 
00403   _read_buff = new ReadHit();
00404   char line_buff[BUFF_SIZE];
00405 
00406   while(_in->good()) {
00407     _in->getline (line_buff, BUFF_SIZE-1, '\n');
00408     if (!map_end_from_line(line_buff)) {
00409       continue;
00410     }
00411     if (!nf.add_map_end(_read_buff)) {
00412       break;
00413     }
00414     _read_buff = new ReadHit();
00415   }
00416 
00417   return _in->good();
00418 }
00419 
00420 bool SAMParser::map_end_from_line(char* line) {
00421   ReadHit& r = *_read_buff;
00422   string sam_line(line);
00423   char *p = strtok(line, "\t");
00424   int sam_flag = 0;
00425   bool paired = 0;
00426   bool left_first = 0;
00427   bool other_reversed = 0;
00428 
00429   int i = 0;
00430   while (p && i <= 9) {
00431     switch(i++) {
00432       case 0: {
00433         r.name = p;
00434         if (boost::algorithm::ends_with(r.name, "\1") ||
00435             boost::algorithm::ends_with(r.name, "\2")) {
00436           r.name = r.name.substr(r.name.size()-2);
00437         }
00438         break;
00439       }
00440       case 1: {
00441         sam_flag = atoi(p);
00442         if (sam_flag & 0x4) {
00443           goto stop;
00444         }
00445         paired = sam_flag & 0x1;
00446         if (paired && (direction == F || direction == R)) {
00447           goto stop;
00448         }
00449         if (paired && (!(sam_flag & 0x2) || sam_flag & 0x8)) {
00450           goto stop;
00451         }
00452         r.reversed = sam_flag & 0x10;
00453         other_reversed = sam_flag & 0x20;
00454         if (paired && r.reversed == other_reversed) {
00455           goto stop;
00456         }
00457         r.first = !paired || sam_flag & 0x40;
00458         left_first = ((!paired && !r.reversed) || (r.first && !r.reversed) ||
00459                         (!r.first && r.reversed));
00460         if (((direction == RF || direction == R) && left_first) ||
00461             ((direction == FR || direction == F) && !left_first)) {
00462           goto stop;
00463         }
00464         break;
00465       }
00466       case 2: {
00467         if(p[0] == '*') {
00468           goto stop;
00469         }
00470         if (!_targ_index.count(p)) {
00471           logger.severe("Target sequence '%s' not found. Verify that it is in "
00472                         "the SAM/BAM header and FASTA file.", p);
00473         }
00474         r.targ_id = _targ_index[p];
00475         break;
00476       }
00477       case 3: {
00478         r.left = (size_t)(atoi(p)-1);
00479         break;
00480       }
00481       case 4: {
00482         break;
00483       }
00484       case 5: {
00485         r.right = r.left + cigar_length(p, r.inserts, r.deletes);
00486         foreach (Indel& indel, r.inserts) {
00487           if (indel.len > max_indel_size) {
00488             goto stop;
00489           }
00490         }
00491         foreach (Indel& indel, r.deletes) {
00492           if (indel.len > max_indel_size) {
00493             goto stop;
00494           }
00495         }
00496         break;
00497       }
00498       case 6: {
00499         // skip if only one end mapped of paired-end read
00500         if(paired && p[0] != '=') {
00501           goto stop;
00502         }
00503         break;
00504       }
00505       case 7: {
00506         r.mate_l = atoi(p)-1;
00507         if (paired && ((r.reversed && r.left < (size_t)r.mate_l) ||
00508                        (other_reversed && r.left > (size_t)r.mate_l))) {
00509           goto stop;
00510         }
00511         break;
00512       }
00513       case 8: {
00514         break;
00515       }
00516       case 9: {
00517         r.seq.set(p, r.reversed);
00518         r.sam = sam_line;
00519         goto stop;
00520       }
00521     }
00522     p = strtok(NULL, "\t");
00523   }
00524  stop:
00525   return i == 10;
00526 }
00527 
00528 void SAMParser::reset() {
00529   // Rewind input file
00530   _in->clear();
00531   _in->seekg(0, ios::beg);
00532 
00533   // Load first alignment
00534   char line_buff[BUFF_SIZE];
00535   delete _read_buff;
00536   _read_buff = new ReadHit();
00537 
00538   while(_in->good()) {
00539     _in->getline(line_buff, BUFF_SIZE-1, '\n');
00540     if (line_buff[0] != '@') {
00541       break;
00542     }
00543   }
00544 
00545   while(!map_end_from_line(line_buff)) {
00546     _in->getline(line_buff, BUFF_SIZE-1, '\n');
00547   }
00548 }
00549 
00550 BAMWriter::BAMWriter(BamTools::BamWriter* writer, bool sample)
00551    : _writer(writer) {
00552   _sample = sample;
00553 }
00554 
00555 BAMWriter::~BAMWriter() {
00556   _writer->Close();
00557 }
00558 
00559 void BAMWriter::write_fragment(Fragment& f) {
00560   if (_sample) {
00561     const FragHit* hit = f.sample_hit();
00562     PairStatus ps = hit->pair_status();
00563     if (ps != RIGHT_ONLY) {
00564       _writer->SaveAlignment(hit->left_read()->bam);
00565     }
00566     if (ps != LEFT_ONLY) {
00567       _writer->SaveAlignment(hit->right_read()->bam);
00568     }
00569   } else {
00570     double total = 0;
00571     foreach(FragHit* hit, f.hits()) {
00572       total += sexp(hit->params()->posterior);
00573       PairStatus ps = hit->pair_status();
00574       if (ps != RIGHT_ONLY) {
00575         hit->left_read()->bam.AddTag("XP","f",(float)sexp(hit->params()->posterior));
00576         _writer->SaveAlignment(hit->left_read()->bam);
00577       }
00578       if (ps != LEFT_ONLY) {
00579         hit->right_read()->bam.AddTag("XP","f",(float)sexp(hit->params()->posterior));
00580         _writer->SaveAlignment(hit->right_read()->bam);
00581       }
00582     }
00583     assert(approx_eq(total, 1.0));
00584   }
00585 }
00586 
00587 SAMWriter::SAMWriter(ostream* out, bool sample) : _out(out) {
00588   _sample = sample;
00589 }
00590 
00591 SAMWriter::~SAMWriter() {
00592   _out->flush();
00593 }
00594 
00595 void SAMWriter::write_fragment(Fragment& f) {
00596   if (_sample) {
00597     const FragHit* hit = f.sample_hit();
00598     PairStatus ps = hit->pair_status();
00599 
00600     if (ps != RIGHT_ONLY) {
00601       *_out << hit->left_read()->sam << endl;
00602     }
00603     if (ps != LEFT_ONLY) {
00604       *_out << hit->right_read()->sam << endl;
00605     }
00606   } else {
00607     double total = 0;
00608     foreach(const FragHit* hit, f.hits()) {
00609       total += sexp(hit->params()->posterior);
00610       PairStatus ps = hit->pair_status();
00611       if (ps != RIGHT_ONLY) {
00612         *_out << hit->left_read()->sam << " XP:f:"
00613               << (float)sexp(hit->params()->posterior) << endl;
00614       }
00615       if (ps != LEFT_ONLY) {
00616         *_out << hit->right_read()->sam << " XP:f:"
00617               << (float)sexp(hit->params()->posterior) << endl;
00618       }
00619     }
00620     assert(approx_eq(total, 1.0));
00621   }
00622 }
 All Classes Functions Variables