eXpress “1.5”
|
00001 // 00002 // mapparser.cpp 00003 // express 00004 // 00005 // Created by Adam Roberts on 8/19/11. 00006 // Copyright 2011 Adam Roberts. All rights reserved. 00007 // 00008 00009 #include "mapparser.h" 00010 #include "main.h" 00011 #include "fragments.h" 00012 #include "targets.h" 00013 #include "threadsafety.h" 00014 #include "library.h" 00015 #include <boost/algorithm/string/predicate.hpp> 00016 00017 using namespace std; 00018 00019 const size_t BUFF_SIZE = 9999; 00020 00028 size_t cigar_length(const char* cigar_str, vector<Indel>& inserts, 00029 vector<Indel>& deletes) { 00030 inserts.clear(); 00031 deletes.clear(); 00032 const char* p_cig = cigar_str; 00033 size_t i = 0; // read index 00034 size_t j = 0; // genomic index 00035 while (*p_cig) { 00036 char* t; 00037 size_t op_len = (size_t)strtol(p_cig, &t, 10); 00038 char op_char = toupper(*t); 00039 switch(op_char) { 00040 case 'I': 00041 case 'S': 00042 inserts.push_back(Indel(i, op_len)); 00043 i += op_len; 00044 break; 00045 case 'D': 00046 deletes.push_back(Indel(i, op_len)); 00047 j += op_len; 00048 break; 00049 case 'M': 00050 case 'N': 00051 i += op_len; 00052 j += op_len; 00053 break; 00054 } 00055 p_cig = t + 1; 00056 } 00057 return j; 00058 } 00059 00067 size_t cigar_length(vector<BamTools::CigarOp>& cigar_vec, 00068 vector<Indel>& inserts, vector<Indel>& deletes) { 00069 inserts.clear(); 00070 deletes.clear(); 00071 size_t i = 0; // read index 00072 size_t j = 0; // genomic index 00073 for (size_t k = 0; k < cigar_vec.size(); ++k) { 00074 char op_char = cigar_vec[k].Type; 00075 size_t op_len = cigar_vec[k].Length; 00076 switch(op_char) { 00077 case 'I': 00078 case 'S': 00079 inserts.push_back(Indel(i, op_len)); 00080 i += op_len; 00081 break; 00082 case 'D': 00083 deletes.push_back(Indel(i, op_len)); 00084 j += op_len; 00085 break; 00086 case 'M': 00087 case 'N': 00088 i += op_len; 00089 j += op_len; 00090 break; 00091 } 00092 } 00093 return j; 00094 } 00095 00096 MapParser::MapParser(Library* lib, bool write_active) 00097 : _lib(lib), _write_active(write_active) { 00098 00099 string in_file = lib->in_file_name; 00100 string out_file = lib->out_file_name; 00101 bool is_sam = false; 00102 if (in_file.size() == 0) { 00103 logger.info("No alignment file specified. Expecting streaming input on " 00104 "stdin...\n"); 00105 _parser.reset(new SAMParser(&cin)); 00106 is_sam = true; 00107 } else { 00108 logger.info("Attempting to read '%s' in BAM format...", in_file.c_str()); 00109 BamTools::BamReader* reader = new BamTools::BamReader(); 00110 if (reader->Open(in_file)) { 00111 logger.info("Parsing BAM header..."); 00112 _parser.reset(new BAMParser(reader)); 00113 if (out_file.size()) { 00114 out_file += ".bam"; 00115 BamTools::BamWriter* writer = new BamTools::BamWriter(); 00116 if (writer->Open(out_file, reader->GetHeader(), 00117 reader->GetReferenceData())) { 00118 bool sample = out_file.substr(out_file.length()-8,4) == "samp"; 00119 _writer.reset(new BAMWriter(writer, sample)); 00120 } else { 00121 logger.severe("Unable to open output BAM file '%s'.", 00122 out_file.c_str()); 00123 } 00124 } 00125 } else { 00126 delete reader; 00127 logger.info("Input is not in BAM format. Trying SAM..."); 00128 ifstream* ifs = new ifstream(in_file.c_str()); 00129 if (!ifs->is_open()) { 00130 logger.severe("Unable to open input SAM file '%s'.", in_file.c_str()); 00131 } 00132 _parser.reset(new SAMParser(ifs)); 00133 is_sam = true; 00134 } 00135 } 00136 00137 if (is_sam && out_file.size()) { 00138 out_file += ".sam"; 00139 ofstream* ofs = new ofstream(out_file.c_str()); 00140 if (!ofs->is_open()) { 00141 logger.severe("Unable to open output SAM file '%s'.", out_file.c_str()); 00142 } 00143 *ofs << _parser->header(); 00144 bool sample = out_file.substr(out_file.length()-8,4) == "samp"; 00145 _writer.reset(new SAMWriter(ofs, sample)); 00146 } 00147 } 00148 00149 void MapParser::threaded_parse(ParseThreadSafety* thread_safety_p, 00150 size_t stop_at, 00151 size_t num_neighbors) { 00152 ParseThreadSafety& pts = *thread_safety_p; 00153 bool fragments_remain = true; 00154 size_t n = 0; 00155 size_t still_out = 0; 00156 00157 TargetTable& targ_table = *(_lib->targ_table); 00158 00159 while (!stop_at || n < stop_at) { 00160 Fragment* frag = NULL; 00161 while (fragments_remain) { 00162 frag = new Fragment(_lib); 00163 fragments_remain = _parser->next_fragment(*frag); 00164 if (frag->num_hits()) { 00165 break; 00166 } 00167 delete frag; 00168 frag = NULL; 00169 } 00170 for (size_t i = 0; frag && i < frag->hits().size(); ++i) { 00171 FragHit& m = *(frag->hits()[i]); 00172 00173 if (m.first_read() && m.first_read()->seq.length() > max_read_len) { 00174 logger.severe("Length of first read for fragment '%s' is longer than " 00175 "maximum allowed read length (%d vs. %d). Increase the " 00176 "limit using the '--max-read-len,L' option.", 00177 m.frag_name().c_str(), m.first_read()->seq.length(), 00178 max_read_len); 00179 } 00180 if (m.second_read() && m.second_read()->seq.length() > max_read_len) { 00181 logger.severe("Length of second read for fragment '%s' is longer than " 00182 "maximum allowed read length (%d vs. %d). Increase the " 00183 "limit using the '--max-read-len,L' option.", 00184 m.frag_name().c_str(), m.second_read()->seq.length(), 00185 max_read_len); 00186 } 00187 00188 Target* t = targ_table.get_targ(m.target_id()); 00189 if (!t) { 00190 logger.severe("Target sequence at index '%s' not found. Verify that it " 00191 "is in the SAM/BAM header and FASTA file.", 00192 m.target_id()); 00193 } 00194 m.target(t); 00195 assert(t->id() == m.target_id()); 00196 00197 // Add num_neighbors targets on either side to the neighbors list. 00198 // Used for experimental feature. 00199 vector<const Target*> neighbors; 00200 for (TargID j = 1; j <= num_neighbors; j++) { 00201 if (j <= m.target_id()) { 00202 neighbors.push_back(targ_table.get_targ(m.target_id() - j)); 00203 } 00204 if (j + m.target_id() < targ_table.size()) { 00205 neighbors.push_back(targ_table.get_targ(m.target_id() + j)); 00206 } 00207 } 00208 m.neighbors(neighbors); 00209 } 00210 boost::scoped_ptr<Fragment> done_frag(pts.proc_out.pop(false)); 00211 while (done_frag) { 00212 if (_writer && _write_active) { 00213 _writer->write_fragment(*done_frag); 00214 } 00215 still_out--; 00216 done_frag.reset(pts.proc_out.pop(false)); 00217 } 00218 00219 if (!frag) { 00220 break; 00221 } 00222 00223 pts.proc_in.push(frag); 00224 n++; 00225 still_out++; 00226 } 00227 00228 pts.proc_in.push(NULL); 00229 00230 while (still_out) { 00231 boost::scoped_ptr<Fragment> done_frag(pts.proc_out.pop(true)); 00232 if (_writer && _write_active) { 00233 _writer->write_fragment(*done_frag); 00234 } 00235 still_out--; 00236 } 00237 } 00238 00239 BAMParser::BAMParser(BamTools::BamReader* reader) : _reader(reader) { 00240 BamTools::BamAlignment a; 00241 00242 size_t index = 0; 00243 foreach(const BamTools::RefData& ref, _reader->GetReferenceData()) { 00244 if (_targ_index.count(ref.RefName)) { 00245 logger.severe("Target '%s' appears multiple times in BAM header.", 00246 ref.RefName.c_str()); 00247 } 00248 _targ_index[ref.RefName] = index++; 00249 _targ_lengths[ref.RefName] = ref.RefLength; 00250 } 00251 00252 // Get first valid ReadHit 00253 _read_buff = new ReadHit(); 00254 do { 00255 if (!_reader->GetNextAlignment(a)) { 00256 logger.severe("Input BAM file contains no valid alignments."); 00257 } 00258 } while(!map_end_from_alignment(a)); 00259 } 00260 00261 bool BAMParser::next_fragment(Fragment& nf) { 00262 nf.add_map_end(_read_buff); 00263 00264 BamTools::BamAlignment a; 00265 _read_buff = new ReadHit(); 00266 00267 while(true) { 00268 if (!_reader->GetNextAlignment(a)) { 00269 return false; 00270 } else if (!map_end_from_alignment(a)) { 00271 continue; 00272 } else if (!nf.add_map_end(_read_buff)) { 00273 return true; 00274 } 00275 _read_buff = new ReadHit(); 00276 } 00277 } 00278 00279 bool BAMParser::map_end_from_alignment(BamTools::BamAlignment& a) { 00280 ReadHit& r = *_read_buff; 00281 00282 if (!a.IsMapped()) { 00283 return false; 00284 } 00285 00286 bool is_paired = a.IsPaired(); 00287 00288 if (is_paired && !a.IsProperPair()) { 00289 return false; 00290 } 00291 00292 if (is_paired && (direction == F || direction == R)) { 00293 return false; 00294 } 00295 00296 bool is_reversed = a.IsReverseStrand(); 00297 bool is_mate_reversed = a.IsMateReverseStrand(); 00298 if (is_paired && (!a.IsMateMapped() || a.RefID != a.MateRefID || 00299 is_reversed == is_mate_reversed || 00300 (is_reversed && a.MatePosition > a.Position) || 00301 (is_mate_reversed && a.MatePosition < a.Position))) { 00302 return false; 00303 } 00304 00305 bool left_first = (!is_paired && !is_reversed) || 00306 (a.IsFirstMate() && !is_reversed)|| 00307 (a.IsSecondMate() && is_reversed); 00308 00309 if (((direction == RF || direction == R) && left_first) || 00310 ((direction == FR || direction == F) && !left_first)) { 00311 return false; 00312 } 00313 00314 r.name = a.Name; 00315 if (boost::algorithm::ends_with(r.name, "\1") || 00316 boost::algorithm::ends_with(r.name, "\2")) { 00317 r.name = r.name.substr(r.name.size()-2); 00318 } 00319 00320 r.reversed = is_reversed; 00321 r.first = !is_paired || a.IsFirstMate(); 00322 r.targ_id = a.RefID; 00323 r.left = a.Position; 00324 r.mate_l = a.MatePosition; 00325 r.seq.set(a.QueryBases, is_reversed); 00326 r.bam = a; 00327 r.right = r.left + cigar_length(a.CigarData, r.inserts, r.deletes); 00328 00329 foreach (Indel& indel, r.inserts) { 00330 if (indel.len > max_indel_size) { 00331 return false; 00332 } 00333 } 00334 foreach (Indel& indel, r.deletes) { 00335 if (indel.len > max_indel_size) { 00336 return false; 00337 } 00338 } 00339 00340 return true; 00341 } 00342 00343 void BAMParser::reset() { 00344 _reader->Rewind(); 00345 00346 // Get first valid FragHit 00347 BamTools::BamAlignment a; 00348 delete _read_buff; 00349 _read_buff = new ReadHit(); 00350 do { 00351 _reader->GetNextAlignment(a); 00352 } while(!map_end_from_alignment(a)); 00353 } 00354 00355 SAMParser::SAMParser(istream* in) { 00356 _in = in; 00357 00358 char line_buff[BUFF_SIZE]; 00359 _read_buff = new ReadHit(); 00360 _header = ""; 00361 00362 // Parse header 00363 size_t index = 0; 00364 while(_in->good()) { 00365 _in->getline(line_buff, BUFF_SIZE-1, '\n'); 00366 if (line_buff[0] != '@') { 00367 break; 00368 } 00369 _header += line_buff; 00370 _header += "\n"; 00371 00372 string str(line_buff); 00373 size_t idx = str.find("SN:"); 00374 if (idx!=string::npos) { 00375 string name = str.substr(idx+3); 00376 name = name.substr(0,name.find_first_of("\n\t ")); 00377 if (_targ_index.count(name)) { 00378 logger.severe("Target '%s' appears multiple times in SAM header.", 00379 str.c_str()); 00380 } 00381 _targ_index[name] = index++; 00382 idx = str.find("LN:"); 00383 if (idx != string::npos) { 00384 string len = str.substr(idx+3); 00385 len = len.substr(0,len.find_first_of("\n\t ")); 00386 _targ_lengths[name] = atoi(len.c_str()); 00387 } 00388 } 00389 } 00390 00391 // Load first aligned read 00392 while(!map_end_from_line(line_buff)) { 00393 if (!_in->good()) { 00394 logger.severe("Input SAM file contains no valid alignments."); 00395 } 00396 _in->getline(line_buff, BUFF_SIZE-1, '\n'); 00397 } 00398 } 00399 00400 bool SAMParser::next_fragment(Fragment& nf) { 00401 nf.add_map_end(_read_buff); 00402 00403 _read_buff = new ReadHit(); 00404 char line_buff[BUFF_SIZE]; 00405 00406 while(_in->good()) { 00407 _in->getline (line_buff, BUFF_SIZE-1, '\n'); 00408 if (!map_end_from_line(line_buff)) { 00409 continue; 00410 } 00411 if (!nf.add_map_end(_read_buff)) { 00412 break; 00413 } 00414 _read_buff = new ReadHit(); 00415 } 00416 00417 return _in->good(); 00418 } 00419 00420 bool SAMParser::map_end_from_line(char* line) { 00421 ReadHit& r = *_read_buff; 00422 string sam_line(line); 00423 char *p = strtok(line, "\t"); 00424 int sam_flag = 0; 00425 bool paired = 0; 00426 bool left_first = 0; 00427 bool other_reversed = 0; 00428 00429 int i = 0; 00430 while (p && i <= 9) { 00431 switch(i++) { 00432 case 0: { 00433 r.name = p; 00434 if (boost::algorithm::ends_with(r.name, "\1") || 00435 boost::algorithm::ends_with(r.name, "\2")) { 00436 r.name = r.name.substr(r.name.size()-2); 00437 } 00438 break; 00439 } 00440 case 1: { 00441 sam_flag = atoi(p); 00442 if (sam_flag & 0x4) { 00443 goto stop; 00444 } 00445 paired = sam_flag & 0x1; 00446 if (paired && (direction == F || direction == R)) { 00447 goto stop; 00448 } 00449 if (paired && (!(sam_flag & 0x2) || sam_flag & 0x8)) { 00450 goto stop; 00451 } 00452 r.reversed = sam_flag & 0x10; 00453 other_reversed = sam_flag & 0x20; 00454 if (paired && r.reversed == other_reversed) { 00455 goto stop; 00456 } 00457 r.first = !paired || sam_flag & 0x40; 00458 left_first = ((!paired && !r.reversed) || (r.first && !r.reversed) || 00459 (!r.first && r.reversed)); 00460 if (((direction == RF || direction == R) && left_first) || 00461 ((direction == FR || direction == F) && !left_first)) { 00462 goto stop; 00463 } 00464 break; 00465 } 00466 case 2: { 00467 if(p[0] == '*') { 00468 goto stop; 00469 } 00470 if (!_targ_index.count(p)) { 00471 logger.severe("Target sequence '%s' not found. Verify that it is in " 00472 "the SAM/BAM header and FASTA file.", p); 00473 } 00474 r.targ_id = _targ_index[p]; 00475 break; 00476 } 00477 case 3: { 00478 r.left = (size_t)(atoi(p)-1); 00479 break; 00480 } 00481 case 4: { 00482 break; 00483 } 00484 case 5: { 00485 r.right = r.left + cigar_length(p, r.inserts, r.deletes); 00486 foreach (Indel& indel, r.inserts) { 00487 if (indel.len > max_indel_size) { 00488 goto stop; 00489 } 00490 } 00491 foreach (Indel& indel, r.deletes) { 00492 if (indel.len > max_indel_size) { 00493 goto stop; 00494 } 00495 } 00496 break; 00497 } 00498 case 6: { 00499 // skip if only one end mapped of paired-end read 00500 if(paired && p[0] != '=') { 00501 goto stop; 00502 } 00503 break; 00504 } 00505 case 7: { 00506 r.mate_l = atoi(p)-1; 00507 if (paired && ((r.reversed && r.left < (size_t)r.mate_l) || 00508 (other_reversed && r.left > (size_t)r.mate_l))) { 00509 goto stop; 00510 } 00511 break; 00512 } 00513 case 8: { 00514 break; 00515 } 00516 case 9: { 00517 r.seq.set(p, r.reversed); 00518 r.sam = sam_line; 00519 goto stop; 00520 } 00521 } 00522 p = strtok(NULL, "\t"); 00523 } 00524 stop: 00525 return i == 10; 00526 } 00527 00528 void SAMParser::reset() { 00529 // Rewind input file 00530 _in->clear(); 00531 _in->seekg(0, ios::beg); 00532 00533 // Load first alignment 00534 char line_buff[BUFF_SIZE]; 00535 delete _read_buff; 00536 _read_buff = new ReadHit(); 00537 00538 while(_in->good()) { 00539 _in->getline(line_buff, BUFF_SIZE-1, '\n'); 00540 if (line_buff[0] != '@') { 00541 break; 00542 } 00543 } 00544 00545 while(!map_end_from_line(line_buff)) { 00546 _in->getline(line_buff, BUFF_SIZE-1, '\n'); 00547 } 00548 } 00549 00550 BAMWriter::BAMWriter(BamTools::BamWriter* writer, bool sample) 00551 : _writer(writer) { 00552 _sample = sample; 00553 } 00554 00555 BAMWriter::~BAMWriter() { 00556 _writer->Close(); 00557 } 00558 00559 void BAMWriter::write_fragment(Fragment& f) { 00560 if (_sample) { 00561 const FragHit* hit = f.sample_hit(); 00562 PairStatus ps = hit->pair_status(); 00563 if (ps != RIGHT_ONLY) { 00564 _writer->SaveAlignment(hit->left_read()->bam); 00565 } 00566 if (ps != LEFT_ONLY) { 00567 _writer->SaveAlignment(hit->right_read()->bam); 00568 } 00569 } else { 00570 double total = 0; 00571 foreach(FragHit* hit, f.hits()) { 00572 total += sexp(hit->params()->posterior); 00573 PairStatus ps = hit->pair_status(); 00574 if (ps != RIGHT_ONLY) { 00575 hit->left_read()->bam.AddTag("XP","f",(float)sexp(hit->params()->posterior)); 00576 _writer->SaveAlignment(hit->left_read()->bam); 00577 } 00578 if (ps != LEFT_ONLY) { 00579 hit->right_read()->bam.AddTag("XP","f",(float)sexp(hit->params()->posterior)); 00580 _writer->SaveAlignment(hit->right_read()->bam); 00581 } 00582 } 00583 assert(approx_eq(total, 1.0)); 00584 } 00585 } 00586 00587 SAMWriter::SAMWriter(ostream* out, bool sample) : _out(out) { 00588 _sample = sample; 00589 } 00590 00591 SAMWriter::~SAMWriter() { 00592 _out->flush(); 00593 } 00594 00595 void SAMWriter::write_fragment(Fragment& f) { 00596 if (_sample) { 00597 const FragHit* hit = f.sample_hit(); 00598 PairStatus ps = hit->pair_status(); 00599 00600 if (ps != RIGHT_ONLY) { 00601 *_out << hit->left_read()->sam << endl; 00602 } 00603 if (ps != LEFT_ONLY) { 00604 *_out << hit->right_read()->sam << endl; 00605 } 00606 } else { 00607 double total = 0; 00608 foreach(const FragHit* hit, f.hits()) { 00609 total += sexp(hit->params()->posterior); 00610 PairStatus ps = hit->pair_status(); 00611 if (ps != RIGHT_ONLY) { 00612 *_out << hit->left_read()->sam << " XP:f:" 00613 << (float)sexp(hit->params()->posterior) << endl; 00614 } 00615 if (ps != LEFT_ONLY) { 00616 *_out << hit->right_read()->sam << " XP:f:" 00617 << (float)sexp(hit->params()->posterior) << endl; 00618 } 00619 } 00620 assert(approx_eq(total, 1.0)); 00621 } 00622 }