eXpress “1.5”
|
00001 00009 #include "main.h" 00010 #include "targets.h" 00011 #include "lengthdistribution.h" 00012 #include "fragments.h" 00013 #include "biascorrection.h" 00014 #include "mismatchmodel.h" 00015 #include "mapparser.h" 00016 #include "library.h" 00017 #include <iostream> 00018 #include <fstream> 00019 #include <cassert> 00020 #include <stdio.h> 00021 #include <limits> 00022 #include <float.h> 00023 00024 using namespace std; 00025 00026 Target::Target(TargID id, const std::string& name, const std::string& seq, 00027 bool prob_seq, double alpha, const Librarian* libs, 00028 const BiasBoss* known_bias_boss, const LengthDistribution* known_fld) 00029 : _libs(libs), 00030 _id(id), 00031 _name(name), 00032 _seq_f(seq, 0, prob_seq), 00033 _seq_r(_seq_f), 00034 _alpha(log(alpha)), 00035 _ret_params(&_curr_params), 00036 _uniq_counts(0), 00037 _tot_counts(0), 00038 _avg_bias(0), 00039 _avg_bias_buffer(0), 00040 _solvable(false) { 00041 if ((_libs->curr_lib()).bias_table) { 00042 _start_bias.reset(new std::vector<float>(seq.length(),0)); 00043 _start_bias_buffer.reset(new std::vector<float>(seq.length(),0)); 00044 _end_bias.reset(new std::vector<float>(seq.length(),0)); 00045 _end_bias_buffer.reset(new std::vector<float>(seq.length(),0)); 00046 } 00047 update_target_bias_buffer(known_bias_boss, known_fld); 00048 swap_bias_parameters(); 00049 _init_pseudo_mass = _cached_eff_len + _alpha; 00050 } 00051 00052 void Target::add_hit(const FragHit& hit, double v, double m) { 00053 double p = hit.params()->posterior; 00054 _curr_params.mass = log_add(_curr_params.mass, p+m); 00055 double mass_with_pseudo = log_add(_ret_params->mass, _init_pseudo_mass); 00056 if (p != LOG_1 || v != LOG_0) { 00057 if (p != LOG_0) { 00058 _curr_params.ambig_mass = log_add(_curr_params.ambig_mass, p+m); 00059 _curr_params.tot_ambig_mass = log_add(_curr_params.tot_ambig_mass, m); 00060 } 00061 double p_hat = _curr_params.ambig_mass; 00062 if (_curr_params.tot_ambig_mass != LOG_0) { 00063 p_hat -= _curr_params.tot_ambig_mass; 00064 } else { 00065 assert(p_hat == LOG_0); 00066 } 00067 assert(p_hat == LOG_0 || p_hat <= LOG_1); 00068 _curr_params.var_sum = min(log_add(_curr_params.var_sum, v + m), 00069 _curr_params.tot_ambig_mass + p_hat 00070 + log_sub(LOG_1, p_hat)); 00071 double var_update = log_add(p + 2*m, v + 2*m); 00072 _curr_params.mass_var = min(log_add(_curr_params.mass_var, var_update), 00073 mass_with_pseudo + log_sub(_bundle->mass(), 00074 mass_with_pseudo)); 00075 } 00076 if (_curr_params.haplotype) { 00077 _curr_params.haplotype->update_mass(this, hit.frag_name(), 00078 hit.params()->align_likelihood, p); 00079 } 00080 (_libs->curr_lib()).targ_table->update_total_fpb(m - _cached_eff_len); 00081 } 00082 00083 void Target::round_reset() { 00084 _last_params = _curr_params; 00085 _curr_params = RoundParams(); 00086 _ret_params = &_last_params; 00087 _init_pseudo_mass = LOG_0; 00088 } 00089 00090 double Target::rho() const { 00091 double eff_len = cached_effective_length(false); 00092 if (eff_len == LOG_0) { 00093 return LOG_0; 00094 } 00095 00096 return mass(true) - eff_len - (_libs->curr_lib()).targ_table->total_fpb(); 00097 } 00098 00099 double Target::mass(bool with_pseudo) const { 00100 if (!with_pseudo) { 00101 return _ret_params->mass; 00102 } 00103 return log_add(_ret_params->mass, _alpha+_cached_eff_len+_avg_bias); 00104 } 00105 00106 double Target::mass_var() const { 00107 return _ret_params->mass_var; 00108 } 00109 00110 double Target::sample_likelihood(bool with_pseudo, 00111 const vector<const Target*>* neighbors) const { 00112 const Library& lib = _libs->curr_lib(); 00113 00114 double ll = LOG_1; 00115 double tot_mass = mass(with_pseudo); 00116 double tot_eff_len = cached_effective_length(lib.bias_table); 00117 if (neighbors) { 00118 foreach (const Target* neighbor, *neighbors) { 00119 tot_mass = log_add(tot_mass, neighbor->mass(with_pseudo)); 00120 tot_eff_len = log_add(tot_eff_len, 00121 neighbor->cached_effective_length(lib.bias_table)); 00122 } 00123 } 00124 ll += tot_mass - tot_eff_len; 00125 assert(!isnan(ll)); 00126 return ll; 00127 } 00128 00129 double Target::align_likelihood(const FragHit& frag) const { 00130 00131 const Library& lib = _libs->curr_lib(); 00132 00133 double ll = LOG_1; 00134 00135 const PairStatus ps = frag.pair_status(); 00136 00137 if (lib.mismatch_table) { 00138 ll += (lib.mismatch_table)->log_likelihood(frag); 00139 } 00140 00141 if (lib.bias_table) { 00142 if (ps != RIGHT_ONLY) { 00143 ll += _start_bias->at(frag.left()); 00144 } 00145 if (ps != LEFT_ONLY) { 00146 ll += _end_bias->at(frag.right() - 1); 00147 } 00148 } 00149 00150 if (ps == PAIRED) { 00151 ll += (lib.fld)->pmf(frag.length()); 00152 } else if (ps == LEFT_ONLY && length() - frag.left() < (lib.fld)->max_val()) { 00153 ll += (lib.fld)->cmf(length() - frag.left()); 00154 } else if (ps == RIGHT_ONLY && frag.right() < (lib.fld)->max_val()) { 00155 ll += (lib.fld)->cmf(frag.right()); 00156 } 00157 00158 assert(!(isnan(ll)||isinf(ll))); 00159 return ll; 00160 } 00161 00162 double Target::est_effective_length(const LengthDistribution* fld, 00163 bool with_bias) const { 00164 if (!fld) { 00165 fld = (_libs->curr_lib()).fld.get(); 00166 } 00167 00168 double eff_len = LOG_0; 00169 00170 double log_length = log((double)length()); 00171 if (log_length < fld->mean()) { 00172 eff_len = log_length; 00173 } else { 00174 for(size_t l = fld->min_val(); l <= min(length(), fld->max_val()); l++) { 00175 eff_len = log_add(eff_len, fld->pmf(l)+log((double)length()-l+1)); 00176 } 00177 } 00178 00179 if (with_bias) { 00180 eff_len += _avg_bias; 00181 } 00182 00183 return eff_len; 00184 } 00185 00186 double Target::cached_effective_length(bool with_bias) const { 00187 if (with_bias) { 00188 return _cached_eff_len + _avg_bias; 00189 } 00190 return _cached_eff_len; 00191 } 00192 00193 void Target::update_target_bias_buffer(const BiasBoss* bias_table, 00194 const LengthDistribution* fld) { 00195 if (bias_table) { 00196 _avg_bias_buffer = bias_table->get_target_bias(*_start_bias_buffer, 00197 *_end_bias_buffer, *this); 00198 } 00199 assert(!isnan(_avg_bias_buffer) && !isinf(_avg_bias_buffer)); 00200 _cached_eff_len_buffer = est_effective_length(fld, false); 00201 } 00202 00203 void Target::swap_bias_parameters() { 00204 _cached_eff_len = _cached_eff_len_buffer; 00205 _avg_bias = _avg_bias_buffer; 00206 _start_bias.swap(_start_bias_buffer); 00207 _end_bias.swap(_end_bias_buffer); 00208 } 00209 00210 void HaplotypeHandler::commit_buffer() { 00211 if (_committed) { 00212 return; 00213 } 00214 00215 bool all_eq = true; 00216 foreach (double val, _align_likelihoods_buff) { 00217 all_eq &= (val == _align_likelihoods_buff[0]); 00218 } 00219 if (!all_eq) { 00220 for (size_t i = 0; i < _targets.size(); ++i) { 00221 _haplo_taus.increment(0, i, _masses_buff[i]); 00222 } 00223 } 00224 00225 for (size_t i = 0; i < _targets.size(); ++i) { 00226 _align_likelihoods_buff[i] = LOG_0; 00227 _masses_buff[i] = LOG_0; 00228 } 00229 _committed = true; 00230 } 00231 00232 HaplotypeHandler::HaplotypeHandler(vector<Target*> targets, double alpha=1) 00233 : _haplo_taus(1, targets.size(), alpha), 00234 _frag_name_buff(""), 00235 _align_likelihoods_buff(vector<double>(targets.size(), LOG_0)), 00236 _masses_buff(vector<double>(targets.size(), LOG_0)), 00237 _committed(true) { 00238 00239 boost::shared_ptr<HaplotypeHandler> handler(this); 00240 foreach(Target* targ, targets) { 00241 _targets.push_back(targ); 00242 targ->haplotype(handler); 00243 } 00244 } 00245 00246 size_t HaplotypeHandler::find_target(const Target* targ) { 00247 for (size_t i = 0; i < _targets.size(); ++i) { 00248 if (_targets[i] == targ) { 00249 assert(_targets[i]->id() == targ->id()); 00250 return i; 00251 } 00252 } 00253 logger.severe("Haplotype target not found."); 00254 return SSIZE_MAX; 00255 } 00256 00257 double HaplotypeHandler::get_mass(const Target* targ, bool with_pseudo) { 00258 commit_buffer(); 00259 00260 TargID i = find_target(targ);; 00261 00262 double total_mass = LOG_0; 00263 foreach(const Target* targ, _targets) { 00264 total_mass = log_add(total_mass, targ->_ret_params->mass); 00265 if (with_pseudo){ 00266 total_mass = log_add(total_mass, targ->cached_effective_length()); 00267 } 00268 } 00269 00270 return _haplo_taus(0, i) + total_mass; 00271 } 00272 00273 void HaplotypeHandler::update_mass(const Target* targ, const string& frag_name, 00274 double align_likelihood, double mass) { 00275 if (frag_name != _frag_name_buff) { 00276 commit_buffer(); 00277 _frag_name_buff = frag_name; 00278 } else { 00279 assert(!_committed); 00280 } 00281 00282 TargID i = find_target(targ); 00283 00284 _align_likelihoods_buff[i] = log_add(_align_likelihoods_buff[i], 00285 align_likelihood); 00286 _masses_buff[i] = log_add(_masses_buff[i], mass); 00287 00288 _committed = false; 00289 } 00290 00291 00292 TargetTable::TargetTable(string targ_fasta_file, string haplotype_file, 00293 bool prob_seqs, bool known_aux_params, double alpha, 00294 const AlphaMap* alpha_map, const Librarian* libs) 00295 : _libs(libs) { 00296 string info_msg = "Loading target sequences"; 00297 const Library& lib = _libs->curr_lib(); 00298 const TransIndex& targ_index = lib.map_parser->targ_index(); 00299 const TransIndex& targ_lengths = lib.map_parser->targ_lengths(); 00300 if (lib.bias_table && !known_aux_params) { 00301 info_msg += " and measuring bias background"; 00302 } 00303 info_msg += "..."; 00304 logger.info(info_msg.c_str()); 00305 00306 size_t num_targs = targ_index.size(); 00307 _targ_map = vector<Target*>(num_targs, NULL); 00308 _total_fpb = log(alpha*num_targs); 00309 00310 boost::unordered_set<string> target_names; 00311 00312 ifstream infile (targ_fasta_file.c_str()); 00313 string line; 00314 string seq = ""; 00315 string name = ""; 00316 if (infile.is_open()) { 00317 while (infile.good()) { 00318 getline(infile, line, '\n'); 00319 if (line.empty()) { 00320 continue; 00321 } 00322 if (line[0] == '>') { 00323 if (!name.empty()) { 00324 if (alpha_map) { 00325 alpha = alpha_map->find(name)->second; 00326 } 00327 add_targ(name, seq, prob_seqs, known_aux_params, alpha, targ_index, 00328 targ_lengths); 00329 } 00330 name = line.substr(1,line.find(' ')-1); 00331 if (target_names.count(name)) { 00332 logger.severe("Target '%s' is duplicated in the input FASTA. Ensure " 00333 "target names are unique and re-map before re-running " 00334 "eXpress.", name.c_str()); 00335 } 00336 if (alpha_map && !alpha_map->count(name)) { 00337 logger.severe("Target '%s' is was not found in the prior parameter " 00338 "file.", name.c_str()); 00339 } 00340 target_names.insert(name); 00341 seq = ""; 00342 } else { 00343 seq += line; 00344 } 00345 } 00346 if (!name.empty()) { 00347 if (alpha_map) { 00348 alpha = alpha_map->find(name)->second; 00349 } 00350 add_targ(name, seq, prob_seqs, known_aux_params, alpha, targ_index, 00351 targ_lengths); 00352 } 00353 00354 infile.close(); 00355 if (lib.bias_table && !known_aux_params) { 00356 lib.bias_table->normalize_expectations(); 00357 } 00358 } else { 00359 logger.severe("Unable to open MultiFASTA file '%s'.", 00360 targ_fasta_file.c_str()); 00361 } 00362 00363 if (size() == 0) { 00364 logger.severe("No targets found in MultiFASTA file '%s'.", 00365 targ_fasta_file.c_str()); 00366 } 00367 00368 for(TransIndex::const_iterator it = targ_index.begin(); 00369 it != targ_index.end(); ++it) { 00370 if (!_targ_map[it->second]) { 00371 logger.severe("Sequence for target '%s' not found in MultiFASTA file " 00372 "'%s'.", it->first.c_str(), targ_fasta_file.c_str()); 00373 } 00374 } 00375 logger.info("Initialized %d targets.", size()); 00376 00377 // Load haplotype information, if provided 00378 if (haplotype_file.size()) { 00379 logger.info("Loading haplotype information..."); 00380 infile.open(haplotype_file.c_str()); 00381 size_t num_haplotype_groups = 0; 00382 if (infile.is_open()) { 00383 const size_t BUFF_SIZE = 99999; 00384 char line_buff[BUFF_SIZE]; 00385 while (infile.good()) { 00386 infile.getline (line_buff, BUFF_SIZE, '\n'); 00387 00388 if (strlen(line_buff) == 0) { 00389 continue; 00390 } 00391 00392 vector<Target*> haplotype_targets; 00393 char *p = strtok(line_buff, ","); 00394 do { 00395 if (targ_index.count(p) == 0) { 00396 logger.severe("Haplotype target '%s' does not exist in MultiFASTA " 00397 "or alignment files.", p); 00398 } 00399 haplotype_targets.push_back(_targ_map[targ_index.at(p)]); 00400 p = strtok(NULL, ","); 00401 } while (p); 00402 00403 if (haplotype_targets.size() < 2) { 00404 logger.severe("Haplotype groups must contain at least 2 targets."); 00405 } 00406 00407 _haplotype_groups.insert(haplotype_targets); 00408 00409 if (!alpha_map) { 00410 foreach(Target* targ, haplotype_targets) { 00411 targ->alpha(alpha/haplotype_targets.size()); 00412 } 00413 } 00414 // Ownership is given to the targets. 00415 new HaplotypeHandler(haplotype_targets); 00416 num_haplotype_groups++; 00417 } 00418 } 00419 logger.info("Initialized " SIZE_T_FMT " haplotype groups.", 00420 num_haplotype_groups); 00421 } 00422 } 00423 00424 TargetTable::~TargetTable() { 00425 foreach(Target* targ, _targ_map) { 00426 delete targ; 00427 } 00428 } 00429 00430 void TargetTable::add_targ(const string& name, const string& seq, bool prob_seq, 00431 bool known_aux_params, double alpha, 00432 const TransIndex& targ_index, 00433 const TransIndex& targ_lengths) { 00434 TransIndex::const_iterator it = targ_index.find(name); 00435 if (it == targ_index.end()) { 00436 logger.warn("Target '%s' exists in MultiFASTA but not alignment " 00437 "(SAM/BAM) file.", name.c_str()); 00438 return; 00439 } 00440 00441 if (targ_lengths.find(name)->second != seq.length()) { 00442 logger.severe("Target '%s' differs in length between MultiFASTA and " 00443 "alignment (SAM/BAM) files (%d vs. %d).", name.c_str(), 00444 seq.length(), targ_lengths.find(name)->second); 00445 } 00446 00447 const Library& lib = _libs->curr_lib(); 00448 const BiasBoss* known_bias_boss = (known_aux_params) ? lib.bias_table.get() 00449 : NULL; 00450 const LengthDistribution* known_fld = (known_aux_params) ? lib.fld.get() 00451 : NULL; 00452 00453 Target* targ = new Target(it->second, name, seq, prob_seq, alpha, _libs, 00454 known_bias_boss, known_fld); 00455 if (lib.bias_table && !known_aux_params) { 00456 (lib.bias_table)->update_expectations(*targ); 00457 } 00458 _targ_map[targ->id()] = targ; 00459 targ->bundle(_bundle_table.create_bundle(targ)); 00460 } 00461 00462 Target* TargetTable::get_targ(TargID id) { 00463 return _targ_map[id]; 00464 } 00465 00466 Bundle* TargetTable::merge_bundles(Bundle* b1, Bundle* b2) { 00467 if (b1 != b2) { 00468 return _bundle_table.merge(b1, b2); 00469 } 00470 return b1; 00471 } 00472 00473 void TargetTable::round_reset() { 00474 foreach(Target* targ, _targ_map) { 00475 targ->round_reset(); 00476 targ->bundle()->incr_mass(targ->mass(false)); 00477 } 00478 foreach(vector<Target*> hap_group, _haplotype_groups) { 00479 // Ownership is given to the targets. 00480 new HaplotypeHandler(hap_group); 00481 } 00482 } 00483 00484 void project_to_polytope(vector<Target*> bundle_targ, 00485 vector<double>& targ_counts, double bundle_counts) { 00486 vector<bool> polytope_bound(bundle_targ.size(), false); 00487 while (true) { 00488 double unbound_counts = 0; 00489 double bound_counts = 0; 00490 for (size_t i = 0; i < bundle_targ.size(); ++i) { 00491 Target& targ = *bundle_targ[i]; 00492 00493 if (targ_counts[i] > targ.tot_counts()) { 00494 targ_counts[i] = targ.tot_counts(); 00495 polytope_bound[i] = true; 00496 } else if (targ_counts[i] < targ.uniq_counts()) { 00497 targ_counts[i] = targ.uniq_counts(); 00498 polytope_bound[i] = true; 00499 } 00500 00501 if (polytope_bound[i]) { 00502 bound_counts += targ_counts[i]; 00503 } else { 00504 unbound_counts += targ_counts[i]; 00505 } 00506 } 00507 00508 if (approx_eq(unbound_counts + bound_counts, bundle_counts)) { 00509 return; 00510 } 00511 00512 if (unbound_counts == 0) { 00513 polytope_bound = vector<bool>(bundle_targ.size(), false); 00514 unbound_counts = bound_counts; 00515 bound_counts = 0; 00516 } 00517 00518 double normalizer = (bundle_counts - bound_counts)/unbound_counts; 00519 for (size_t i = 0; i < bundle_targ.size(); ++i) { 00520 if (!polytope_bound[i]) { 00521 targ_counts[i] *= normalizer; 00522 } 00523 } 00524 } 00525 } 00526 00527 struct Result { 00528 double fpkm, fpkm_std_dev, fpkm_lo, fpkm_hi; 00529 double count_alpha, count_beta; 00530 double est_counts, eff_len, eff_counts; 00531 double cpb; // counts per base for TPM calculation 00532 00533 void set_zeros() { 00534 fpkm = fpkm_std_dev = fpkm_lo = fpkm_hi = 00535 count_alpha = count_beta = est_counts = 00536 eff_len = eff_counts = cpb = 0.0; 00537 } 00538 }; 00539 00540 void TargetTable::masses_to_counts() { 00541 foreach (Bundle* bundle, _bundle_table.bundles()) { 00542 00543 const vector<Target*>& bundle_targ = *(bundle->targets()); 00544 00545 // Calculate total counts for bundle and bundle-level rho 00546 // Do not include pseudo-mass because it will screw up multi-round results 00547 double l_bundle_mass = LOG_0; 00548 foreach (Target* targ, bundle_targ) { 00549 l_bundle_mass = log_add(l_bundle_mass, targ->mass(false)); 00550 } 00551 00552 if (bundle->counts()) { 00553 double l_bundle_counts = log((double)bundle->counts()); 00554 double l_var_renorm = 2*(l_bundle_counts - l_bundle_mass); 00555 00556 vector<double> targ_counts(bundle_targ.size(),0); 00557 bool requires_projection = false; 00558 00559 for (size_t i = 0; i < bundle_targ.size(); ++i) { 00560 Target& targ = *bundle_targ[i]; 00561 double l_targ_frac = targ.mass(false) - l_bundle_mass; 00562 targ_counts[i] = sexp(l_targ_frac + l_bundle_counts); 00563 requires_projection |= targ_counts[i] > (double)targ.tot_counts() || 00564 targ_counts[i] < (double)targ.uniq_counts(); 00565 } 00566 00567 if (bundle_targ.size() > 1 && requires_projection) { 00568 project_to_polytope(bundle_targ, targ_counts, bundle->counts()); 00569 } 00570 00571 // Calculate individual counts and rhos 00572 for (size_t i = 0; i < bundle_targ.size(); ++i) { 00573 Target& targ = *bundle_targ[i]; 00574 double mass = targ.mass(false); 00575 targ._curr_params.mass = log((double)targ_counts[i]); 00576 targ._curr_params.mass_var = min(targ.mass_var(), 00577 mass + log_sub(l_bundle_mass, mass)) 00578 + l_var_renorm; 00579 targ._curr_params.var_sum = targ.var_sum() + l_var_renorm; 00580 } 00581 } 00582 00583 bundle->reset_mass(); 00584 bundle->incr_mass(log((double)bundle->counts())); 00585 } 00586 } 00587 00588 void TargetTable::output_results(string output_dir, size_t tot_counts, 00589 bool output_varcov, bool output_rdds) { 00590 FILE * expr_file = fopen((output_dir + "/results.xprs").c_str(), "w"); 00591 ofstream varcov_file; 00592 ofstream rdds_file; 00593 if (output_varcov) { 00594 varcov_file.open((output_dir + "/varcov.xprs").c_str()); 00595 } 00596 if (output_rdds) { 00597 rdds_file.open((output_dir + "/rdds.xprs").c_str()); 00598 rdds_file << "target_id\tposition\tp_value\tref_nuc\tP(A)\tP(C)\tP(G)\t" 00599 << "P(T)\tobs_A\tobs_C\tobs_G\tobs_T\texp_A\texp_C\texp_G\texp_T" 00600 << endl; 00601 } 00602 fprintf(expr_file, "bundle_id\ttarget_id\tlength\teff_length\ttot_counts\t" 00603 "uniq_counts\test_counts\teff_counts\tambig_distr_alpha\t" 00604 "ambig_distr_beta\tfpkm\tfpkm_conf_low\tfpkm_conf_high\t" 00605 "solvable\ttpm\n"); 00606 00607 const double l_bil = log(1000000000.); 00608 const double l_tot_counts = log((double)tot_counts); 00609 00610 vector<Result> res(size()); 00611 00612 size_t bundle_id = 0; 00613 size_t t_id = 0; 00614 foreach (Bundle* bundle, _bundle_table.bundles()) { 00615 ++bundle_id; 00616 00617 const vector<Target*>& bundle_targ = *(bundle->targets()); 00618 00619 if (output_varcov) { 00620 varcov_file << ">" << bundle_id << ": "; 00621 for (size_t i = 0; i < bundle_targ.size(); ++i) { 00622 if (i) { 00623 varcov_file << ", "; 00624 } 00625 varcov_file << bundle_targ[i]->name(); 00626 } 00627 varcov_file << endl; 00628 } 00629 00630 // Calculate total counts for bundle and bundle-level rho 00631 // Do not include pseudo-mass because it will screw up multi-round results 00632 double l_bundle_mass = LOG_0; 00633 foreach (Target* targ, bundle_targ) { 00634 l_bundle_mass = log_add(l_bundle_mass, targ->mass(false)); 00635 } 00636 00637 if (bundle->counts()) { 00638 const double l_bundle_counts = log((double)bundle->counts()); 00639 const double l_var_renorm = 2*(l_bundle_counts - l_bundle_mass); 00640 00641 vector<double> targ_counts(bundle_targ.size(),0); 00642 bool requires_projection = false; 00643 00644 for (size_t i = 0; i < bundle_targ.size(); ++i) { 00645 Target& targ = *bundle_targ[i]; 00646 const double l_targ_frac = targ.mass(false) - l_bundle_mass; 00647 targ_counts[i] = sexp(l_targ_frac + l_bundle_counts); 00648 requires_projection |= targ_counts[i] > (double)targ.tot_counts() || 00649 targ_counts[i] < (double)targ.uniq_counts(); 00650 } 00651 00652 if (bundle_targ.size() > 1 && requires_projection) { 00653 project_to_polytope(bundle_targ, targ_counts, bundle->counts()); 00654 } 00655 00656 // Calculate individual counts and rhos 00657 for (size_t i = 0; i < bundle_targ.size(); ++i) { 00658 Target& targ = *bundle_targ[i]; 00659 const double l_eff_len = targ.est_effective_length(); 00660 00661 // Calculate count variance 00662 const double mass = targ.mass(false); 00663 const double mass_var = min(targ.mass_var(), 00664 mass + log_sub(l_bundle_mass, mass)); 00665 double count_alpha = 0; 00666 double count_beta = 0; 00667 double count_var = 0; 00668 00669 if (targ.tot_counts() != targ.uniq_counts()) { 00670 double n = targ.tot_counts()-targ.uniq_counts(); 00671 if (targ_counts[i] == 0) { 00672 count_var = n * (n + 2.) / 12.; 00673 count_alpha = 0; 00674 count_beta = 0; 00675 } else if (targ.solvable()) { 00676 double m = (targ_counts[i] - targ.uniq_counts())/n; 00677 assert (m >= 0 && m <= 1); 00678 m = max(m, EPSILON); 00679 m = min(m, 1-EPSILON); 00680 m = log(m); 00681 double v = numeric_limits<double>::max(); 00682 if (sexp(targ.var_sum()) != 0 && targ.tot_ambig_mass() != LOG_0) { 00683 v = targ.var_sum() - targ.tot_ambig_mass(); 00684 } 00685 v = min(v, m + log_sub(log_sub(LOG_1, m), LOG_EPSILON - log(2.))); 00686 00687 count_alpha = m + (log_sub(log_add(m,v), m+m)) - v; 00688 count_alpha = sexp(min(LOG_MAX, count_alpha)); 00689 00690 count_beta = log_sub(LOG_1, m) + log_sub(log_add(m,v), m+m)- v; 00691 count_beta = sexp(min(LOG_MAX, count_beta)); 00692 00693 count_var = mass_var; 00694 00695 assert(count_alpha > 0 && count_beta > 0); 00696 assert(!isinf(count_alpha) && !isinf(count_beta)); 00697 assert(!isnan(count_var)); 00698 } else { 00699 count_var = n * (n + 2.) / 12.; 00700 count_alpha = 1; 00701 count_beta = 1; 00702 } 00703 } 00704 00705 t_id = targ.id(); 00706 00707 // Store results for output 00708 assert(t_id < size()); 00709 res[t_id].count_alpha = count_alpha; 00710 res[t_id].count_beta = count_beta; 00711 00712 double fpkm_constant = sexp(l_bil - l_eff_len - l_tot_counts); 00713 res[t_id].fpkm_std_dev = sexp(0.5*(mass_var + l_var_renorm)); 00714 res[t_id].fpkm = targ_counts[i] * fpkm_constant; 00715 res[t_id].fpkm_lo = max(0.0, 00716 (targ_counts[i] - 00717 2*res[t_id].fpkm_std_dev) * fpkm_constant); 00718 res[t_id].fpkm_hi = (targ_counts[i] + 00719 2*res[t_id].fpkm_std_dev) * fpkm_constant; 00720 00721 res[t_id].est_counts = targ_counts[i]; 00722 res[t_id].eff_len = sexp(l_eff_len); 00723 res[t_id].eff_counts = targ_counts[i] / res[t_id].eff_len * targ.length(); 00724 00725 res[t_id].cpb = targ_counts[i] / res[t_id].eff_len; 00726 00727 if (output_varcov) { 00728 for (size_t j = 0; j < bundle_targ.size(); ++j) { 00729 if (j) { 00730 varcov_file << "\t"; 00731 } 00732 if (i==j) { 00733 varcov_file << scientific << sexp(get_covar(targ.id(), targ.id()) 00734 + l_var_renorm); 00735 } else { 00736 varcov_file << scientific 00737 << -sexp(get_covar(targ.id(), bundle_targ[j]->id()) 00738 + l_var_renorm); 00739 } 00740 } 00741 varcov_file << endl; 00742 } 00743 00744 if (output_rdds) { 00745 const Sequence& targ_seq = targ.seq(); 00746 vector<double> p_vals; 00747 targ_seq.calc_p_vals(p_vals); 00748 for (size_t i = 0; i < p_vals.size(); ++i) { 00749 if (p_vals[i] < 0.01) { 00750 rdds_file << targ.name() << "\t" << i << "\t" << p_vals[i] 00751 << "\t" << NUCS[targ_seq.get_ref(i)]; 00752 for (size_t nuc=0; nuc < NUM_NUCS; nuc++) { 00753 rdds_file << "\t" << sexp(targ_seq.get_prob(i,nuc)); 00754 } 00755 for (size_t nuc=0; nuc < NUM_NUCS; nuc++) { 00756 rdds_file << "\t" << sexp(targ_seq.get_obs(i,nuc)); 00757 } 00758 for (size_t nuc=0; nuc < NUM_NUCS; nuc++) { 00759 rdds_file << "\t" << sexp(targ_seq.get_exp(i,nuc)); 00760 } 00761 rdds_file << endl; 00762 } 00763 } 00764 } 00765 } 00766 } else { 00767 for (size_t i = 0; i < bundle_targ.size(); ++i) { 00768 00769 00770 if (output_varcov) { 00771 for (size_t j = 0; j < bundle_targ.size(); ++j) { 00772 if (j) { 00773 varcov_file << "\t"; 00774 } 00775 varcov_file << scientific << 0.0; 00776 } 00777 varcov_file << endl; 00778 } 00779 } 00780 } 00781 } 00782 00783 // Calculate total counts per base 00784 double cpb_sum = 0.0; 00785 for (size_t i = 0; i < size(); ++i) { 00786 cpb_sum += res[i].cpb; 00787 } 00788 00789 // Calculate TPMs and output results 00790 const double l_mil = log(1000000.); 00791 bundle_id = 0; 00792 t_id = 0; 00793 foreach (Bundle* bundle, _bundle_table.bundles()) { 00794 ++bundle_id; 00795 00796 const vector<Target*>& bundle_targ = *(bundle->targets()); 00797 00798 for (size_t i = 0; i < bundle_targ.size(); ++i) { 00799 00800 Target& targ = *bundle_targ[i]; 00801 00802 t_id = targ.id(); 00803 00804 double tpm; 00805 00806 if (bundle->counts()) { 00807 double trans_frac = log(res[t_id].cpb / cpb_sum); 00808 tpm = sexp(trans_frac + l_mil); 00809 } else { 00810 res[targ.id()].set_zeros(); 00811 tpm = 0.0; 00812 } 00813 00814 fprintf(expr_file, "" SIZE_T_FMT "\t%s\t" SIZE_T_FMT "\t%f\t" SIZE_T_FMT 00815 "\t" SIZE_T_FMT "\t%f\t%f\t%e\t%e\t%e\t%e\t%e\t%c\t%e\n", 00816 bundle_id, targ.name().c_str(), targ.length(), res[t_id].eff_len, 00817 targ.tot_counts(), targ.uniq_counts(), res[t_id].est_counts, 00818 res[t_id].eff_counts, res[t_id].count_alpha, res[t_id].count_beta, 00819 res[t_id].fpkm, res[t_id].fpkm_lo, res[t_id].fpkm_hi, 00820 (targ.solvable())?'T':'F', tpm); 00821 ++t_id; 00822 } 00823 } 00824 fclose(expr_file); 00825 if (output_varcov) { 00826 varcov_file.close(); 00827 } 00828 if (output_rdds) { 00829 rdds_file.close(); 00830 } 00831 } 00832 00833 double TargetTable::total_fpb() const { 00834 boost::unique_lock<boost::mutex>(_fpb_mut); 00835 return _total_fpb; 00836 } 00837 00838 void TargetTable::update_total_fpb(double incr_amt) { 00839 boost::unique_lock<boost::mutex>(_fpb_mut); 00840 _total_fpb = log_add(_total_fpb, incr_amt); 00841 } 00842 00843 void TargetTable::asynch_bias_update(boost::mutex* mutex) { 00844 BiasBoss* bg_table = NULL; 00845 boost::scoped_ptr<BiasBoss> bias_table; 00846 boost::scoped_ptr<LengthDistribution> fld; 00847 00848 bool burned_out_before = false; 00849 00850 const Library& lib = _libs->curr_lib(); 00851 00852 while(running) { 00853 if (bg_table) { 00854 bg_table->normalize_expectations(); 00855 } 00856 { 00857 boost::unique_lock<boost::mutex> lock(*mutex); 00858 if(!fld) { 00859 fld.reset(new LengthDistribution(*(lib.fld))); 00860 } else { 00861 *fld = *(lib.fld); 00862 } 00863 if (lib.bias_table) { 00864 BiasBoss& lib_bias_table = *(lib.bias_table); 00865 if (!bias_table) { 00866 bias_table.reset(new BiasBoss(lib_bias_table)); 00867 } else { 00868 lib_bias_table.copy_expectations(*bg_table); 00869 bg_table->copy_observations(lib_bias_table); 00870 bias_table.reset(bg_table); 00871 } 00872 bg_table = new BiasBoss(lib_bias_table.order(), 0); 00873 } 00874 logger.info("Synchronized auxiliary parameter tables."); 00875 } 00876 00877 if (!edit_detect && burned_out && burned_out_before) { 00878 break; 00879 } 00880 00881 burned_out_before = burned_out; 00882 00883 vector<double> fl_cdf = fld->cmf(); 00884 00885 // Buffer results of long computations. 00886 foreach(Target* targ, _targ_map) { 00887 targ->lock(); 00888 targ->update_target_bias_buffer(bias_table.get(), fld.get()); 00889 if (bg_table) { 00890 bg_table->update_expectations(*targ, targ->rho(), fl_cdf); 00891 } 00892 targ->unlock(); 00893 } 00894 { 00895 boost::unique_lock<boost::mutex> lock(*mutex); 00896 // Do quick atomic swap 00897 foreach(Target* targ, _targ_map) { 00898 targ->lock(); 00899 } 00900 foreach(Target* targ, _targ_map) { 00901 targ->swap_bias_parameters(); 00902 targ->unlock(); 00903 } 00904 } 00905 } 00906 00907 if (bg_table) { 00908 delete bg_table; 00909 } 00910 }