eXpress “1.5”

src/targets.cpp

00001 
00009 #include "main.h"
00010 #include "targets.h"
00011 #include "lengthdistribution.h"
00012 #include "fragments.h"
00013 #include "biascorrection.h"
00014 #include "mismatchmodel.h"
00015 #include "mapparser.h"
00016 #include "library.h"
00017 #include <iostream>
00018 #include <fstream>
00019 #include <cassert>
00020 #include <stdio.h>
00021 #include <limits>
00022 #include <float.h>
00023 
00024 using namespace std;
00025 
00026 Target::Target(TargID id, const std::string& name, const std::string& seq,
00027                bool prob_seq, double alpha, const Librarian* libs,
00028                const BiasBoss* known_bias_boss, const LengthDistribution* known_fld)
00029    : _libs(libs),
00030      _id(id),
00031      _name(name),
00032      _seq_f(seq, 0, prob_seq),
00033      _seq_r(_seq_f),
00034      _alpha(log(alpha)),
00035      _ret_params(&_curr_params),
00036      _uniq_counts(0),
00037      _tot_counts(0),
00038      _avg_bias(0),
00039      _avg_bias_buffer(0),
00040      _solvable(false) {
00041   if ((_libs->curr_lib()).bias_table) {
00042     _start_bias.reset(new std::vector<float>(seq.length(),0));
00043     _start_bias_buffer.reset(new std::vector<float>(seq.length(),0));
00044     _end_bias.reset(new std::vector<float>(seq.length(),0));
00045     _end_bias_buffer.reset(new std::vector<float>(seq.length(),0));
00046   }
00047   update_target_bias_buffer(known_bias_boss, known_fld);
00048   swap_bias_parameters();
00049   _init_pseudo_mass = _cached_eff_len + _alpha;
00050 }
00051 
00052 void Target::add_hit(const FragHit& hit, double v, double m) {
00053   double p = hit.params()->posterior;
00054   _curr_params.mass = log_add(_curr_params.mass, p+m);
00055   double mass_with_pseudo = log_add(_ret_params->mass, _init_pseudo_mass);
00056   if (p != LOG_1 || v != LOG_0) {
00057     if (p != LOG_0) {
00058       _curr_params.ambig_mass = log_add(_curr_params.ambig_mass, p+m);
00059       _curr_params.tot_ambig_mass = log_add(_curr_params.tot_ambig_mass, m);
00060     }
00061     double p_hat = _curr_params.ambig_mass;
00062     if (_curr_params.tot_ambig_mass != LOG_0) {
00063       p_hat -= _curr_params.tot_ambig_mass;
00064     } else {
00065       assert(p_hat == LOG_0);
00066     }
00067     assert(p_hat == LOG_0 || p_hat <= LOG_1);
00068     _curr_params.var_sum = min(log_add(_curr_params.var_sum, v + m),
00069                                _curr_params.tot_ambig_mass + p_hat
00070                                + log_sub(LOG_1, p_hat));
00071     double var_update = log_add(p + 2*m, v + 2*m);
00072     _curr_params.mass_var = min(log_add(_curr_params.mass_var, var_update),
00073                                 mass_with_pseudo + log_sub(_bundle->mass(),
00074                                                       mass_with_pseudo));
00075   }
00076   if (_curr_params.haplotype) {
00077     _curr_params.haplotype->update_mass(this, hit.frag_name(),
00078                                         hit.params()->align_likelihood, p);
00079   }
00080   (_libs->curr_lib()).targ_table->update_total_fpb(m - _cached_eff_len);
00081 }
00082 
00083 void Target::round_reset() {
00084   _last_params = _curr_params;
00085   _curr_params = RoundParams();
00086   _ret_params = &_last_params;
00087   _init_pseudo_mass = LOG_0;
00088 }
00089 
00090 double Target::rho() const {
00091   double eff_len = cached_effective_length(false);
00092   if (eff_len == LOG_0) {
00093       return LOG_0;
00094   }
00095 
00096   return mass(true) - eff_len - (_libs->curr_lib()).targ_table->total_fpb();
00097 }
00098 
00099 double Target::mass(bool with_pseudo) const {
00100   if (!with_pseudo) {
00101     return _ret_params->mass;
00102   }
00103   return log_add(_ret_params->mass, _alpha+_cached_eff_len+_avg_bias);
00104 }
00105 
00106 double Target::mass_var() const {
00107   return _ret_params->mass_var;
00108 }
00109 
00110 double Target::sample_likelihood(bool with_pseudo,
00111                                  const vector<const Target*>* neighbors) const {
00112   const Library& lib = _libs->curr_lib();
00113 
00114   double ll = LOG_1;
00115   double tot_mass = mass(with_pseudo);
00116   double tot_eff_len = cached_effective_length(lib.bias_table);
00117   if (neighbors) {
00118     foreach (const Target* neighbor, *neighbors) {
00119       tot_mass = log_add(tot_mass, neighbor->mass(with_pseudo));
00120       tot_eff_len = log_add(tot_eff_len,
00121                             neighbor->cached_effective_length(lib.bias_table));
00122     }
00123   }
00124   ll += tot_mass - tot_eff_len;
00125   assert(!isnan(ll));
00126   return ll;
00127 }
00128 
00129 double Target::align_likelihood(const FragHit& frag) const {
00130 
00131   const Library& lib = _libs->curr_lib();
00132 
00133   double ll = LOG_1;
00134 
00135   const PairStatus ps = frag.pair_status();
00136 
00137   if (lib.mismatch_table) {
00138     ll += (lib.mismatch_table)->log_likelihood(frag);
00139   }
00140 
00141   if (lib.bias_table) {
00142     if (ps != RIGHT_ONLY) {
00143       ll += _start_bias->at(frag.left());
00144     }
00145     if (ps != LEFT_ONLY) {
00146       ll += _end_bias->at(frag.right() - 1);
00147     }
00148   }
00149   
00150   if (ps == PAIRED) {
00151     ll += (lib.fld)->pmf(frag.length());
00152   } else if (ps == LEFT_ONLY && length() - frag.left() < (lib.fld)->max_val()) {
00153     ll += (lib.fld)->cmf(length() - frag.left());
00154   } else if (ps == RIGHT_ONLY && frag.right() < (lib.fld)->max_val()) {
00155     ll += (lib.fld)->cmf(frag.right());
00156   }
00157 
00158   assert(!(isnan(ll)||isinf(ll)));
00159   return ll;
00160 }
00161 
00162 double Target::est_effective_length(const LengthDistribution* fld,
00163                                     bool with_bias) const {
00164   if (!fld) {
00165     fld = (_libs->curr_lib()).fld.get();
00166   }
00167 
00168   double eff_len = LOG_0;
00169 
00170   double log_length = log((double)length());
00171   if (log_length < fld->mean()) {
00172     eff_len = log_length;
00173   } else {
00174     for(size_t l = fld->min_val(); l <= min(length(), fld->max_val()); l++) {
00175       eff_len = log_add(eff_len, fld->pmf(l)+log((double)length()-l+1));
00176     }
00177   }
00178   
00179   if (with_bias) {
00180     eff_len += _avg_bias;
00181   }
00182 
00183   return eff_len;
00184 }
00185 
00186 double Target::cached_effective_length(bool with_bias) const {
00187   if (with_bias) {
00188     return _cached_eff_len + _avg_bias;
00189   }
00190   return _cached_eff_len;
00191 }
00192 
00193 void Target::update_target_bias_buffer(const BiasBoss* bias_table,
00194                                        const LengthDistribution* fld) {
00195   if (bias_table) {
00196     _avg_bias_buffer = bias_table->get_target_bias(*_start_bias_buffer,
00197                                                    *_end_bias_buffer, *this);
00198   }
00199   assert(!isnan(_avg_bias_buffer) && !isinf(_avg_bias_buffer));
00200   _cached_eff_len_buffer = est_effective_length(fld, false);
00201 }
00202 
00203 void Target::swap_bias_parameters() {
00204   _cached_eff_len = _cached_eff_len_buffer;
00205   _avg_bias = _avg_bias_buffer;
00206   _start_bias.swap(_start_bias_buffer);
00207   _end_bias.swap(_end_bias_buffer);
00208 }
00209 
00210 void HaplotypeHandler::commit_buffer() {
00211   if (_committed) {
00212     return;
00213   }
00214   
00215   bool all_eq = true;
00216   foreach (double val, _align_likelihoods_buff) {
00217     all_eq &= (val == _align_likelihoods_buff[0]);
00218   }
00219   if (!all_eq) {
00220     for (size_t i = 0; i < _targets.size(); ++i) {
00221       _haplo_taus.increment(0, i, _masses_buff[i]);
00222     }
00223   }
00224   
00225   for (size_t i = 0; i < _targets.size(); ++i) {
00226     _align_likelihoods_buff[i] = LOG_0;
00227     _masses_buff[i] = LOG_0;
00228   }
00229   _committed = true;
00230 }
00231 
00232 HaplotypeHandler::HaplotypeHandler(vector<Target*> targets, double alpha=1)
00233     : _haplo_taus(1, targets.size(), alpha),
00234       _frag_name_buff(""),
00235       _align_likelihoods_buff(vector<double>(targets.size(), LOG_0)),
00236       _masses_buff(vector<double>(targets.size(), LOG_0)),
00237       _committed(true) {
00238   
00239   boost::shared_ptr<HaplotypeHandler> handler(this);
00240   foreach(Target* targ, targets) {
00241     _targets.push_back(targ);
00242     targ->haplotype(handler);
00243   }
00244 }
00245 
00246 size_t HaplotypeHandler::find_target(const Target* targ) {
00247   for (size_t i = 0; i < _targets.size(); ++i) {
00248     if (_targets[i] == targ) {
00249       assert(_targets[i]->id() == targ->id());
00250       return i;
00251     }
00252   }
00253   logger.severe("Haplotype target not found.");
00254   return SSIZE_MAX;
00255 }
00256 
00257 double HaplotypeHandler::get_mass(const Target* targ, bool with_pseudo) {
00258   commit_buffer();
00259   
00260   TargID i = find_target(targ);;
00261   
00262   double total_mass = LOG_0;
00263   foreach(const Target* targ, _targets) {
00264     total_mass = log_add(total_mass, targ->_ret_params->mass);
00265     if (with_pseudo){
00266       total_mass = log_add(total_mass, targ->cached_effective_length());
00267     }
00268   }
00269   
00270   return _haplo_taus(0, i) + total_mass;
00271 }
00272 
00273 void HaplotypeHandler::update_mass(const Target* targ, const string& frag_name,
00274                                    double align_likelihood, double mass) {
00275   if (frag_name != _frag_name_buff) {
00276     commit_buffer();
00277     _frag_name_buff = frag_name;
00278   } else {
00279     assert(!_committed);
00280   }
00281   
00282   TargID i = find_target(targ);
00283 
00284   _align_likelihoods_buff[i] = log_add(_align_likelihoods_buff[i],
00285                                        align_likelihood);
00286   _masses_buff[i] = log_add(_masses_buff[i], mass);
00287   
00288   _committed = false;
00289 }
00290 
00291 
00292 TargetTable::TargetTable(string targ_fasta_file, string haplotype_file,
00293                          bool prob_seqs, bool known_aux_params, double alpha,
00294                          const AlphaMap* alpha_map, const Librarian* libs)
00295     :  _libs(libs) {
00296   string info_msg = "Loading target sequences";
00297   const Library& lib = _libs->curr_lib();
00298   const TransIndex& targ_index = lib.map_parser->targ_index();
00299   const TransIndex& targ_lengths = lib.map_parser->targ_lengths();
00300   if (lib.bias_table && !known_aux_params) {
00301     info_msg += " and measuring bias background";
00302   }
00303   info_msg += "...";
00304   logger.info(info_msg.c_str());
00305 
00306   size_t num_targs = targ_index.size();
00307   _targ_map = vector<Target*>(num_targs, NULL);
00308   _total_fpb = log(alpha*num_targs);
00309 
00310   boost::unordered_set<string> target_names;
00311       
00312   ifstream infile (targ_fasta_file.c_str());
00313   string line;
00314   string seq = "";
00315   string name = "";
00316   if (infile.is_open()) {
00317     while (infile.good()) {
00318       getline(infile, line, '\n');
00319       if (line.empty()) {
00320         continue;
00321       }
00322       if (line[0] == '>') {
00323         if (!name.empty()) {
00324           if (alpha_map) {
00325             alpha = alpha_map->find(name)->second;
00326           }
00327           add_targ(name, seq, prob_seqs, known_aux_params, alpha, targ_index,
00328                    targ_lengths);
00329         }
00330         name = line.substr(1,line.find(' ')-1);
00331         if (target_names.count(name)) {
00332           logger.severe("Target '%s' is duplicated in the input FASTA. Ensure "
00333                         "target names are unique and re-map before re-running "
00334                         "eXpress.", name.c_str());
00335         }
00336         if (alpha_map && !alpha_map->count(name)) {
00337           logger.severe("Target '%s' is was not found in the prior parameter "
00338                         "file.", name.c_str());
00339         }
00340         target_names.insert(name);
00341         seq = "";
00342       } else {
00343         seq += line;
00344       }
00345     }
00346     if (!name.empty()) {
00347       if (alpha_map) {
00348         alpha = alpha_map->find(name)->second;
00349       }
00350       add_targ(name, seq, prob_seqs, known_aux_params, alpha, targ_index,
00351                targ_lengths);
00352     }
00353 
00354     infile.close();
00355     if (lib.bias_table && !known_aux_params) {
00356       lib.bias_table->normalize_expectations();
00357     }
00358   } else {
00359     logger.severe("Unable to open MultiFASTA file '%s'.",
00360                   targ_fasta_file.c_str());
00361   }
00362 
00363   if (size() == 0) {
00364     logger.severe("No targets found in MultiFASTA file '%s'.",
00365                   targ_fasta_file.c_str());
00366   }
00367 
00368   for(TransIndex::const_iterator it = targ_index.begin();
00369       it != targ_index.end(); ++it) {
00370     if (!_targ_map[it->second]) {
00371       logger.severe("Sequence for target '%s' not found in MultiFASTA file "
00372                     "'%s'.", it->first.c_str(), targ_fasta_file.c_str());
00373     }
00374   }
00375   logger.info("Initialized %d targets.", size());
00376   
00377   // Load haplotype information, if provided
00378   if (haplotype_file.size()) {
00379     logger.info("Loading haplotype information...");
00380     infile.open(haplotype_file.c_str());
00381     size_t num_haplotype_groups = 0;
00382     if (infile.is_open()) {
00383       const size_t BUFF_SIZE = 99999;
00384       char line_buff[BUFF_SIZE];
00385       while (infile.good()) {
00386         infile.getline (line_buff, BUFF_SIZE, '\n');
00387 
00388         if (strlen(line_buff) == 0) {
00389           continue;
00390         }
00391         
00392         vector<Target*> haplotype_targets;
00393         char *p = strtok(line_buff, ",");
00394         do {
00395           if (targ_index.count(p) == 0) {
00396             logger.severe("Haplotype target '%s' does not exist in MultiFASTA "
00397                           "or alignment files.", p);
00398           }
00399           haplotype_targets.push_back(_targ_map[targ_index.at(p)]);
00400           p = strtok(NULL, ",");
00401         } while (p);
00402 
00403         if (haplotype_targets.size() < 2) {
00404           logger.severe("Haplotype groups must contain at least 2 targets.");
00405         }
00406 
00407         _haplotype_groups.insert(haplotype_targets);
00408         
00409         if (!alpha_map) {
00410           foreach(Target* targ, haplotype_targets) {
00411             targ->alpha(alpha/haplotype_targets.size());
00412           }
00413         }
00414         // Ownership is given to the targets.
00415         new HaplotypeHandler(haplotype_targets);
00416         num_haplotype_groups++;
00417       }
00418     }
00419     logger.info("Initialized " SIZE_T_FMT " haplotype groups.",
00420                 num_haplotype_groups);
00421   }
00422 }
00423 
00424 TargetTable::~TargetTable() {
00425   foreach(Target* targ, _targ_map) {
00426     delete targ;
00427   }
00428 }
00429 
00430 void TargetTable::add_targ(const string& name, const string& seq, bool prob_seq,
00431                            bool known_aux_params, double alpha,
00432                            const TransIndex& targ_index,
00433                            const TransIndex& targ_lengths) {
00434   TransIndex::const_iterator it = targ_index.find(name);
00435   if (it == targ_index.end()) {
00436     logger.warn("Target '%s' exists in MultiFASTA but not alignment "
00437                    "(SAM/BAM) file.", name.c_str());
00438     return;
00439   }
00440 
00441   if (targ_lengths.find(name)->second != seq.length()) {
00442     logger.severe("Target '%s' differs in length between MultiFASTA and "
00443                   "alignment (SAM/BAM) files (%d  vs. %d).", name.c_str(),
00444                   seq.length(), targ_lengths.find(name)->second);
00445   }
00446 
00447   const Library& lib = _libs->curr_lib();
00448   const BiasBoss* known_bias_boss = (known_aux_params) ? lib.bias_table.get()
00449                                                        : NULL;
00450   const LengthDistribution* known_fld = (known_aux_params) ? lib.fld.get()
00451                                                            : NULL;
00452   
00453   Target* targ = new Target(it->second, name, seq, prob_seq, alpha, _libs,
00454                             known_bias_boss, known_fld);
00455   if (lib.bias_table && !known_aux_params) {
00456     (lib.bias_table)->update_expectations(*targ);
00457   }
00458   _targ_map[targ->id()] = targ;
00459   targ->bundle(_bundle_table.create_bundle(targ));
00460 }
00461 
00462 Target* TargetTable::get_targ(TargID id) {
00463     return _targ_map[id];
00464 }
00465 
00466 Bundle* TargetTable::merge_bundles(Bundle* b1, Bundle* b2) {
00467   if (b1 != b2) {
00468     return _bundle_table.merge(b1, b2);
00469   }
00470   return b1;
00471 }
00472 
00473 void TargetTable::round_reset() {
00474   foreach(Target* targ, _targ_map) {
00475     targ->round_reset();
00476     targ->bundle()->incr_mass(targ->mass(false));
00477   }
00478   foreach(vector<Target*> hap_group, _haplotype_groups) {
00479     // Ownership is given to the targets.
00480     new HaplotypeHandler(hap_group);
00481   }
00482 }
00483 
00484 void project_to_polytope(vector<Target*> bundle_targ,
00485                          vector<double>& targ_counts, double bundle_counts) {
00486   vector<bool> polytope_bound(bundle_targ.size(), false);
00487   while (true) {
00488     double unbound_counts = 0;
00489     double bound_counts = 0;
00490     for (size_t i = 0; i < bundle_targ.size(); ++i) {
00491       Target& targ = *bundle_targ[i];
00492 
00493       if (targ_counts[i] > targ.tot_counts()) {
00494         targ_counts[i] = targ.tot_counts();
00495         polytope_bound[i] = true;
00496       } else if (targ_counts[i] < targ.uniq_counts()) {
00497         targ_counts[i] = targ.uniq_counts();
00498         polytope_bound[i] = true;
00499       }
00500 
00501       if (polytope_bound[i]) {
00502         bound_counts += targ_counts[i];
00503       } else {
00504         unbound_counts += targ_counts[i];
00505       }
00506     }
00507 
00508     if (approx_eq(unbound_counts + bound_counts, bundle_counts)) {
00509       return;
00510     }
00511    
00512      if (unbound_counts == 0) {
00513       polytope_bound = vector<bool>(bundle_targ.size(), false);
00514        unbound_counts = bound_counts;
00515        bound_counts = 0;
00516      }
00517 
00518     double normalizer = (bundle_counts - bound_counts)/unbound_counts;
00519     for (size_t i = 0; i < bundle_targ.size(); ++i) {
00520       if (!polytope_bound[i]) {
00521         targ_counts[i] *= normalizer;
00522       }
00523     }
00524   }
00525 }
00526 
00527 struct Result {
00528   double fpkm, fpkm_std_dev, fpkm_lo, fpkm_hi;
00529   double count_alpha, count_beta;
00530   double est_counts, eff_len, eff_counts;
00531   double cpb; // counts per base for TPM calculation
00532   
00533   void set_zeros() {
00534     fpkm = fpkm_std_dev = fpkm_lo = fpkm_hi =
00535     count_alpha = count_beta = est_counts =
00536     eff_len = eff_counts = cpb = 0.0;
00537   }
00538 };
00539 
00540 void TargetTable::masses_to_counts() {
00541   foreach (Bundle* bundle, _bundle_table.bundles()) {
00542     
00543     const vector<Target*>& bundle_targ = *(bundle->targets());
00544 
00545     // Calculate total counts for bundle and bundle-level rho
00546     // Do not include pseudo-mass because it will screw up multi-round results
00547     double l_bundle_mass = LOG_0;
00548     foreach (Target* targ, bundle_targ) {
00549       l_bundle_mass = log_add(l_bundle_mass, targ->mass(false));
00550     }
00551     
00552     if (bundle->counts()) {
00553       double l_bundle_counts = log((double)bundle->counts());
00554       double l_var_renorm = 2*(l_bundle_counts - l_bundle_mass);
00555 
00556       vector<double> targ_counts(bundle_targ.size(),0);
00557       bool requires_projection = false;
00558       
00559       for (size_t i = 0; i < bundle_targ.size(); ++i) {
00560         Target& targ = *bundle_targ[i];
00561         double l_targ_frac = targ.mass(false) - l_bundle_mass;
00562         targ_counts[i] = sexp(l_targ_frac + l_bundle_counts);
00563         requires_projection |= targ_counts[i] > (double)targ.tot_counts() ||
00564         targ_counts[i] < (double)targ.uniq_counts();
00565       }
00566       
00567       if (bundle_targ.size() > 1 && requires_projection) {
00568         project_to_polytope(bundle_targ, targ_counts, bundle->counts());
00569       }
00570       
00571       // Calculate individual counts and rhos
00572       for (size_t i = 0; i < bundle_targ.size(); ++i) {
00573         Target& targ = *bundle_targ[i];
00574         double mass = targ.mass(false);
00575         targ._curr_params.mass = log((double)targ_counts[i]);
00576         targ._curr_params.mass_var = min(targ.mass_var(),
00577                                          mass + log_sub(l_bundle_mass, mass))
00578                                      + l_var_renorm;
00579         targ._curr_params.var_sum = targ.var_sum() + l_var_renorm;
00580       }
00581     }
00582     
00583     bundle->reset_mass();
00584     bundle->incr_mass(log((double)bundle->counts()));
00585   }
00586 }
00587 
00588 void TargetTable::output_results(string output_dir, size_t tot_counts,
00589                                  bool output_varcov, bool output_rdds) {
00590   FILE * expr_file = fopen((output_dir + "/results.xprs").c_str(), "w");
00591   ofstream varcov_file;
00592   ofstream   rdds_file;
00593   if (output_varcov) {
00594       varcov_file.open((output_dir + "/varcov.xprs").c_str());
00595   }
00596   if (output_rdds) {
00597       rdds_file.open((output_dir + "/rdds.xprs").c_str());
00598       rdds_file << "target_id\tposition\tp_value\tref_nuc\tP(A)\tP(C)\tP(G)\t"
00599                << "P(T)\tobs_A\tobs_C\tobs_G\tobs_T\texp_A\texp_C\texp_G\texp_T"
00600                << endl;
00601   }
00602   fprintf(expr_file, "bundle_id\ttarget_id\tlength\teff_length\ttot_counts\t"
00603                      "uniq_counts\test_counts\teff_counts\tambig_distr_alpha\t"
00604                      "ambig_distr_beta\tfpkm\tfpkm_conf_low\tfpkm_conf_high\t"
00605                      "solvable\ttpm\n");
00606 
00607   const double l_bil = log(1000000000.);
00608   const double l_tot_counts = log((double)tot_counts);
00609   
00610   vector<Result> res(size());
00611   
00612   size_t bundle_id = 0;
00613   size_t t_id = 0;
00614   foreach (Bundle* bundle, _bundle_table.bundles()) {
00615     ++bundle_id;
00616 
00617     const vector<Target*>& bundle_targ = *(bundle->targets());
00618 
00619     if (output_varcov) {
00620       varcov_file << ">" << bundle_id << ": ";
00621       for (size_t i = 0; i < bundle_targ.size(); ++i) {
00622         if (i) {
00623           varcov_file << ", ";
00624         }
00625         varcov_file << bundle_targ[i]->name();
00626       }
00627       varcov_file << endl;
00628     }
00629 
00630     // Calculate total counts for bundle and bundle-level rho
00631     // Do not include pseudo-mass because it will screw up multi-round results
00632     double l_bundle_mass = LOG_0;
00633     foreach (Target* targ, bundle_targ) {
00634       l_bundle_mass = log_add(l_bundle_mass, targ->mass(false));
00635     }
00636 
00637     if (bundle->counts()) {
00638       const double l_bundle_counts = log((double)bundle->counts());
00639       const double l_var_renorm = 2*(l_bundle_counts - l_bundle_mass);
00640 
00641       vector<double> targ_counts(bundle_targ.size(),0);
00642       bool requires_projection = false;
00643 
00644       for (size_t i = 0; i < bundle_targ.size(); ++i) {
00645         Target& targ = *bundle_targ[i];
00646         const double l_targ_frac = targ.mass(false) - l_bundle_mass;
00647         targ_counts[i] = sexp(l_targ_frac + l_bundle_counts);
00648         requires_projection |= targ_counts[i] > (double)targ.tot_counts() ||
00649                                targ_counts[i] < (double)targ.uniq_counts();
00650       }
00651 
00652       if (bundle_targ.size() > 1 && requires_projection) {
00653         project_to_polytope(bundle_targ, targ_counts, bundle->counts());
00654       }
00655 
00656       // Calculate individual counts and rhos
00657       for (size_t i = 0; i < bundle_targ.size(); ++i) {
00658         Target& targ = *bundle_targ[i];
00659         const double l_eff_len = targ.est_effective_length();
00660 
00661         // Calculate count variance
00662         const double mass = targ.mass(false);
00663         const double mass_var = min(targ.mass_var(),
00664                               mass + log_sub(l_bundle_mass, mass));
00665         double count_alpha = 0;
00666         double count_beta = 0;
00667         double count_var = 0;
00668 
00669         if (targ.tot_counts() != targ.uniq_counts()) {
00670           double n = targ.tot_counts()-targ.uniq_counts();
00671           if (targ_counts[i] == 0) {
00672             count_var = n * (n + 2.) / 12.;
00673             count_alpha = 0;
00674             count_beta = 0;
00675           } else if (targ.solvable()) {
00676             double m = (targ_counts[i] - targ.uniq_counts())/n;
00677             assert (m >= 0 && m <= 1);
00678             m = max(m, EPSILON);
00679             m = min(m, 1-EPSILON);
00680             m = log(m);
00681             double v = numeric_limits<double>::max();
00682             if (sexp(targ.var_sum()) != 0 && targ.tot_ambig_mass() != LOG_0) {
00683               v = targ.var_sum() - targ.tot_ambig_mass();
00684             }
00685             v = min(v, m + log_sub(log_sub(LOG_1, m), LOG_EPSILON - log(2.)));
00686 
00687             count_alpha = m + (log_sub(log_add(m,v), m+m)) - v;
00688             count_alpha = sexp(min(LOG_MAX, count_alpha));
00689 
00690             count_beta = log_sub(LOG_1, m) + log_sub(log_add(m,v), m+m)- v;
00691             count_beta = sexp(min(LOG_MAX, count_beta));
00692             
00693             count_var = mass_var;
00694             
00695             assert(count_alpha > 0 && count_beta > 0);
00696             assert(!isinf(count_alpha) && !isinf(count_beta));
00697             assert(!isnan(count_var));
00698           } else {
00699             count_var = n * (n + 2.) / 12.;
00700             count_alpha = 1;
00701             count_beta = 1;
00702           }
00703         }
00704         
00705         t_id = targ.id();
00706 
00707         // Store results for output
00708         assert(t_id < size());
00709         res[t_id].count_alpha = count_alpha;
00710         res[t_id].count_beta = count_beta;
00711         
00712         double fpkm_constant = sexp(l_bil - l_eff_len - l_tot_counts);
00713         res[t_id].fpkm_std_dev = sexp(0.5*(mass_var + l_var_renorm));
00714         res[t_id].fpkm = targ_counts[i] * fpkm_constant;
00715         res[t_id].fpkm_lo = max(0.0,
00716                              (targ_counts[i] -
00717                               2*res[t_id].fpkm_std_dev) * fpkm_constant);
00718         res[t_id].fpkm_hi = (targ_counts[i] +
00719                              2*res[t_id].fpkm_std_dev) * fpkm_constant;
00720         
00721         res[t_id].est_counts = targ_counts[i];
00722         res[t_id].eff_len = sexp(l_eff_len);
00723         res[t_id].eff_counts = targ_counts[i] / res[t_id].eff_len * targ.length();
00724         
00725         res[t_id].cpb = targ_counts[i] / res[t_id].eff_len;
00726 
00727         if (output_varcov) {
00728           for (size_t j = 0; j < bundle_targ.size(); ++j) {
00729             if (j) {
00730               varcov_file << "\t";
00731             }
00732             if (i==j) {
00733               varcov_file << scientific << sexp(get_covar(targ.id(), targ.id())
00734                                                + l_var_renorm);
00735             } else {
00736               varcov_file << scientific
00737                          << -sexp(get_covar(targ.id(), bundle_targ[j]->id())
00738                                   + l_var_renorm);
00739             }
00740           }
00741           varcov_file << endl;
00742         }
00743         
00744         if (output_rdds) {
00745           const Sequence& targ_seq = targ.seq();
00746           vector<double> p_vals;
00747           targ_seq.calc_p_vals(p_vals);
00748           for (size_t i = 0; i < p_vals.size(); ++i) {
00749             if (p_vals[i] < 0.01) {
00750               rdds_file << targ.name() << "\t" << i << "\t" << p_vals[i]
00751               << "\t" << NUCS[targ_seq.get_ref(i)];
00752               for (size_t nuc=0; nuc < NUM_NUCS; nuc++) {
00753                 rdds_file << "\t" << sexp(targ_seq.get_prob(i,nuc));
00754               }
00755               for (size_t nuc=0; nuc < NUM_NUCS; nuc++) {
00756                 rdds_file << "\t" << sexp(targ_seq.get_obs(i,nuc));
00757               }
00758               for (size_t nuc=0; nuc < NUM_NUCS; nuc++) {
00759                 rdds_file << "\t" << sexp(targ_seq.get_exp(i,nuc));
00760               }
00761               rdds_file << endl;
00762             }
00763           }
00764         }
00765       }
00766     } else {
00767       for (size_t i = 0; i < bundle_targ.size(); ++i) {
00768         
00769         
00770         if (output_varcov) {
00771           for (size_t j = 0; j < bundle_targ.size(); ++j) {
00772             if (j) {
00773               varcov_file << "\t";
00774             }
00775             varcov_file << scientific << 0.0;
00776           }
00777           varcov_file << endl;
00778         }
00779       }
00780     }
00781   }
00782   
00783   // Calculate total counts per base
00784   double cpb_sum = 0.0;
00785   for (size_t i = 0; i < size(); ++i) {
00786     cpb_sum += res[i].cpb;
00787   }
00788   
00789   // Calculate TPMs and output results
00790   const double l_mil = log(1000000.);
00791   bundle_id = 0;
00792   t_id = 0;
00793   foreach (Bundle* bundle, _bundle_table.bundles()) {
00794     ++bundle_id;
00795     
00796     const vector<Target*>& bundle_targ = *(bundle->targets());
00797     
00798     for (size_t i = 0; i < bundle_targ.size(); ++i) {
00799       
00800       Target& targ = *bundle_targ[i];
00801       
00802       t_id = targ.id();
00803       
00804       double tpm;
00805       
00806       if (bundle->counts()) {
00807         double trans_frac = log(res[t_id].cpb / cpb_sum);
00808         tpm = sexp(trans_frac + l_mil);
00809       } else {
00810         res[targ.id()].set_zeros();
00811         tpm = 0.0;
00812       }
00813       
00814       fprintf(expr_file, "" SIZE_T_FMT "\t%s\t" SIZE_T_FMT "\t%f\t" SIZE_T_FMT
00815               "\t" SIZE_T_FMT "\t%f\t%f\t%e\t%e\t%e\t%e\t%e\t%c\t%e\n",
00816               bundle_id, targ.name().c_str(), targ.length(), res[t_id].eff_len,
00817               targ.tot_counts(), targ.uniq_counts(), res[t_id].est_counts,
00818               res[t_id].eff_counts, res[t_id].count_alpha, res[t_id].count_beta,
00819               res[t_id].fpkm, res[t_id].fpkm_lo, res[t_id].fpkm_hi,
00820               (targ.solvable())?'T':'F', tpm);
00821       ++t_id;
00822     }
00823   }
00824   fclose(expr_file);
00825   if (output_varcov) {
00826     varcov_file.close();
00827   }
00828   if (output_rdds) {
00829     rdds_file.close();
00830   }
00831 }
00832 
00833 double TargetTable::total_fpb() const {
00834   boost::unique_lock<boost::mutex>(_fpb_mut);
00835   return _total_fpb;
00836 }
00837 
00838 void TargetTable::update_total_fpb(double incr_amt) {
00839   boost::unique_lock<boost::mutex>(_fpb_mut);
00840   _total_fpb = log_add(_total_fpb, incr_amt);
00841 }
00842 
00843 void TargetTable::asynch_bias_update(boost::mutex* mutex) {
00844   BiasBoss* bg_table = NULL;
00845   boost::scoped_ptr<BiasBoss> bias_table;
00846   boost::scoped_ptr<LengthDistribution> fld;
00847 
00848   bool burned_out_before = false;
00849 
00850   const Library& lib = _libs->curr_lib();
00851 
00852   while(running) {
00853     if (bg_table) {
00854       bg_table->normalize_expectations();
00855     }
00856     {
00857       boost::unique_lock<boost::mutex> lock(*mutex);
00858       if(!fld) {
00859         fld.reset(new LengthDistribution(*(lib.fld)));
00860       } else {
00861         *fld = *(lib.fld);
00862       }
00863       if (lib.bias_table) {
00864         BiasBoss& lib_bias_table = *(lib.bias_table);
00865         if (!bias_table) {
00866           bias_table.reset(new BiasBoss(lib_bias_table));
00867         } else {
00868           lib_bias_table.copy_expectations(*bg_table);
00869           bg_table->copy_observations(lib_bias_table);
00870           bias_table.reset(bg_table);
00871         }
00872         bg_table = new BiasBoss(lib_bias_table.order(), 0);
00873       }
00874       logger.info("Synchronized auxiliary parameter tables.");
00875     }
00876 
00877     if (!edit_detect && burned_out && burned_out_before) {
00878       break;
00879     }
00880 
00881     burned_out_before = burned_out;
00882 
00883     vector<double> fl_cdf = fld->cmf();
00884 
00885     // Buffer results of long computations.
00886     foreach(Target* targ, _targ_map) {
00887       targ->lock();
00888       targ->update_target_bias_buffer(bias_table.get(), fld.get());
00889       if (bg_table) {
00890         bg_table->update_expectations(*targ, targ->rho(), fl_cdf);
00891       }
00892       targ->unlock();
00893     }
00894     {
00895       boost::unique_lock<boost::mutex> lock(*mutex);
00896       // Do quick atomic swap
00897       foreach(Target* targ, _targ_map) {
00898         targ->lock();
00899       }
00900       foreach(Target* targ, _targ_map) {
00901         targ->swap_bias_parameters();
00902         targ->unlock();
00903       }
00904     }
00905   }
00906 
00907   if (bg_table) {
00908     delete bg_table;
00909   }
00910 }
 All Classes Functions Variables