eXpress “1.5”

src/biascorrection.cpp

00001 
00010 #include <algorithm>
00011 #include <boost/assign.hpp>
00012 #include <cassert>
00013 
00014 #include "biascorrection.h"
00015 
00016 #include "fragments.h"
00017 #include "frequencymatrix.h"
00018 #include "main.h"
00019 #include "sequence.h"
00020 #include "targets.h"
00021 
00022 using namespace std;
00023 
00024 // size of bias window
00025 const int WINDOW = 21;
00026 // center position in window
00027 const int CENTER = 11;
00028 // number of bases on either side of center
00029 const int SURROUND = 10;
00030 
00031 SeqWeightTable::SeqWeightTable(size_t window_size, size_t order, double alpha)
00032     : _order(order),
00033       _observed(order, window_size, window_size, alpha),
00034       _expected(order, window_size, order+1, EPSILON) {
00035 }
00036 
00037 SeqWeightTable::SeqWeightTable(size_t window_size, size_t order,
00038                                string param_file_name, string identifier)
00039     : _order(order),
00040       _observed(order, window_size, window_size, 0),
00041       _expected(order, window_size, order+1, 0) {
00042         
00043   //TODO: Allow for orders and window sizes to be read from param file.
00044   
00045   ifstream infile (param_file_name.c_str());
00046   const size_t BUFF_SIZE = 99999;
00047   char line_buff[BUFF_SIZE];
00048   if (!infile.is_open()) {
00049     logger.severe("Unable to open parameter file '%s'.",
00050                   param_file_name.c_str());
00051   }
00052 
00053   do {
00054     infile.getline (line_buff, BUFF_SIZE, '\n');
00055   } while (strncmp(line_buff, identifier.c_str(), identifier.size()));
00056   infile.getline (line_buff, BUFF_SIZE, '\n');
00057 
00058   do {
00059     infile.getline (line_buff, BUFF_SIZE, '\n');
00060   } while (strcmp(line_buff, "\tObserved Conditional Probabilities"));
00061   infile.getline (line_buff, BUFF_SIZE, '\n');
00062   
00063   //FG
00064   for (size_t k = 0; k < (size_t)WINDOW; ++k) {
00065     infile.getline (line_buff, BUFF_SIZE, '\n');
00066     char *p = strtok(line_buff, "\t");
00067     for (size_t i=0; i < pow(4.0, (double)min(order, k)); ++i) {
00068       for (size_t j=0; j < NUM_NUCS; ++j) {
00069         p = strtok(NULL, "\t");
00070         _observed.update(k, i, j, log(strtod(p,NULL)));
00071       }
00072     }
00073   }
00074   
00075   infile.getline (line_buff, BUFF_SIZE, '\n');
00076   infile.getline (line_buff, BUFF_SIZE, '\n');
00077   //BG
00078   for (size_t k = 0; k < order+1; ++k) {
00079     infile.getline (line_buff, BUFF_SIZE, '\n');
00080     char *p = strtok(line_buff, "\t");
00081     for (size_t i=0; i < pow(4.0, (double)min(order, k)); ++i) {
00082       for (size_t j=0; j < NUM_NUCS; ++j) {
00083         p = strtok(NULL, "\t");
00084         _expected.update(k, i, j, log(strtod(p,NULL)));
00085       }
00086     }
00087   }
00088 }
00089 
00090 
00091 void SeqWeightTable::copy_observed(const SeqWeightTable& other) {
00092   _observed = other._observed;
00093 }
00094 
00095 void SeqWeightTable::copy_expected(const SeqWeightTable& other) {
00096   _expected = other._expected;
00097 }
00098 
00099 void SeqWeightTable::increment_expected(const Sequence& seq, double mass,
00100                                         const vector<double>& fl_cdf) {
00101   _expected.fast_learn(seq, mass, fl_cdf);
00102 }
00103 
00104 void SeqWeightTable::normalize_expected() {
00105   _expected.calc_marginals();
00106 }
00107 
00108 void SeqWeightTable::increment_observed(const Sequence& seq, size_t i,
00109                                         double mass) {
00110   int left = (int)i - SURROUND;
00111   _observed.update(seq, left, mass);
00112 }
00113 
00114 double SeqWeightTable::get_weight(const Sequence& seq, size_t i) const {
00115   int left = (int)i - SURROUND;
00116   return _observed.seq_prob(seq, left) - _expected.seq_prob(seq, left);
00117 }
00118 
00119 void SeqWeightTable::append_output(ofstream& outfile) const {
00120   char buff[200];
00121   string header = "";
00122   for (int i = 0; i < WINDOW; i++) {
00123     sprintf(buff, "\t%d", i-CENTER+1);
00124     header += buff;
00125   }
00126   header += '\n';
00127 
00128   outfile << "\tObserved Marginal Distribution\n" << header;
00129 
00130   for (size_t j = 0; j < NUM_NUCS; j++) {
00131     outfile << NUCS[j] << ":\t";
00132     for (int i = 0; i < WINDOW; i++) {
00133       outfile << scientific << sexp(_observed.marginal_prob(i,j)) << "\t";
00134     }
00135     outfile<<endl;
00136   }
00137 
00138   outfile << "\tObserved Conditional Probabilities\nPosition\t";
00139 
00140   for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) {
00141     string s = "->";
00142     s += NUCS[j & 3];
00143     size_t cond = j >> 2;
00144     for (size_t k = 0; k < _order; ++k) {
00145       s = NUCS[cond & 3] + s;
00146       cond = cond >> 2;
00147     }
00148     outfile << s << '\t';
00149   }
00150   outfile << endl;
00151 
00152   for (int i = 0; i < WINDOW; i++) {
00153     outfile << i-SURROUND << ":\t";
00154     for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) {
00155       outfile << scientific << sexp(_observed.transition_prob(i,j>>2,j&3))
00156               << "\t";
00157     }
00158     outfile<<endl;
00159   }
00160 
00161   outfile << "\tBackground Conditional Probabilities\nPosition\t";
00162 
00163   for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) {
00164     string s = "->";
00165     s += NUCS[j & 3];
00166     size_t cond = j >> 2;
00167     for (size_t k = 0; k < _order; ++k) {
00168       s = NUCS[cond & 3] + s;
00169       cond = cond >> 2;
00170     }
00171     outfile << s << '\t';
00172   }
00173   outfile << endl;
00174 
00175   for (size_t i = 0; i < _order+1; i++) {
00176     outfile << i+1 << ":\t";
00177     for (size_t j = 0; j <  pow((double)NUM_NUCS, (double)_order+1); j++) {
00178       outfile << scientific << sexp(_expected.transition_prob(i,j>>2,j&3))
00179               << "\t";
00180     }
00181     outfile<<endl;
00182   }
00183 }
00184 
00185 BiasBoss::BiasBoss(size_t order, double alpha)
00186     : _order(order),
00187       _5_seq_bias(WINDOW, order, alpha),
00188       _3_seq_bias(WINDOW, order, alpha){
00189 }
00190 
00191 BiasBoss::BiasBoss(size_t order, string param_file_name)
00192     : _order(order),
00193       _5_seq_bias(WINDOW, order, param_file_name, ">5"),
00194       _3_seq_bias(WINDOW, order, param_file_name, ">3") {
00195 }
00196 
00197 void BiasBoss::copy_observations(const BiasBoss& other) {
00198   _5_seq_bias.copy_observed(other._5_seq_bias);
00199   _3_seq_bias.copy_observed(other._3_seq_bias);
00200 }
00201 
00202 void BiasBoss::copy_expectations(const BiasBoss& other) {
00203   _5_seq_bias.copy_expected(other._5_seq_bias);
00204   _3_seq_bias.copy_expected(other._3_seq_bias);
00205 }
00206 
00207 void BiasBoss::update_expectations(const Target& targ, double mass,
00208                                    const vector<double>& fl_cdf) {
00209   if (mass == LOG_0) {
00210     return;
00211   }
00212 
00213   if (direction != R) {
00214     const Sequence& seq_fwd = targ.seq(0);
00215     _5_seq_bias.increment_expected(seq_fwd, mass, fl_cdf);
00216   }
00217   if (direction != F) {
00218     const Sequence& seq_rev = targ.seq(1);
00219     _3_seq_bias.increment_expected(seq_rev, mass, fl_cdf);
00220   }
00221 }
00222 
00223 void BiasBoss::normalize_expectations() {
00224   _5_seq_bias.normalize_expected();
00225   _3_seq_bias.normalize_expected();
00226 }
00227 
00228 void BiasBoss::update_observed(const FragHit& hit, double normalized_mass)
00229 {
00230   assert (hit.pair_status() != PAIRED || (int)hit.length() > WINDOW);
00231 
00232   const Sequence& t_seq_fwd = hit.target()->seq(0);
00233   const Sequence& t_seq_rev = hit.target()->seq(1);
00234 
00235   if (hit.pair_status() != RIGHT_ONLY) {
00236     _5_seq_bias.increment_observed(t_seq_fwd, hit.left(), normalized_mass);
00237   }
00238   if (hit.pair_status() != LEFT_ONLY) {
00239     _3_seq_bias.increment_observed(t_seq_rev, t_seq_rev.length()-hit.right(),
00240                                    normalized_mass);
00241   }
00242 }
00243 
00244 double BiasBoss::get_target_bias(std::vector<float>& start_bias,
00245                                  std::vector<float>& end_bias,
00246                                  const Target& targ) const {
00247   double tot_start = LOG_0;
00248   double tot_end = LOG_0;
00249 
00250   const Sequence& t_seq_fwd = targ.seq(0);
00251   const Sequence& t_seq_rev = targ.seq(1);
00252 
00253   for (size_t i = 0; i < targ.length(); ++i) {
00254     start_bias[i] = _5_seq_bias.get_weight(t_seq_fwd, i);
00255     end_bias[targ.length()-i-1] = _3_seq_bias.get_weight(t_seq_rev, i);
00256     tot_start = log_add(tot_start, start_bias[i]);
00257     tot_end = log_add(tot_end, end_bias[i]);
00258   }
00259 
00260   double avg_bias = (tot_start + tot_end) - (2*log((double)targ.length()));
00261   assert(!isnan(avg_bias));
00262   return avg_bias;
00263 }
00264 
00265 void BiasBoss::append_output(ofstream& outfile) const {
00266   outfile << ">5' Sequence-Specific Bias\n";
00267   _5_seq_bias.append_output(outfile);
00268   outfile << ">3' Sequence-Specific Bias\n";
00269   _3_seq_bias.append_output(outfile);
00270 }
 All Classes Functions Variables