eXpress “1.5”
|
00001 00010 #include <algorithm> 00011 #include <boost/assign.hpp> 00012 #include <cassert> 00013 00014 #include "biascorrection.h" 00015 00016 #include "fragments.h" 00017 #include "frequencymatrix.h" 00018 #include "main.h" 00019 #include "sequence.h" 00020 #include "targets.h" 00021 00022 using namespace std; 00023 00024 // size of bias window 00025 const int WINDOW = 21; 00026 // center position in window 00027 const int CENTER = 11; 00028 // number of bases on either side of center 00029 const int SURROUND = 10; 00030 00031 SeqWeightTable::SeqWeightTable(size_t window_size, size_t order, double alpha) 00032 : _order(order), 00033 _observed(order, window_size, window_size, alpha), 00034 _expected(order, window_size, order+1, EPSILON) { 00035 } 00036 00037 SeqWeightTable::SeqWeightTable(size_t window_size, size_t order, 00038 string param_file_name, string identifier) 00039 : _order(order), 00040 _observed(order, window_size, window_size, 0), 00041 _expected(order, window_size, order+1, 0) { 00042 00043 //TODO: Allow for orders and window sizes to be read from param file. 00044 00045 ifstream infile (param_file_name.c_str()); 00046 const size_t BUFF_SIZE = 99999; 00047 char line_buff[BUFF_SIZE]; 00048 if (!infile.is_open()) { 00049 logger.severe("Unable to open parameter file '%s'.", 00050 param_file_name.c_str()); 00051 } 00052 00053 do { 00054 infile.getline (line_buff, BUFF_SIZE, '\n'); 00055 } while (strncmp(line_buff, identifier.c_str(), identifier.size())); 00056 infile.getline (line_buff, BUFF_SIZE, '\n'); 00057 00058 do { 00059 infile.getline (line_buff, BUFF_SIZE, '\n'); 00060 } while (strcmp(line_buff, "\tObserved Conditional Probabilities")); 00061 infile.getline (line_buff, BUFF_SIZE, '\n'); 00062 00063 //FG 00064 for (size_t k = 0; k < (size_t)WINDOW; ++k) { 00065 infile.getline (line_buff, BUFF_SIZE, '\n'); 00066 char *p = strtok(line_buff, "\t"); 00067 for (size_t i=0; i < pow(4.0, (double)min(order, k)); ++i) { 00068 for (size_t j=0; j < NUM_NUCS; ++j) { 00069 p = strtok(NULL, "\t"); 00070 _observed.update(k, i, j, log(strtod(p,NULL))); 00071 } 00072 } 00073 } 00074 00075 infile.getline (line_buff, BUFF_SIZE, '\n'); 00076 infile.getline (line_buff, BUFF_SIZE, '\n'); 00077 //BG 00078 for (size_t k = 0; k < order+1; ++k) { 00079 infile.getline (line_buff, BUFF_SIZE, '\n'); 00080 char *p = strtok(line_buff, "\t"); 00081 for (size_t i=0; i < pow(4.0, (double)min(order, k)); ++i) { 00082 for (size_t j=0; j < NUM_NUCS; ++j) { 00083 p = strtok(NULL, "\t"); 00084 _expected.update(k, i, j, log(strtod(p,NULL))); 00085 } 00086 } 00087 } 00088 } 00089 00090 00091 void SeqWeightTable::copy_observed(const SeqWeightTable& other) { 00092 _observed = other._observed; 00093 } 00094 00095 void SeqWeightTable::copy_expected(const SeqWeightTable& other) { 00096 _expected = other._expected; 00097 } 00098 00099 void SeqWeightTable::increment_expected(const Sequence& seq, double mass, 00100 const vector<double>& fl_cdf) { 00101 _expected.fast_learn(seq, mass, fl_cdf); 00102 } 00103 00104 void SeqWeightTable::normalize_expected() { 00105 _expected.calc_marginals(); 00106 } 00107 00108 void SeqWeightTable::increment_observed(const Sequence& seq, size_t i, 00109 double mass) { 00110 int left = (int)i - SURROUND; 00111 _observed.update(seq, left, mass); 00112 } 00113 00114 double SeqWeightTable::get_weight(const Sequence& seq, size_t i) const { 00115 int left = (int)i - SURROUND; 00116 return _observed.seq_prob(seq, left) - _expected.seq_prob(seq, left); 00117 } 00118 00119 void SeqWeightTable::append_output(ofstream& outfile) const { 00120 char buff[200]; 00121 string header = ""; 00122 for (int i = 0; i < WINDOW; i++) { 00123 sprintf(buff, "\t%d", i-CENTER+1); 00124 header += buff; 00125 } 00126 header += '\n'; 00127 00128 outfile << "\tObserved Marginal Distribution\n" << header; 00129 00130 for (size_t j = 0; j < NUM_NUCS; j++) { 00131 outfile << NUCS[j] << ":\t"; 00132 for (int i = 0; i < WINDOW; i++) { 00133 outfile << scientific << sexp(_observed.marginal_prob(i,j)) << "\t"; 00134 } 00135 outfile<<endl; 00136 } 00137 00138 outfile << "\tObserved Conditional Probabilities\nPosition\t"; 00139 00140 for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) { 00141 string s = "->"; 00142 s += NUCS[j & 3]; 00143 size_t cond = j >> 2; 00144 for (size_t k = 0; k < _order; ++k) { 00145 s = NUCS[cond & 3] + s; 00146 cond = cond >> 2; 00147 } 00148 outfile << s << '\t'; 00149 } 00150 outfile << endl; 00151 00152 for (int i = 0; i < WINDOW; i++) { 00153 outfile << i-SURROUND << ":\t"; 00154 for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) { 00155 outfile << scientific << sexp(_observed.transition_prob(i,j>>2,j&3)) 00156 << "\t"; 00157 } 00158 outfile<<endl; 00159 } 00160 00161 outfile << "\tBackground Conditional Probabilities\nPosition\t"; 00162 00163 for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) { 00164 string s = "->"; 00165 s += NUCS[j & 3]; 00166 size_t cond = j >> 2; 00167 for (size_t k = 0; k < _order; ++k) { 00168 s = NUCS[cond & 3] + s; 00169 cond = cond >> 2; 00170 } 00171 outfile << s << '\t'; 00172 } 00173 outfile << endl; 00174 00175 for (size_t i = 0; i < _order+1; i++) { 00176 outfile << i+1 << ":\t"; 00177 for (size_t j = 0; j < pow((double)NUM_NUCS, (double)_order+1); j++) { 00178 outfile << scientific << sexp(_expected.transition_prob(i,j>>2,j&3)) 00179 << "\t"; 00180 } 00181 outfile<<endl; 00182 } 00183 } 00184 00185 BiasBoss::BiasBoss(size_t order, double alpha) 00186 : _order(order), 00187 _5_seq_bias(WINDOW, order, alpha), 00188 _3_seq_bias(WINDOW, order, alpha){ 00189 } 00190 00191 BiasBoss::BiasBoss(size_t order, string param_file_name) 00192 : _order(order), 00193 _5_seq_bias(WINDOW, order, param_file_name, ">5"), 00194 _3_seq_bias(WINDOW, order, param_file_name, ">3") { 00195 } 00196 00197 void BiasBoss::copy_observations(const BiasBoss& other) { 00198 _5_seq_bias.copy_observed(other._5_seq_bias); 00199 _3_seq_bias.copy_observed(other._3_seq_bias); 00200 } 00201 00202 void BiasBoss::copy_expectations(const BiasBoss& other) { 00203 _5_seq_bias.copy_expected(other._5_seq_bias); 00204 _3_seq_bias.copy_expected(other._3_seq_bias); 00205 } 00206 00207 void BiasBoss::update_expectations(const Target& targ, double mass, 00208 const vector<double>& fl_cdf) { 00209 if (mass == LOG_0) { 00210 return; 00211 } 00212 00213 if (direction != R) { 00214 const Sequence& seq_fwd = targ.seq(0); 00215 _5_seq_bias.increment_expected(seq_fwd, mass, fl_cdf); 00216 } 00217 if (direction != F) { 00218 const Sequence& seq_rev = targ.seq(1); 00219 _3_seq_bias.increment_expected(seq_rev, mass, fl_cdf); 00220 } 00221 } 00222 00223 void BiasBoss::normalize_expectations() { 00224 _5_seq_bias.normalize_expected(); 00225 _3_seq_bias.normalize_expected(); 00226 } 00227 00228 void BiasBoss::update_observed(const FragHit& hit, double normalized_mass) 00229 { 00230 assert (hit.pair_status() != PAIRED || (int)hit.length() > WINDOW); 00231 00232 const Sequence& t_seq_fwd = hit.target()->seq(0); 00233 const Sequence& t_seq_rev = hit.target()->seq(1); 00234 00235 if (hit.pair_status() != RIGHT_ONLY) { 00236 _5_seq_bias.increment_observed(t_seq_fwd, hit.left(), normalized_mass); 00237 } 00238 if (hit.pair_status() != LEFT_ONLY) { 00239 _3_seq_bias.increment_observed(t_seq_rev, t_seq_rev.length()-hit.right(), 00240 normalized_mass); 00241 } 00242 } 00243 00244 double BiasBoss::get_target_bias(std::vector<float>& start_bias, 00245 std::vector<float>& end_bias, 00246 const Target& targ) const { 00247 double tot_start = LOG_0; 00248 double tot_end = LOG_0; 00249 00250 const Sequence& t_seq_fwd = targ.seq(0); 00251 const Sequence& t_seq_rev = targ.seq(1); 00252 00253 for (size_t i = 0; i < targ.length(); ++i) { 00254 start_bias[i] = _5_seq_bias.get_weight(t_seq_fwd, i); 00255 end_bias[targ.length()-i-1] = _3_seq_bias.get_weight(t_seq_rev, i); 00256 tot_start = log_add(tot_start, start_bias[i]); 00257 tot_end = log_add(tot_end, end_bias[i]); 00258 } 00259 00260 double avg_bias = (tot_start + tot_end) - (2*log((double)targ.length())); 00261 assert(!isnan(avg_bias)); 00262 return avg_bias; 00263 } 00264 00265 void BiasBoss::append_output(ofstream& outfile) const { 00266 outfile << ">5' Sequence-Specific Bias\n"; 00267 _5_seq_bias.append_output(outfile); 00268 outfile << ">3' Sequence-Specific Bias\n"; 00269 _3_seq_bias.append_output(outfile); 00270 }