eXpress “1.5”

src/markovmodel.cpp

00001 
00009 #include "markovmodel.h"
00010 #include "frequencymatrix.h"
00011 #include "sequence.h"
00012 #include "main.h"
00013 
00014 using namespace std;
00015 
00016 MarkovModel::MarkovModel(size_t order, size_t window_size,
00017                          size_t num_pos, double alpha)
00018     : _order((int)order),
00019       _window_size((int)window_size),
00020       _num_pos((int)num_pos),
00021       _params(num_pos, FrequencyMatrix<double>((size_t)pow((double)NUM_NUCS,
00022                                                            (double)order),
00023                                                NUM_NUCS, alpha)),
00024       _bitclear((1<<(2*order))-1) {
00025 }
00026 
00027 size_t MarkovModel::get_indices(const Sequence& seq, int left, vector<char>& indices) {
00028   assert(_order <= 3);
00029   int i = 0;
00030   int j = left;
00031   int seq_len = (int)seq.length();
00032   
00033   size_t cond = 0;
00034   
00035   if (left != 0 && left < _order) {
00036     i = _order-left;
00037     for (j=0; j < _order; j++) {
00038       cond = (cond << 2) + seq[j];
00039     }
00040   }
00041   
00042   indices = vector<char>(_window_size - i, -1);
00043   size_t start_index = i;
00044   
00045   while (i < _window_size && j < seq_len) {
00046     size_t index = min(i, _num_pos-1) - start_index;
00047     size_t curr = seq[j];
00048     cond = (cond << 2) + curr;
00049     indices[index] = (char)cond;
00050     cond &= _bitclear;
00051     i++;
00052     j++;
00053   }
00054   return start_index;
00055 }
00056 
00057 void MarkovModel::update(const Sequence& seq, int left, double mass) {
00058   int i = 0;
00059   int j = left;
00060   int seq_len = (int)seq.length();
00061 
00062   size_t cond = 0;
00063   
00064   if (left != 0 && left < _order) {
00065     i = _order-left;
00066     for (j=0; j < _order; j++) {
00067       cond = (cond << 2) + seq[j];
00068     }
00069   }
00070 
00071   while (i < _window_size && j < seq_len) {
00072     size_t index = min(i, _num_pos-1);
00073     size_t curr = seq[j];
00074     //if (seq.prob()) {
00075     //  for (size_t nuc = 0; nuc < NUM_NUCS; nuc++) {
00076     //    _params[index].increment(cond, nuc, seq.get_prob(j, nuc) + mass);
00077     //  }
00078     //} else {
00079       _params[index].increment(cond, curr, mass);
00080     //}
00081     cond = (cond << 2) + curr;
00082     cond &= _bitclear;
00083     i++;
00084     j++;
00085   }
00086 }
00087 
00088 void MarkovModel::update(size_t p, size_t i, size_t j, double mass) {
00089   _params[p].increment(i, j, mass);
00090 }
00091 
00092 vector<char> MarkovModel::get_indices(const Sequence& seq) {
00093   vector<char> indices(seq.length() - _order, -1);
00094   
00095   if (seq.length() < (size_t)_order) {
00096     return indices;
00097   }
00098   
00099   size_t cond = 0;
00100   for (int i = 0; i < _order; ++i) {
00101     cond = (cond << 2) + seq[i];
00102   }
00103   
00104   for (size_t i = _order; i < seq.length(); ++i) {
00105     size_t curr = seq[i];
00106     cond = (cond << 2) + curr;
00107     indices[i-_order] = (char)cond;
00108     cond &= _bitclear;
00109   }
00110   return indices;
00111 }
00112 
00113 void MarkovModel::fast_learn(const Sequence& seq, double mass,
00114                              const vector<double>& fl_cmf) {
00115   assert(_num_pos==_order+1);
00116   if (seq.length() < (size_t)_order) {
00117     return;
00118   }
00119 
00120   size_t cond = 0;
00121   for (int i = 0; i < _order; ++i) {
00122      cond = (cond << 2) + seq[i];
00123   }
00124 
00125   for (size_t i = _order; i < seq.length(); ++i) {
00126     size_t curr = seq[i];
00127 
00128     double mass_i = mass;
00129     if (seq.length()-i < fl_cmf.size()) {
00130       mass_i += fl_cmf[seq.length()-i];
00131     }
00132 
00133     if (seq.prob()) {
00134       for (size_t nuc = 0; nuc < NUM_NUCS; nuc++) {
00135         _params[_order].increment(cond, nuc, seq.get_prob(i, nuc) + mass_i);
00136       }
00137     } else {
00138       _params[_order].increment(cond, curr, mass_i);
00139     }
00140     cond = (cond << 2) + curr;
00141     cond &= _bitclear;
00142   }
00143 }
00144 
00145 void MarkovModel::calc_marginals() {
00146   assert(_num_pos==_order+1);
00147   for (int i = 0; i < _order; ++i) {
00148     for (size_t cond = 0; cond < pow((double)NUM_NUCS,
00149                                      (double)(_order)); cond++) {
00150       for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc) {
00151         _params[i].increment(cond & ((1 << (2*i))-1) , nuc,
00152                              _params[_order]((cond << 2) + nuc, false));
00153       }
00154     }
00155   }
00156 }
00157 
00158 double MarkovModel::transition_prob(size_t p, size_t cond, size_t curr) const {
00159   assert(p < _params.size());
00160   return _params[p](cond, curr);
00161 }
00162 
00163 double MarkovModel::seq_prob(const Sequence& seq, int left) const {
00164   int i = 0;
00165   int j = left;
00166   int seq_len = (int)seq.length();
00167 
00168   size_t cond = 0;
00169   double v = 0;
00170 
00171   if (left < _order) {
00172     i = _order-left;
00173     for (j=0; j < min(_order, i); j++) {
00174       cond = (cond << 2) + seq[j];
00175     }
00176     v = i*LOG_QUARTER;
00177   }
00178 
00179   while (i < _window_size && j < seq_len) {
00180     size_t index = min(i, _num_pos -1);
00181     size_t curr = seq[j];
00182     if (seq.prob()) {
00183       double prob = LOG_0;
00184       for (size_t nuc = 0; nuc < NUM_NUCS; nuc++) {
00185         assert(!isnan(seq.get_prob(j, nuc)));
00186         assert(!isnan(_params[index](cond, nuc)));
00187         prob = log_add(prob, seq.get_prob(j, nuc) + _params[index](cond, nuc));
00188       }
00189       v += prob;
00190     } else {
00191       assert(!isnan(_params[index](cond, curr)));
00192       v += _params[index](cond, curr);
00193     }
00194     cond = (cond << 2) + curr;
00195     cond &= _bitclear;
00196 
00197     i++;
00198     j++;
00199   }
00200   assert(!isnan(v));
00201   return v;
00202 }
00203 
00204 double MarkovModel::marginal_prob(size_t w, size_t nuc) const {
00205   assert(w < _params.size());
00206   double marg = LOG_0;
00207   double tot = LOG_0;
00208   for (size_t cond = 0; cond < pow((double)NUM_NUCS,
00209                                    (double)(_order)); cond++) {
00210     marg = log_add(marg, _params[w]((cond << 2) + nuc, false));
00211     tot = log_add(tot, _params[w].sum(cond));
00212   }
00213   return marg-tot;
00214 }
 All Classes Functions Variables