eXpress “1.5”
|
00001 00009 #include "markovmodel.h" 00010 #include "frequencymatrix.h" 00011 #include "sequence.h" 00012 #include "main.h" 00013 00014 using namespace std; 00015 00016 MarkovModel::MarkovModel(size_t order, size_t window_size, 00017 size_t num_pos, double alpha) 00018 : _order((int)order), 00019 _window_size((int)window_size), 00020 _num_pos((int)num_pos), 00021 _params(num_pos, FrequencyMatrix<double>((size_t)pow((double)NUM_NUCS, 00022 (double)order), 00023 NUM_NUCS, alpha)), 00024 _bitclear((1<<(2*order))-1) { 00025 } 00026 00027 size_t MarkovModel::get_indices(const Sequence& seq, int left, vector<char>& indices) { 00028 assert(_order <= 3); 00029 int i = 0; 00030 int j = left; 00031 int seq_len = (int)seq.length(); 00032 00033 size_t cond = 0; 00034 00035 if (left != 0 && left < _order) { 00036 i = _order-left; 00037 for (j=0; j < _order; j++) { 00038 cond = (cond << 2) + seq[j]; 00039 } 00040 } 00041 00042 indices = vector<char>(_window_size - i, -1); 00043 size_t start_index = i; 00044 00045 while (i < _window_size && j < seq_len) { 00046 size_t index = min(i, _num_pos-1) - start_index; 00047 size_t curr = seq[j]; 00048 cond = (cond << 2) + curr; 00049 indices[index] = (char)cond; 00050 cond &= _bitclear; 00051 i++; 00052 j++; 00053 } 00054 return start_index; 00055 } 00056 00057 void MarkovModel::update(const Sequence& seq, int left, double mass) { 00058 int i = 0; 00059 int j = left; 00060 int seq_len = (int)seq.length(); 00061 00062 size_t cond = 0; 00063 00064 if (left != 0 && left < _order) { 00065 i = _order-left; 00066 for (j=0; j < _order; j++) { 00067 cond = (cond << 2) + seq[j]; 00068 } 00069 } 00070 00071 while (i < _window_size && j < seq_len) { 00072 size_t index = min(i, _num_pos-1); 00073 size_t curr = seq[j]; 00074 //if (seq.prob()) { 00075 // for (size_t nuc = 0; nuc < NUM_NUCS; nuc++) { 00076 // _params[index].increment(cond, nuc, seq.get_prob(j, nuc) + mass); 00077 // } 00078 //} else { 00079 _params[index].increment(cond, curr, mass); 00080 //} 00081 cond = (cond << 2) + curr; 00082 cond &= _bitclear; 00083 i++; 00084 j++; 00085 } 00086 } 00087 00088 void MarkovModel::update(size_t p, size_t i, size_t j, double mass) { 00089 _params[p].increment(i, j, mass); 00090 } 00091 00092 vector<char> MarkovModel::get_indices(const Sequence& seq) { 00093 vector<char> indices(seq.length() - _order, -1); 00094 00095 if (seq.length() < (size_t)_order) { 00096 return indices; 00097 } 00098 00099 size_t cond = 0; 00100 for (int i = 0; i < _order; ++i) { 00101 cond = (cond << 2) + seq[i]; 00102 } 00103 00104 for (size_t i = _order; i < seq.length(); ++i) { 00105 size_t curr = seq[i]; 00106 cond = (cond << 2) + curr; 00107 indices[i-_order] = (char)cond; 00108 cond &= _bitclear; 00109 } 00110 return indices; 00111 } 00112 00113 void MarkovModel::fast_learn(const Sequence& seq, double mass, 00114 const vector<double>& fl_cmf) { 00115 assert(_num_pos==_order+1); 00116 if (seq.length() < (size_t)_order) { 00117 return; 00118 } 00119 00120 size_t cond = 0; 00121 for (int i = 0; i < _order; ++i) { 00122 cond = (cond << 2) + seq[i]; 00123 } 00124 00125 for (size_t i = _order; i < seq.length(); ++i) { 00126 size_t curr = seq[i]; 00127 00128 double mass_i = mass; 00129 if (seq.length()-i < fl_cmf.size()) { 00130 mass_i += fl_cmf[seq.length()-i]; 00131 } 00132 00133 if (seq.prob()) { 00134 for (size_t nuc = 0; nuc < NUM_NUCS; nuc++) { 00135 _params[_order].increment(cond, nuc, seq.get_prob(i, nuc) + mass_i); 00136 } 00137 } else { 00138 _params[_order].increment(cond, curr, mass_i); 00139 } 00140 cond = (cond << 2) + curr; 00141 cond &= _bitclear; 00142 } 00143 } 00144 00145 void MarkovModel::calc_marginals() { 00146 assert(_num_pos==_order+1); 00147 for (int i = 0; i < _order; ++i) { 00148 for (size_t cond = 0; cond < pow((double)NUM_NUCS, 00149 (double)(_order)); cond++) { 00150 for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc) { 00151 _params[i].increment(cond & ((1 << (2*i))-1) , nuc, 00152 _params[_order]((cond << 2) + nuc, false)); 00153 } 00154 } 00155 } 00156 } 00157 00158 double MarkovModel::transition_prob(size_t p, size_t cond, size_t curr) const { 00159 assert(p < _params.size()); 00160 return _params[p](cond, curr); 00161 } 00162 00163 double MarkovModel::seq_prob(const Sequence& seq, int left) const { 00164 int i = 0; 00165 int j = left; 00166 int seq_len = (int)seq.length(); 00167 00168 size_t cond = 0; 00169 double v = 0; 00170 00171 if (left < _order) { 00172 i = _order-left; 00173 for (j=0; j < min(_order, i); j++) { 00174 cond = (cond << 2) + seq[j]; 00175 } 00176 v = i*LOG_QUARTER; 00177 } 00178 00179 while (i < _window_size && j < seq_len) { 00180 size_t index = min(i, _num_pos -1); 00181 size_t curr = seq[j]; 00182 if (seq.prob()) { 00183 double prob = LOG_0; 00184 for (size_t nuc = 0; nuc < NUM_NUCS; nuc++) { 00185 assert(!isnan(seq.get_prob(j, nuc))); 00186 assert(!isnan(_params[index](cond, nuc))); 00187 prob = log_add(prob, seq.get_prob(j, nuc) + _params[index](cond, nuc)); 00188 } 00189 v += prob; 00190 } else { 00191 assert(!isnan(_params[index](cond, curr))); 00192 v += _params[index](cond, curr); 00193 } 00194 cond = (cond << 2) + curr; 00195 cond &= _bitclear; 00196 00197 i++; 00198 j++; 00199 } 00200 assert(!isnan(v)); 00201 return v; 00202 } 00203 00204 double MarkovModel::marginal_prob(size_t w, size_t nuc) const { 00205 assert(w < _params.size()); 00206 double marg = LOG_0; 00207 double tot = LOG_0; 00208 for (size_t cond = 0; cond < pow((double)NUM_NUCS, 00209 (double)(_order)); cond++) { 00210 marg = log_add(marg, _params[w]((cond << 2) + nuc, false)); 00211 tot = log_add(tot, _params[w].sum(cond)); 00212 } 00213 return marg-tot; 00214 }