eXpress “1.5”
|
00001 // 00002 // sequence.cpp 00003 // express 00004 // 00005 // Created by Adam Roberts on 1/24/12. 00006 // Copyright 2012 Adam Roberts. All rights reserved. 00007 // 00008 00009 #include "sequence.h" 00010 #include <cassert> 00011 #include <boost/math/distributions/binomial.hpp> 00012 00013 using namespace std; 00014 using namespace boost::math; 00015 00016 string Sequence::serialize() { 00017 vector<char> seq; 00018 for (size_t i = 0; i < length(); i++) { 00019 if (i/4 == seq.size()) { 00020 seq.push_back(0); 00021 } 00022 seq.back() += operator[](i) << (2 * (i % 4)); 00023 } 00024 00025 return string(seq.begin(), seq.end()); 00026 } 00027 00028 SequenceFwd::SequenceFwd(): _ref_seq(NULL), _prob(0), _len(0) {} 00029 00030 SequenceFwd::SequenceFwd(const std::string& seq, bool rev, bool prob) 00031 : _prob(prob), _len(seq.length()) { 00032 if (prob) { 00033 _est_seq = FrequencyMatrix<float>(seq.length(), NUM_NUCS, 0.001); 00034 _obs_seq = FrequencyMatrix<float>(seq.length(), NUM_NUCS, LOG_0); 00035 _exp_seq = FrequencyMatrix<float>(seq.length(), NUM_NUCS, LOG_0); 00036 } 00037 set(seq, rev); 00038 } 00039 00040 SequenceFwd::SequenceFwd(const SequenceFwd& other) 00041 : _obs_seq(other._obs_seq), _exp_seq(other._exp_seq), 00042 _prob(other._prob), _len(other.length()) { 00043 if (other._ref_seq) { 00044 char* ref_seq = new char[_len]; 00045 std::copy(other._ref_seq.get(), other._ref_seq.get() + _len, ref_seq); 00046 _ref_seq.reset(ref_seq); 00047 } 00048 } 00049 00050 SequenceFwd& SequenceFwd::operator=(const SequenceFwd& other) { 00051 if (other._ref_seq) { 00052 _len = other.length(); 00053 char* ref_seq = new char[_len]; 00054 std::copy(other._ref_seq.get(), other._ref_seq.get() + _len, ref_seq); 00055 _ref_seq.reset(ref_seq); 00056 _obs_seq = other._obs_seq; 00057 _exp_seq = other._exp_seq; 00058 _prob = other._prob; 00059 } 00060 return *this; 00061 } 00062 00063 void SequenceFwd::set(const std::string& seq, bool rev) { 00064 char* ref_seq = new char[seq.length()]; 00065 for (size_t i = 0; i < seq.length(); i++) { 00066 ref_seq[i] = (rev) ? complement(ctoi(seq[seq.length()-1-i])) : ctoi(seq[i]); 00067 if (_prob) { 00068 _est_seq.increment(i, ref_seq[i], log((float)2)); 00069 } 00070 } 00071 _ref_seq.reset(ref_seq); 00072 _len = seq.length(); 00073 } 00074 00075 size_t SequenceFwd::operator[](const size_t index) const { 00076 assert(index < _len); 00077 if (_prob) { 00078 return _est_seq.argmax(index); 00079 } 00080 return _ref_seq[index]; 00081 } 00082 00083 size_t SequenceFwd::get_ref(const size_t index) const { 00084 assert(index < _len); 00085 //assert(_ref_seq[index] == operator[](index)); 00086 return _ref_seq[index]; 00087 } 00088 00089 float SequenceFwd::get_prob(const size_t index, const size_t nuc) const { 00090 assert(_prob); 00091 return _est_seq(index, nuc); 00092 } 00093 00094 float SequenceFwd::get_obs(const size_t index, const size_t nuc) const { 00095 assert(index < _len); 00096 return _obs_seq(index,nuc, false); 00097 } 00098 00099 float SequenceFwd::get_exp(const size_t index, const size_t nuc) const { 00100 assert(index < _len); 00101 return _exp_seq(index,nuc, false); 00102 } 00103 00104 void SequenceFwd::update_est(const size_t index, const size_t nuc, float mass) { 00105 assert(_prob); 00106 _est_seq.increment(index, nuc, mass); 00107 } 00108 00109 void SequenceFwd::update_obs(const size_t index, const size_t nuc, float mass) { 00110 assert(_prob); 00111 _obs_seq.increment(index, nuc, mass); 00112 } 00113 00114 void SequenceFwd::update_exp(const size_t index, const size_t nuc, float mass) { 00115 assert(_prob); 00116 _exp_seq.increment(index, nuc, mass); 00117 } 00118 00119 void SequenceFwd::calc_p_vals(vector<double>& p_vals) const { 00120 p_vals = vector<double>(_len, 1.0); 00121 for (size_t i = 0; i < _len; ++i) { 00122 double N = round(sexp(_obs_seq.sum(i))); 00123 if (N<5) { 00124 continue; 00125 } 00126 size_t ref_nuc = get_ref(i); 00127 double max_obs = 0; 00128 for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc) { 00129 if (nuc == ref_nuc) { 00130 continue; 00131 } 00132 00133 double obs_n = round(sexp(_obs_seq(i,nuc,false))); 00134 max_obs = max(max_obs, obs_n); 00135 } 00136 00137 double p_val = 0; 00138 00139 for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc) { 00140 if (nuc == ref_nuc) { 00141 continue; 00142 } 00143 00144 double exp_p = sexp(_exp_seq(i, nuc)); 00145 binomial binom(N, exp_p); 00146 p_val += log(cdf(binom, max_obs)); 00147 } 00148 p_vals[i] -= sexp(p_val); 00149 } 00150 } 00151 00152 void SequenceRev::calc_p_vals(vector<double>& p_vals) const 00153 { 00154 vector<double> temp; 00155 _seq->calc_p_vals(temp); 00156 p_vals = vector<double>(length()); 00157 for(size_t i = 0; i < length(); i++) { 00158 p_vals[i] = temp[length()-i-1]; 00159 } 00160 } 00161 00162 /* 00163 void SequenceFwd::calc_p_vals(vector<double>& p_vals) const 00164 { 00165 p_vals = vector<double>(_len, 1); 00166 for (size_t i = 0; i < _len; ++i) 00167 { 00168 double N = sexp(_obs_seq.sum(i)); 00169 00170 if (N==0) 00171 continue; 00172 00173 size_t ref_nuc = get_ref(i); 00174 double p_val = LOG_0; 00175 00176 vector<double> cdfs(4); 00177 for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc) 00178 { 00179 if (nuc == ref_nuc) 00180 continue; 00181 00182 double p = sexp(_exp_seq(i,nuc)); 00183 double obs_n = sexp(_obs_seq(i,nuc,false)); 00184 normal norm (N*p, sqrt(N*p*(1-p))); 00185 cdfs[nuc] = cdf(norm, obs_n); 00186 } 00187 00188 for (size_t nuc1 = 0; nuc1 < NUM_NUCS; ++nuc1) 00189 { 00190 if (nuc1 == ref_nuc) 00191 continue; 00192 00193 double term = log(1-cdfs[nuc1]); 00194 00195 for(size_t nuc2 = 0; nuc2 < NUM_NUCS; ++nuc2) 00196 { 00197 if (nuc2 == nuc1 || nuc2 == ref_nuc) 00198 continue; 00199 term += log(cdfs[nuc2]); 00200 } 00201 p_val = log_add(p_val, term); 00202 } 00203 00204 p_vals[i] = sexp(p_val); 00205 } 00206 } 00207 */ 00208 00209 /* 00210 void SequenceFwd::calc_p_vals(vector<double>& p_vals) const 00211 { 00212 p_vals = vector<double>(_len, 1.0); 00213 boost::math::chi_squared_distribution<double> chisq(3); 00214 double obs_n,exp_n; 00215 for (size_t i = 0; i < _len; ++i) 00216 { 00217 if (_obs_seq.sum(i)==LOG_0) 00218 continue; 00219 00220 double S = 0; 00221 for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc) 00222 { 00223 obs_n = sexp(_obs_seq(i,nuc,false)); 00224 exp_n = sexp(_exp_seq(i,nuc,false)); 00225 S += (obs_n-exp_n)*(obs_n-exp_n)/exp_n; 00226 } 00227 00228 p_vals[i] -= boost::math::cdf(chisq,S); 00229 } 00230 } 00231 */