eXpress “1.5”

src/sequence.cpp

00001 //
00002 //  sequence.cpp
00003 //  express
00004 //
00005 //  Created by Adam Roberts on 1/24/12.
00006 //  Copyright 2012 Adam Roberts. All rights reserved.
00007 //
00008 
00009 #include "sequence.h"
00010 #include <cassert>
00011 #include <boost/math/distributions/binomial.hpp>
00012 
00013 using namespace std;
00014 using namespace boost::math;
00015 
00016 string Sequence::serialize() {
00017   vector<char> seq;
00018   for (size_t i = 0; i < length(); i++) {
00019     if (i/4 == seq.size()) {
00020       seq.push_back(0);
00021     }
00022     seq.back() += operator[](i) << (2 * (i % 4));
00023   }
00024   
00025   return string(seq.begin(), seq.end());
00026 }
00027 
00028 SequenceFwd::SequenceFwd():  _ref_seq(NULL), _prob(0), _len(0) {}
00029 
00030 SequenceFwd::SequenceFwd(const std::string& seq, bool rev, bool prob)
00031     : _prob(prob), _len(seq.length()) {
00032   if (prob) {
00033     _est_seq = FrequencyMatrix<float>(seq.length(), NUM_NUCS, 0.001);
00034     _obs_seq = FrequencyMatrix<float>(seq.length(), NUM_NUCS, LOG_0);
00035     _exp_seq = FrequencyMatrix<float>(seq.length(), NUM_NUCS, LOG_0);
00036   }
00037   set(seq, rev);
00038 }
00039 
00040 SequenceFwd::SequenceFwd(const SequenceFwd& other)
00041     : _obs_seq(other._obs_seq), _exp_seq(other._exp_seq),
00042       _prob(other._prob), _len(other.length()) {
00043   if (other._ref_seq) {
00044     char* ref_seq = new char[_len];
00045     std::copy(other._ref_seq.get(), other._ref_seq.get() + _len, ref_seq);
00046     _ref_seq.reset(ref_seq);
00047   }
00048 }
00049 
00050 SequenceFwd& SequenceFwd::operator=(const SequenceFwd& other) {
00051   if (other._ref_seq) {
00052     _len = other.length();
00053     char* ref_seq = new char[_len];
00054     std::copy(other._ref_seq.get(), other._ref_seq.get() + _len, ref_seq);
00055     _ref_seq.reset(ref_seq);
00056     _obs_seq = other._obs_seq;
00057     _exp_seq = other._exp_seq;
00058     _prob = other._prob;
00059   }
00060   return *this;
00061 }
00062 
00063 void SequenceFwd::set(const std::string& seq, bool rev) {
00064   char* ref_seq = new char[seq.length()];
00065   for (size_t i = 0; i < seq.length(); i++) {
00066     ref_seq[i] = (rev) ? complement(ctoi(seq[seq.length()-1-i])) : ctoi(seq[i]);
00067     if (_prob) {
00068       _est_seq.increment(i, ref_seq[i], log((float)2));
00069     }
00070   }
00071   _ref_seq.reset(ref_seq);
00072   _len = seq.length();
00073 }
00074 
00075 size_t SequenceFwd::operator[](const size_t index) const {
00076   assert(index < _len);
00077   if (_prob) {
00078     return _est_seq.argmax(index);
00079   }
00080   return _ref_seq[index];
00081 }
00082 
00083 size_t SequenceFwd::get_ref(const size_t index) const {
00084   assert(index < _len);
00085   //assert(_ref_seq[index] == operator[](index));
00086   return _ref_seq[index];
00087 }
00088 
00089 float SequenceFwd::get_prob(const size_t index, const size_t nuc) const {
00090   assert(_prob);
00091   return _est_seq(index, nuc);
00092 }
00093 
00094 float SequenceFwd::get_obs(const size_t index, const size_t nuc) const {
00095   assert(index < _len);
00096   return _obs_seq(index,nuc, false);
00097 }
00098 
00099 float SequenceFwd::get_exp(const size_t index, const size_t nuc) const {
00100   assert(index < _len);
00101   return _exp_seq(index,nuc, false);
00102 }
00103 
00104 void SequenceFwd::update_est(const size_t index, const size_t nuc, float mass) {
00105   assert(_prob);
00106   _est_seq.increment(index, nuc, mass);
00107 }
00108 
00109 void SequenceFwd::update_obs(const size_t index, const size_t nuc, float mass) {
00110   assert(_prob);
00111   _obs_seq.increment(index, nuc, mass);
00112 }
00113 
00114 void SequenceFwd::update_exp(const size_t index, const size_t nuc, float mass) {
00115   assert(_prob);
00116   _exp_seq.increment(index, nuc, mass);
00117 }
00118 
00119 void SequenceFwd::calc_p_vals(vector<double>& p_vals) const {
00120   p_vals = vector<double>(_len, 1.0);
00121   for (size_t i = 0; i < _len; ++i) {
00122     double N = round(sexp(_obs_seq.sum(i)));
00123     if (N<5) {
00124       continue;
00125     }
00126     size_t ref_nuc = get_ref(i);
00127     double max_obs = 0;
00128     for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc) {
00129       if (nuc == ref_nuc) {
00130         continue;
00131       }
00132 
00133       double obs_n = round(sexp(_obs_seq(i,nuc,false)));
00134       max_obs = max(max_obs, obs_n);
00135     }
00136     
00137     double p_val = 0;
00138 
00139     for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc) {
00140       if (nuc == ref_nuc) {
00141         continue;
00142       }
00143 
00144       double exp_p = sexp(_exp_seq(i, nuc));
00145       binomial binom(N, exp_p);
00146       p_val += log(cdf(binom, max_obs));
00147     }
00148     p_vals[i] -= sexp(p_val);
00149   }
00150 }
00151 
00152 void SequenceRev::calc_p_vals(vector<double>& p_vals) const
00153 {
00154   vector<double> temp;
00155   _seq->calc_p_vals(temp);
00156   p_vals = vector<double>(length());
00157   for(size_t i = 0; i < length(); i++) {
00158     p_vals[i] = temp[length()-i-1];
00159   }
00160 }
00161 
00162 /*
00163 void SequenceFwd::calc_p_vals(vector<double>& p_vals) const
00164 {
00165     p_vals = vector<double>(_len, 1);
00166     for (size_t i = 0; i < _len; ++i)
00167     {
00168         double N = sexp(_obs_seq.sum(i));
00169 
00170         if (N==0)
00171             continue;
00172 
00173         size_t ref_nuc = get_ref(i);
00174         double p_val = LOG_0;
00175 
00176         vector<double> cdfs(4);
00177         for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc)
00178         {
00179             if (nuc == ref_nuc)
00180                 continue;
00181 
00182             double p = sexp(_exp_seq(i,nuc));
00183             double obs_n = sexp(_obs_seq(i,nuc,false));
00184             normal norm (N*p, sqrt(N*p*(1-p)));
00185             cdfs[nuc] = cdf(norm, obs_n);
00186         }
00187 
00188         for (size_t nuc1 = 0; nuc1 < NUM_NUCS; ++nuc1)
00189         {
00190             if (nuc1 == ref_nuc)
00191                 continue;
00192 
00193             double term = log(1-cdfs[nuc1]);
00194 
00195             for(size_t nuc2 = 0; nuc2 < NUM_NUCS; ++nuc2)
00196             {
00197                 if (nuc2 == nuc1 || nuc2 == ref_nuc)
00198                     continue;
00199                 term += log(cdfs[nuc2]);
00200             }
00201             p_val = log_add(p_val, term);
00202         }
00203 
00204         p_vals[i] = sexp(p_val);
00205     }
00206 }
00207 */
00208 
00209 /*
00210 void SequenceFwd::calc_p_vals(vector<double>& p_vals) const
00211 {
00212     p_vals = vector<double>(_len, 1.0);
00213     boost::math::chi_squared_distribution<double> chisq(3);
00214     double obs_n,exp_n;
00215     for (size_t i = 0; i < _len; ++i)
00216     {
00217         if (_obs_seq.sum(i)==LOG_0)
00218             continue;
00219 
00220         double S = 0;
00221         for (size_t nuc = 0; nuc < NUM_NUCS; ++nuc)
00222         {
00223             obs_n = sexp(_obs_seq(i,nuc,false));
00224             exp_n = sexp(_exp_seq(i,nuc,false));
00225             S += (obs_n-exp_n)*(obs_n-exp_n)/exp_n;
00226         }
00227 
00228         p_vals[i] -= boost::math::cdf(chisq,S);
00229     }
00230 }
00231  */
 All Classes Functions Variables