eXpress “1.5”

src/lengthdistribution.cpp

00001 
00009 #include "lengthdistribution.h"
00010 #include "main.h"
00011 #include <numeric>
00012 #include <boost/assign.hpp>
00013 #include <iostream>
00014 #include <fstream>
00015 #include <boost/math/distributions/binomial.hpp>
00016 #include <boost/math/distributions/normal.hpp>
00017 
00018 using namespace std;
00019 
00020 LengthDistribution::LengthDistribution(double alpha, size_t max_val,
00021                                        size_t prior_mu, size_t prior_sigma,
00022                                        size_t kernel_n, double kernel_p,
00023                                        size_t bin_size)
00024     : _hist(max_val/bin_size+1),
00025       _tot_mass(LOG_0),
00026       _sum(LOG_0),
00027       _min(max_val/bin_size),
00028       _bin_size(bin_size) {
00029   
00030   max_val = max_val/bin_size;
00031   kernel_n = kernel_n/bin_size;
00032   assert(kernel_n % 2 == 0);
00033         
00034   double tot = log(alpha);
00035 
00036   // Set to prior distribution
00037   if (prior_mu) {
00038     boost::math::normal norm(prior_mu/bin_size,
00039                              prior_sigma/(bin_size*bin_size));
00040 
00041     for (size_t i = 0; i <= max_val; ++i) {
00042       double norm_mass = boost::math::cdf(norm, i+0.5) -
00043                          boost::math::cdf(norm, i-0.5);
00044       double mass = LOG_EPSILON;
00045       if (norm_mass != 0) {
00046         mass = tot + log(norm_mass);
00047       }
00048       _hist[i] = mass;
00049       _sum = log_add(_sum, log((double)i)+mass);
00050       _tot_mass = log_add(_tot_mass, mass);
00051     }
00052   } else {
00053     _hist = vector<double>(max_val + 1, tot - log((double)max_val));
00054     _hist[0] = LOG_0;
00055     _sum = _hist[1] + log((double)(max_val * (max_val + 1))) - log(2.);
00056     _tot_mass = tot;
00057   }
00058 
00059   // Define kernel
00060   boost::math::binomial_distribution<double> binom(kernel_n, kernel_p);
00061   _kernel = vector<double>(kernel_n + 1);
00062   for (size_t i = 0; i <= kernel_n; i++) {
00063     _kernel[i] = log(boost::math::pdf(binom, i));
00064   }
00065 }
00066 
00067 LengthDistribution::LengthDistribution(string param_file_name,
00068                                        string length_type) :
00069     _bin_size(1) {
00070   ifstream infile (param_file_name.c_str());
00071   const size_t BUFF_SIZE = 99999;
00072   char line_buff[BUFF_SIZE];
00073   
00074   if (!infile.is_open()) {
00075     logger.severe("Unable to open paramater file '%s'.",
00076                   param_file_name.c_str());
00077   }
00078 
00079   do {
00080     infile.getline (line_buff, BUFF_SIZE, '\n');
00081   } while (strncmp(line_buff + 1, length_type.c_str(), length_type.size()));
00082 
00083   infile.getline (line_buff, BUFF_SIZE, '\n');
00084   char *p = strtok(line_buff, "\t");
00085   size_t i = 0;
00086   
00087   _tot_mass = 0;
00088   _sum = 0;
00089   do {
00090     double val = strtod(p,NULL);
00091     _hist.push_back(log(val));
00092     _tot_mass += val;
00093     _sum += i*val;
00094     i++;
00095     p = strtok(NULL, "\t");
00096   } while (p);
00097   
00098   _tot_mass = log(_tot_mass);
00099   _sum = log(_sum);
00100   _min = max_val();;
00101 }
00102 
00103 size_t LengthDistribution::max_val() const {
00104   return (_hist.size()-1) * _bin_size;
00105 }
00106 
00107 size_t LengthDistribution::min_val() const {
00108   if (_min == _hist.size() - 1) {
00109     return 1;
00110   }
00111   return _min;
00112 }
00113 
00114 void LengthDistribution::add_val(size_t len, double mass) {
00115   assert(!isnan(mass));
00116   assert(_kernel.size());
00117   
00118   len /= _bin_size;
00119 
00120   if (len > max_val()) {
00121       len = max_val();
00122   }
00123   if (len < _min) {
00124     _min = len;
00125   }
00126 
00127   size_t offset = len - _kernel.size()/2;
00128 
00129   for (size_t i = 0; i < _kernel.size(); i++) {
00130     if (offset > 0 && offset < _hist.size()) {
00131       double k_mass = mass + _kernel[i];
00132       _hist[offset] = log_add(_hist[offset], k_mass);
00133       _sum = log_add(_sum, log((double)offset)+k_mass);
00134       _tot_mass = log_add(_tot_mass, k_mass);
00135     }
00136     offset++;
00137   }
00138 }
00139 
00140 double LengthDistribution::pmf(size_t len) const {
00141   len /= _bin_size;
00142   if (len > max_val()) {
00143     len = max_val();
00144   }
00145   return _hist[len]-_tot_mass;
00146 }
00147 
00148 double LengthDistribution::cmf(size_t len) const {
00149   double cum = LOG_0;
00150   vector<double> cdf(_hist.size());
00151   for (size_t i = 0; i < _hist.size(); ++i) {
00152     cum = log_add(cum, _hist[i]);
00153     
00154   }
00155   return cum - _tot_mass;
00156 }
00157 
00158 vector<double> LengthDistribution::cmf() const {
00159   double cum = LOG_0;
00160   vector<double> cdf(_hist.size());
00161   for (size_t i = 0; i < _hist.size(); ++i) {
00162     cum = log_add(cum, _hist[i]);
00163     cdf[i] = cum - _tot_mass;
00164   }
00165   assert(approx_eq(cum, _tot_mass));
00166 
00167   return cdf;
00168 }
00169 
00170 double LengthDistribution::tot_mass() const {
00171   return _tot_mass;
00172 }
00173 
00174 double LengthDistribution::mean() const {
00175   return _sum - tot_mass();
00176 }
00177 
00178 string LengthDistribution::to_string() const {
00179   string s = "";
00180   char buffer[50];
00181   for(size_t i = 0; i < _hist.size(); i++) {
00182     sprintf(buffer, "%e\t",sexp(pmf(i*_bin_size)));
00183     s += buffer;
00184   }
00185   s.erase(s.length()-1,1);
00186   return s;
00187 }
00188 
00189 void LengthDistribution::append_output(ofstream& outfile,
00190                                        string length_type) const {
00191   outfile << ">" << length_type << " Length Distribution (0-" << max_val()*_bin_size;
00192   outfile << ")\n" << to_string() << endl;
00193 }
 All Classes Functions Variables