eXpress “1.5”
|
00001 00009 #include "lengthdistribution.h" 00010 #include "main.h" 00011 #include <numeric> 00012 #include <boost/assign.hpp> 00013 #include <iostream> 00014 #include <fstream> 00015 #include <boost/math/distributions/binomial.hpp> 00016 #include <boost/math/distributions/normal.hpp> 00017 00018 using namespace std; 00019 00020 LengthDistribution::LengthDistribution(double alpha, size_t max_val, 00021 size_t prior_mu, size_t prior_sigma, 00022 size_t kernel_n, double kernel_p, 00023 size_t bin_size) 00024 : _hist(max_val/bin_size+1), 00025 _tot_mass(LOG_0), 00026 _sum(LOG_0), 00027 _min(max_val/bin_size), 00028 _bin_size(bin_size) { 00029 00030 max_val = max_val/bin_size; 00031 kernel_n = kernel_n/bin_size; 00032 assert(kernel_n % 2 == 0); 00033 00034 double tot = log(alpha); 00035 00036 // Set to prior distribution 00037 if (prior_mu) { 00038 boost::math::normal norm(prior_mu/bin_size, 00039 prior_sigma/(bin_size*bin_size)); 00040 00041 for (size_t i = 0; i <= max_val; ++i) { 00042 double norm_mass = boost::math::cdf(norm, i+0.5) - 00043 boost::math::cdf(norm, i-0.5); 00044 double mass = LOG_EPSILON; 00045 if (norm_mass != 0) { 00046 mass = tot + log(norm_mass); 00047 } 00048 _hist[i] = mass; 00049 _sum = log_add(_sum, log((double)i)+mass); 00050 _tot_mass = log_add(_tot_mass, mass); 00051 } 00052 } else { 00053 _hist = vector<double>(max_val + 1, tot - log((double)max_val)); 00054 _hist[0] = LOG_0; 00055 _sum = _hist[1] + log((double)(max_val * (max_val + 1))) - log(2.); 00056 _tot_mass = tot; 00057 } 00058 00059 // Define kernel 00060 boost::math::binomial_distribution<double> binom(kernel_n, kernel_p); 00061 _kernel = vector<double>(kernel_n + 1); 00062 for (size_t i = 0; i <= kernel_n; i++) { 00063 _kernel[i] = log(boost::math::pdf(binom, i)); 00064 } 00065 } 00066 00067 LengthDistribution::LengthDistribution(string param_file_name, 00068 string length_type) : 00069 _bin_size(1) { 00070 ifstream infile (param_file_name.c_str()); 00071 const size_t BUFF_SIZE = 99999; 00072 char line_buff[BUFF_SIZE]; 00073 00074 if (!infile.is_open()) { 00075 logger.severe("Unable to open paramater file '%s'.", 00076 param_file_name.c_str()); 00077 } 00078 00079 do { 00080 infile.getline (line_buff, BUFF_SIZE, '\n'); 00081 } while (strncmp(line_buff + 1, length_type.c_str(), length_type.size())); 00082 00083 infile.getline (line_buff, BUFF_SIZE, '\n'); 00084 char *p = strtok(line_buff, "\t"); 00085 size_t i = 0; 00086 00087 _tot_mass = 0; 00088 _sum = 0; 00089 do { 00090 double val = strtod(p,NULL); 00091 _hist.push_back(log(val)); 00092 _tot_mass += val; 00093 _sum += i*val; 00094 i++; 00095 p = strtok(NULL, "\t"); 00096 } while (p); 00097 00098 _tot_mass = log(_tot_mass); 00099 _sum = log(_sum); 00100 _min = max_val();; 00101 } 00102 00103 size_t LengthDistribution::max_val() const { 00104 return (_hist.size()-1) * _bin_size; 00105 } 00106 00107 size_t LengthDistribution::min_val() const { 00108 if (_min == _hist.size() - 1) { 00109 return 1; 00110 } 00111 return _min; 00112 } 00113 00114 void LengthDistribution::add_val(size_t len, double mass) { 00115 assert(!isnan(mass)); 00116 assert(_kernel.size()); 00117 00118 len /= _bin_size; 00119 00120 if (len > max_val()) { 00121 len = max_val(); 00122 } 00123 if (len < _min) { 00124 _min = len; 00125 } 00126 00127 size_t offset = len - _kernel.size()/2; 00128 00129 for (size_t i = 0; i < _kernel.size(); i++) { 00130 if (offset > 0 && offset < _hist.size()) { 00131 double k_mass = mass + _kernel[i]; 00132 _hist[offset] = log_add(_hist[offset], k_mass); 00133 _sum = log_add(_sum, log((double)offset)+k_mass); 00134 _tot_mass = log_add(_tot_mass, k_mass); 00135 } 00136 offset++; 00137 } 00138 } 00139 00140 double LengthDistribution::pmf(size_t len) const { 00141 len /= _bin_size; 00142 if (len > max_val()) { 00143 len = max_val(); 00144 } 00145 return _hist[len]-_tot_mass; 00146 } 00147 00148 double LengthDistribution::cmf(size_t len) const { 00149 double cum = LOG_0; 00150 vector<double> cdf(_hist.size()); 00151 for (size_t i = 0; i < _hist.size(); ++i) { 00152 cum = log_add(cum, _hist[i]); 00153 00154 } 00155 return cum - _tot_mass; 00156 } 00157 00158 vector<double> LengthDistribution::cmf() const { 00159 double cum = LOG_0; 00160 vector<double> cdf(_hist.size()); 00161 for (size_t i = 0; i < _hist.size(); ++i) { 00162 cum = log_add(cum, _hist[i]); 00163 cdf[i] = cum - _tot_mass; 00164 } 00165 assert(approx_eq(cum, _tot_mass)); 00166 00167 return cdf; 00168 } 00169 00170 double LengthDistribution::tot_mass() const { 00171 return _tot_mass; 00172 } 00173 00174 double LengthDistribution::mean() const { 00175 return _sum - tot_mass(); 00176 } 00177 00178 string LengthDistribution::to_string() const { 00179 string s = ""; 00180 char buffer[50]; 00181 for(size_t i = 0; i < _hist.size(); i++) { 00182 sprintf(buffer, "%e\t",sexp(pmf(i*_bin_size))); 00183 s += buffer; 00184 } 00185 s.erase(s.length()-1,1); 00186 return s; 00187 } 00188 00189 void LengthDistribution::append_output(ofstream& outfile, 00190 string length_type) const { 00191 outfile << ">" << length_type << " Length Distribution (0-" << max_val()*_bin_size; 00192 outfile << ")\n" << to_string() << endl; 00193 }