% Dirichlet process for samlping
clear all
datapath = './data/';
dataset = 'cerebellum';
resultpath = './results/';
figurepath = './figures/';
% load data
% first column is the count and
% second column is multiplicity, i.e. how many positions with that count
eval(sprintf('load %s%s; data.obs=%s(:,1)''; data.mul=%s(:,2)'';',...
datapath,dataset,dataset,dataset));
%
% Part 1 - Gibbs sampler for Dirichlet process model with parameters sigma and theta.
%
disp('Running part 1: Sampling parameters of Dirichlet process')
% set MCMC parameters
param.burnin = 20;
param.samples = 100;
% set interval of sigma values
sigmamin = -0.1;
sigmamax = 0.99;
sigmapoints = 1e4;
sigmad = (sigmamax-sigmamin)/(sigmapoints-1);
param.sigma = sigmamin:sigmad:sigmamax;
% set relative sigma prior (flat here)
param.sigmaprior = ones(size(param.sigma));
% set interval of theta values
thetamin = 1;
thetamax = 1e4;
thetapoints = 1e4;
thetad = (thetamax-thetamin)/(thetapoints-1);
param.theta = thetamin:thetad:thetamax;
% set relative theta prior (flat here)
param.thetaprior = ones(size(param.theta)); % relative prior over sigma
% draw sigma and theta samples using Gibbs sampling
samp = DPsample_sigma_theta(data,param);
%
% Part 2 - sample new tags using Dirichlet process sampling formula.
%
disp('Running part 2: Sampling tags')
% set cut-off
cutoff = 2;
% number of samples in data set
n = data.obs * data.mul';
% number of species
k = sum(data.mul) ;
% number of species meeting cutoff
kco = sum(data.mul(data.obs>=cutoff)) ;
% set size of new samples - default up to three times the original sample
m = n*(0.5:0.5:3);
% draw new tags using the sampling formula
res = DPsample_tags(data,samp,m,cutoff);
%
% Plot results
%
% plot sampled values and countour plot of loglikelihood surface
figure(1)
sigmad=max(samp.sigma)-min(samp.sigma);
sigmav=min(samp.sigma):sigmad/100:max(samp.sigma);
thetad=max(samp.theta)-min(samp.theta);
thetav=min(samp.theta):thetad/100:max(samp.theta);
[sigmac,thetac]=meshgrid(sigmav,thetav);
ll=DPloglikelihood(sigmac,thetac,data.obs,data.mul);
[C,h]=contour(sigmac,thetac,ll,30);
hold on
plot(samp.sigma,samp.theta,'x')
hold off
title(sprintf('%s log likelihood surface',dataset))
xlabel('\sigma')
ylabel('\theta')
bigfig
eval(sprintf('print -depsc %sloglikelihood_%s.eps',figurepath,dataset));
% plot expected coverages
figure(2)
EEcoverage = mean([ 1-(samp.theta+k*samp.sigma)./(n+samp.theta) res.Ecoverage]);
plot(n+[0 m],EEcoverage);
axis([0 n+m(end) EEcoverage(1) EEcoverage(end)]);
title(sprintf('%s expected coverage',dataset))
xlabel('sample size')
ylabel('coverage')
bigfig
eval(sprintf('print -depsc %scoverage_%s.eps',figurepath,dataset));
% plot expected number of species with 95% HPD intervals
figure(3)
EEpopsize = [ k mean(res.Epopsize)];
plot(n+[0 m],EEpopsize,n+m,res.popsizeHPDl,'--',n+m,res.popsizeHPDh,'--');
axis([0 n+m(end) 0 EEpopsize(end)]);
title(sprintf('%s expected number of species',dataset))
xlabel('sample size')
ylabel('k')
bigfig
eval(sprintf('print -depsc %spopsize_%s.eps',figurepath,dataset));
% plot expected number of species at cut-off+ with 95% HPD intervals
figure(4)
EEpopsizeco = [ kco mean(res.popsizeco)];
plot(n+[0 m],EEpopsizeco,n+m,res.popsizecoHPDl,'--',n+m,res.popsizecoHPDh,'--');
axis([0 n+m(end) 0 EEpopsizeco(end)]);
title(sprintf('%s expected number of species %d cutoff',dataset,cutoff))
xlabel('sample size')
ylabel('k_{co}')
bigfig
eval(sprintf('print -depsc %spopsizeco_%s.eps',figurepath,dataset));