BranchSiteModel.h 19.9 KB
Newer Older
1
2
3
4
5
6

#ifndef BRANCHSITEMODEL_H
#define BRANCHSITEMODEL_H

#include <vector>
#include <cstdlib>
mvalle's avatar
mvalle committed
7
#include <cfloat>
8
9
10
#include "TransitionMatrix.h"
#include "ProbabilityMatrixSet.h"
#include "Forest.h"
11
#include "CmdLine.h"
12
#include "TreeAndSetsDependencies.h"
13

mvalle's avatar
mvalle committed
14
15
16
//// Value used for the LRT test. It is chisq(.95, df=1)/2
static const double THRESHOLD_FOR_LRT = 1.92072941034706202;

17
/// Common routines for testing the two hypothesis (H0 and H1).
18
19
20
21
22
23
24
///
///     @author Mario Valle - Swiss National Supercomputing Centre (CSCS)
///     @date 2010-12-23 (initial version)
///     @version 1.0
///
class BranchSiteModel
{
25
protected:
26
27
28
29
30
31
32
33
34
35
36
	/// Constructor.
	///
	/// @param[in] aForest The forest for which the maximum likelihood should be computed
	/// @param[in] aNumBranches Number of tree branches
	/// @param[in] aNumSites Number of sites
	/// @param[in] aSeed Random number generator seed
	/// @param[in] aNumVariables Number of extra variables (k, w0, w2, p0, p1)
	/// @param[in] aOnlyInitialStep Compute only the first step, no optimization involved
	/// @param[in] aTrace If set print a trace of the maximization process
	/// @param[in] aOptAlgo Maximization algorithm to be used
	/// @param[in] aDeltaValueForGradient The variable increment to compute gradient
mvalle's avatar
mvalle committed
37
	/// @param[in] aRelativeError Relative error for convergence
mvalle's avatar
mvalle committed
38
	/// @param[in] aNoParallel If true no parallel execution support is setup
39
	/// @param[in] aVerbose The verbosity level
mvalle's avatar
mvalle committed
40
	/// @param[in] aExtraDebug Extra parameter for testing during development
41
	///
mvalle's avatar
mvalle committed
42
43
44
	BranchSiteModel(Forest&      aForest,
					size_t       aNumBranches,
					size_t       aNumSites,
45
46
					unsigned int aSeed,
					unsigned int aNumVariables,
mvalle's avatar
mvalle committed
47
48
					bool         aOnlyInitialStep,
					bool         aTrace,
49
					unsigned int aOptAlgo,
mvalle's avatar
mvalle committed
50
51
52
					double       aDeltaValueForGradient,
					double       aRelativeError,
					bool	     aNoParallel,
53
					unsigned int aVerbose,
54
					unsigned int aExtraDebug)
55
		: mForest(aForest),
56
		  mVar(aNumBranches+aNumVariables),
mvalle's avatar
mvalle committed
57
58
		  mFgScale(0.0),
		  mBgScale(0.0),
59
		  mMaxLnL(-DBL_MAX),
mvalle's avatar
mvalle committed
60
		  mDeltaForGradient((aDeltaValueForGradient > 0.0) ? aDeltaValueForGradient : sqrt(DBL_EPSILON)),
61
		  mLikelihoods(Nt*aNumSites),
62
63
64
65
		  mOnlyInitialStep(aOnlyInitialStep),
		  mTrace(aTrace),
		  mOptAlgo(aOptAlgo),
		  mInitType(INIT_TYPE_NONE),
mvalle's avatar
mvalle committed
66
		  mNumTimes(static_cast<unsigned int>(aNumBranches)),
mvalle's avatar
mvalle committed
67
		  mNumVariables(aNumVariables),
68
		  mExtraDebug(aExtraDebug),
69
		  mVerbose(aVerbose),
mvalle's avatar
mvalle committed
70
		  mNumEvaluations(0),
71
72
		  mDependencies(aForest, aVerbose),
		  mNoParallel(aNoParallel),
73
74
		  mSeed(aSeed),
		  mRelativeError(aRelativeError)
75
	{
mvalle's avatar
mvalle committed
76
		setLimits(aNumBranches, static_cast<size_t>(aNumVariables));
77
78
79
80
81
82
	}

	/// Destructor.
	///
	virtual ~BranchSiteModel() {}

83
public:
84
85
86
87
88
89
90
91
	/// Set the times on the tree from the variables
	///
	void saveComputedTimes(void) const {mForest.setLengthsFromTimes(mVar);}

	/// Formatted print of the maximizer variables array
	///
	/// @param[in] aVars The variables array to be printed
	/// @param[in] aLnl The likelihood value to be printed
92
	/// @param[in] aOut The stream on which the variables should be printed
93
	///
94
	void printVar(const std::vector<double>& aVars, double aLnl=DBL_MAX, std::ostream& aOut=std::cout) const;
95

mvalle's avatar
mvalle committed
96
97
98
99
	/// Formatted print of the maximized variables
	///
	/// @param[in] aOut The stream on which the variables should be printed
	///
100
	void printFinalVars(std::ostream& aOut=std::cout) const;
mvalle's avatar
mvalle committed
101

102
103
104
	/// Compute the maximum likelihood for the given forest
	///
	/// @param[in] aFgBranch The number of the internal branch to be marked as foreground
mvalle's avatar
mvalle committed
105
106
	/// @param[in] aStopIfBigger If true stop computation as soon as lnl is over aThreshold
	/// @param[in] aThreshold The threshold at which the maximization should be stopped
107
108
109
	///
	/// @return The maximum Likelihood value
	///
mvalle's avatar
mvalle committed
110
111
112
113
114
	/// @exception FastCodeMLFatal Exception in Ming2 computation
	/// @exception FastCodeMLFatal Invalid optimization algorithm identifier on the command line
	/// @exception FastCodeMLFatal Exception in computation
	///
	double maximizeLikelihood(size_t aFgBranch, bool aStopIfBigger, double aThreshold);
115

116
117
118
119
120
121
122
123
124
125
	/// Compute the likelihood for the given forest and the given set of parameters when computing gradient.
	///
	/// @param[in] aVar The optimizer variables
	/// @param[in] aTrace If set visualize the best result so far
	/// @param[in] aGradientVar Used in gradient computation to avoid unneeded computations.
	///
	/// @return The maximum Likelihood value
	///
	virtual double computeLikelihoodForGradient(const std::vector<double>& aVar, bool aTrace, size_t aGradientVar) =0;

126
127
128
129
130
131
132
	/// Compute the likelihood for the given forest and the given set of parameters.
	///
	/// @param[in] aVar The optimizer variables
	/// @param[in] aTrace If set visualize the best result so far
	///
	/// @return The maximum Likelihood value
	///
133
	virtual double computeLikelihood(const std::vector<double>& aVar, bool aTrace) =0;
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155

	/// Compute the likelihood for the given forest and the given set of parameters.
	/// This version is for use inside Ming2 minimizer
	///
	/// @param[in] aVar The optimizer variables
	/// @param[in] aVarLen The optimizer variables array length
	/// @param[in] aTrace If set visualize the best result so far
	///
	/// @return The maximum Likelihood value
	///
	double computeLikelihood(double* aVar, int aVarLen, bool aTrace)
	{
		std::vector<double> x(aVar, aVar+aVarLen);
		return computeLikelihood(x, aTrace);
	}

	/// Get variable values
	///
	/// @param[out] aVariables Vector that will be filled with the variables
	///
	void getVariables(std::vector<double>& aVariables) const {aVariables = mVar;}

mvalle's avatar
mvalle committed
156
157
158
159
160
161
	/// Get variable values
	///
	/// @return The optimized variables
	///
	const std::vector<double>& getVariables(void) const {return mVar;}

162
163
164
165
166
167
168
169
170
171
172
173
174
175
	/// Get the number of function evaluation.
	///
	/// @return The number of likelihood function calls
	///
	unsigned int getNumEvaluations(void) const {return mNumEvaluations;}

	/// Perform the Likelihood Ratio Test.
	/// LRT test: -2*(lnl0-lnl1) > chisq(.95, df=1)
	///
	/// @param[in] aLnL0 Max LogLikelihood for H0
	/// @param[in] aLnL1 Max LogLikelihood for H1
	///
	///	@return True if the test is passed
	///
mvalle's avatar
mvalle committed
176
	static bool performLRT(double aLnL0, double aLnL1) {return (aLnL1 - aLnL0) > THRESHOLD_FOR_LRT;}
177
178
179
180
181
182
183
	
	/// Get site multeplicity values.
	///
	/// @return Reference to the array of site multiplicities
	///
	const std::vector<double>& getSiteMultiplicity(void) const {return mForest.getSiteMultiplicity();}

mvalle's avatar
mvalle committed
184
185
186
187
188
189
	/// Get the corresponding forest.
	///
	/// @return Reference to the forest
	///
	Forest& getForest(void) {return mForest;}

mvalle's avatar
mvalle committed
190
191
192
193
194
195
	/// Get the latest scale values suitable for BEB computation.
	///
	/// @param[out] aScales The array will get the fg scale in [1] and the bg scale in [0]
	///
	void getScales(std::vector<double>& aScales) const {aScales.resize(2); aScales[0] = mBgScale; aScales[1] = mFgScale;}

196
197
198
protected:
	/// Compute the four site proportions from the two values in the optimization variables
	///
mvalle's avatar
mvalle committed
199
200
201
202
203
204
205
206
207
	/// Meaning of the various classes:
	/// - class 0: purifying evolution
	/// - class 1: neutral evolution
	/// - class 2a: positive selection on foreground branch and purifying on background
	/// - class 2b: positive selection on foreground branch and neutral on background
	///
	/// @param[in] aV0 The first optimization variable
	/// @param[in] aV1 The second optimization variable
	/// @param[out] aProportions The four proportions (p0, p1, p2a, p2b) computed from aV0 and aV1
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
	///
	void getProportions(double aV0, double aV1, double* aProportions) const
	{
#ifdef USE_ORIGINAL_PROPORTIONS
		aProportions[0] = exp(aV0);
		aProportions[1] = exp(aV1);
		double tot = aProportions[0] + aProportions[1] + 1;
		aProportions[0] /= tot;
		aProportions[1] /= tot;
		tot = aProportions[0] + aProportions[1];

		aProportions[2] = (1. - tot)*aProportions[0]/tot;
		aProportions[3] = (1. - tot)*aProportions[1]/tot;
#else
		aProportions[0] = aV0*aV1;
223
224
225
		aProportions[1] = aV0*(1.-aV1);
		aProportions[2] = (1.-aV0)*aV1;
		aProportions[3] = (1.-aV0)*(1.-aV1);
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#endif
	}

	/// Initialize variables to be optimized.
	/// It uses mInitType to know what has been already initialized by initFromTree() or initFromResult()
	///
	void initVariables(void);

private:
	/// Set upper and lower limits for the maximization domain
	///
	/// @param[in] aNumTimes Number of times (ie branch lengths)
	/// @param[in] aNumVariables Number of other variables (4 for H0, 5 for H1)
	///
mvalle's avatar
mvalle committed
240
	void setLimits(size_t aNumTimes, size_t aNumVariables);
241
242
243
244
245
246
247

	/// Generate a double random number between 0 and 1
	///
	/// @return The random number
	///
	static inline double randFrom0to1(void) {return static_cast<double>(rand())/static_cast<double>(RAND_MAX);}

248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
	/// Valid values (on the command line) for the optimization algorithm
	///
	enum OptimAlgoIdentifier
	{
		OPTIM_LD_LBFGS		= 0,	///< Same optimizer as CodeML
		OPTIM_LD_VAR1		= 1,	///< Shifted limited-memory variable-metric rank-1 method
		OPTIM_LD_VAR2		= 2,	///< Shifted limited-memory variable-metric rank-2 method
		OPTIM_LD_SLSQP		= 3,	///< Sequential quadratic programming (SQP) algorithm

		OPTIM_LN_BOBYQA		= 11,	///< Derivative-free bound-constrained optimization using an iteratively constructed quadratic approximation for the objective function

		OPTIM_LD_MING2		= 22,	///< The optimizer extracted from CodeML

		OPTIM_MLSL_LDS		= 99	///< A global optimizer
	};

	/// Valid values for the mInitType variable depicting from where the variables have been initialized.
	///
	enum InitVarStatus
	{
		INIT_TYPE_NONE,			///< All variables to optimize should be initialized
		INIT_TYPE_TIMES,		///< All variables to optimize should be initialized except times
		INIT_TYPE_RES_4,		///< All variables to optimize should be initialized except times, w0, k, v1, v2
		INIT_TYPE_RES_5			///< All variables to optimize have been initialized
	};
273
274
275
276
277
278

public:
	/// Initialize the times from the input phylogenetic tree
	///
	void initFromTree(void);

279
280
281
	/// Initialize the times from the input phylogenetic tree and set the other values to hardcoded constants with P0=1 and P1=0.
	/// This routine could be used in place of initFromTree()
	///
mvalle's avatar
mvalle committed
282
283
	/// @exception FastCodeMLFatal Invalid p0 and p1 values
	///
284
	void initFromTreeAndParams(void);
285

286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
	/// Initialize variables from a previous optimization result
	///
	/// @param[in] aPreviousResult A previous result from another model (obtained by getVariables())
	/// @param[in] aValidLen If set gives how many values to take from aPreviousResult
	///
	void initFromResult(const std::vector<double>& aPreviousResult, unsigned int aValidLen=0);


private:
	/// Disabled assignment operator to avoid warning on Windows
	///
	/// @fn BranchSiteModel& operator=(const BranchSiteModel& aObj)
	///
	/// @param[in] aObj The object to be assigned
	///
	/// @return The object receiving the assignment
	///
303
	BranchSiteModel& operator=(const BranchSiteModel& /*aObj*/);
304
305
306
307
308
309
310
311


protected:
	Forest&						mForest;			///< The forest to be used
	std::vector<double>			mVar;				///< Variable to optimize (first the branch lengths then the remaining variables)
	std::vector<double>			mLowerBound;		///< Lower limits for the variables to be optimized
	std::vector<double>			mUpperBound;		///< Upper limits for the variables to be optimized
	double						mProportions[4];	///< The four proportions
mvalle's avatar
mvalle committed
312
313
	double						mFgScale;			///< The computed foreground branch scale
	double						mBgScale;			///< The computed background branches scale
314
	double						mMaxLnL;			///< Maximum value of LnL found during optimization
315
	double						mDeltaForGradient;	///< Value used to change the variables to compute gradient
316
317
318
319
320
	CacheAlignedDoubleVector	mLikelihoods;		///< Computed likelihoods at the root of all trees. Defined here to make it aligned.
	bool						mOnlyInitialStep;	///< Only the initial step is executed, no optimization
	bool						mTrace;				///< Enable maximization tracing
	unsigned int				mOptAlgo;			///< Optimization algorithm to use
	InitVarStatus				mInitType;			///< From where the variables have been initialized
321
322
	unsigned int				mNumTimes;			///< Number of branch lengths values
	unsigned int				mNumVariables;		///< The number of extra variables (4 for H0 and 5 for H1)
323
	unsigned int				mExtraDebug;		///< Parameter for extra development testing
324
	unsigned int				mVerbose;			///< Parameter for extra development testing
325
	unsigned int				mNumEvaluations;	///< Counter of the likelihood function evaluations
326
	TreeAndSetsDependencies		mDependencies;		///< The dependency list between trees to use in this run
mvalle's avatar
mvalle committed
327
	bool						mNoParallel;		///< True if no preparation for multithreading should be done
328
329
330

private:
	unsigned int				mSeed;				///< Random number generator seed to be used also by the optimizer
331
	double						mRelativeError;		///< Relative error to stop maximization
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
};



/// Null Hypothesis test.
///
///     @author Mario Valle - Swiss National Supercomputing Centre (CSCS)
///     @date 2010-12-23 (initial version)
///     @version 1.0
///
///
class BranchSiteModelNullHyp : public BranchSiteModel
{
public:
	/// Constructor.
	///
	/// @param[in] aForest The forest for which the maximum likelihood should be computed
mvalle's avatar
mvalle committed
349
	/// @param[in] aCmdLine The command line parameters
350
	///
351
352
353
	BranchSiteModelNullHyp(Forest& aForest, const CmdLine& aCmdLine)
		: BranchSiteModel(aForest, aForest.getNumBranches(), aForest.getNumSites(),
						  aCmdLine.mSeed, 4, aCmdLine.mNoMaximization, aCmdLine.mTrace,
354
						  aCmdLine.mOptimizationAlgo, aCmdLine.mDeltaValueForGradient,
355
						  aCmdLine.mRelativeError, aCmdLine.mForceSerial || aCmdLine.mDoNotReduceForest, aCmdLine.mVerboseLevel, aCmdLine.mExtraDebug),
356
						  mSet(aForest.getNumBranches()), mSetForGradient(aForest.getNumBranches()),
357
358
359
360
361
362
363
364
						  mPrevK(DBL_MAX), mPrevOmega0(DBL_MAX)
	{
		// Initialize the dependency set
		mDependencies.computeDependencies(3, mNoParallel);
		mDependencies.print("TEST FOR H0 (before optimization)");
		mDependencies.optimizeDependencies();
		mDependencies.print("TEST FOR H0");
	}
365
366
367
368

	/// Compute the null hypothesis log likelihood.
	///
	/// @param[in] aFgBranch The identifier for the branch marked as foreground branch
mvalle's avatar
mvalle committed
369
370
	/// @param[in] aStopIfBigger If true stop computation as soon as lnl is over aThreshold
	/// @param[in] aThreshold The threshold at which the maximization should be stopped
371
372
373
	///
	/// @return The log likelihood under the null hypothesis
	///
mvalle's avatar
mvalle committed
374
	double operator()(size_t aFgBranch, bool aStopIfBigger=false, double aThreshold=0.);
375

376
377
378
379
380
381
382
383
384
385
	/// Compute the likelihood for the given forest and the given set of parameters when computing gradient.
	///
	/// @param[in] aVar The optimizer variables
	/// @param[in] aTrace If set visualize the best result so far
	/// @param[in] aGradientVar Used in gradient computation to avoid unneeded computations.
	///
	/// @return The maximum Likelihood value
	///
	double computeLikelihoodForGradient(const std::vector<double>& aVar, bool aTrace, size_t aGradientVar);

386
387
388
389
390
391
392
	/// Compute the likelihood for the given forest and the given set of parameters.
	///
	/// @param[in] aVar The optimizer variables
	/// @param[in] aTrace If set visualize the best result so far
	///
	/// @return The maximum Likelihood value
	///
393
	double computeLikelihood(const std::vector<double>& aVar, bool aTrace);
394

mvalle's avatar
mvalle committed
395

396
397
398
399
400
401
402
403
404
private:
	/// Disabled assignment operator to avoid warning on Windows
	///
	/// @fn BranchSiteModelNullHyp& operator=(const BranchSiteModelNullHyp& aObj)
	///
	/// @param[in] aObj The object to be assigned
	///
	/// @return The object receiving the assignment
	///
405
	BranchSiteModelNullHyp& operator=(const BranchSiteModelNullHyp& /*aObj*/);
406

407
408
409
410
411
412
	/// Combine the sites' various codon classes likelihoods into one log-likelihood value
	///
	/// @return The tree log-likelihood value
	///
	double combineSiteLikelihoods(void);

413
private:
414
415
416
417
418
419
420
421
	TransitionMatrix 		mQw0;				///< Q matrix for the omega0 case
	TransitionMatrix 		mQ1;				///< Q matrix for the omega1 == 1 case
	ProbabilityMatrixSetH0	mSet;				///< Set of matrices used for the tree visits
	ProbabilityMatrixSetH0	mSetForGradient;	///< Set of matrices used for the tree visits
	double					mPrevK;				///< Previous k value used to compute matrices
	double					mPrevOmega0;		///< Previous w0 value used to compute matrices
	double					mScaleQw0;			///< Scale value for Qw0
	double					mScaleQ1;			///< Scale value for Q1
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
};



/// Alternate Hypothesis test.
///
///     @author Mario Valle - Swiss National Supercomputing Centre (CSCS)
///     @date 2010-12-23 (initial version)
///     @version 1.0
///
///
class BranchSiteModelAltHyp : public BranchSiteModel
{
public:
	/// Constructor.
	///
	/// @param[in] aForest The forest for which the maximum likelihood should be computed
mvalle's avatar
mvalle committed
439
	/// @param[in] aCmdLine The command line parameters
440
	///
441
442
443
	BranchSiteModelAltHyp(Forest& aForest, const CmdLine& aCmdLine)
		: BranchSiteModel(aForest, aForest.getNumBranches(), aForest.getNumSites(),
						  aCmdLine.mSeed, 5, aCmdLine.mNoMaximization, aCmdLine.mTrace,
444
						  aCmdLine.mOptimizationAlgo, aCmdLine.mDeltaValueForGradient,
445
						  aCmdLine.mRelativeError, aCmdLine.mForceSerial || aCmdLine.mDoNotReduceForest, aCmdLine.mVerboseLevel, aCmdLine.mExtraDebug),
446
						  mSet(aForest.getNumBranches()), mSetForGradient(aForest.getNumBranches()),
447
448
449
450
451
452
453
454
						  mPrevK(DBL_MAX), mPrevOmega0(DBL_MAX), mPrevOmega2(DBL_MAX)
	{
		// Initialize the dependency set
		mDependencies.computeDependencies(4, mNoParallel);
		mDependencies.print("TEST FOR H1 (before optimization)");
		mDependencies.optimizeDependencies();
		mDependencies.print("TEST FOR H1");
	}
455
456
457
458
459
460
461
462
463

	/// Compute the alternative hypothesis log likelihood.
	///
	/// @param[in] aFgBranch The identifier for the branch marked as foreground branch
	///
	/// @return The log likelihood under the alternative hypothesis
	///
	double operator()(size_t aFgBranch);

464
	/// Compute the likelihood for the given forest and the given set of parameters when computing gradient.
465
466
467
	///
	/// @param[in] aVar The optimizer variables
	/// @param[in] aTrace If set visualize the best result so far
mvalle's avatar
mvalle committed
468
	/// @param[in] aGradientVar Used in gradient computation to avoid unneeded computations. If set to UINT_MAX it is ignored
469
470
471
	///
	/// @return The maximum Likelihood value
	///
472
473
474
475
476
477
478
479
480
481
	double computeLikelihoodForGradient(const std::vector<double>& aVar, bool aTrace, size_t aGradientVar);

	/// Compute the likelihood for the given forest and the given set of parameters.
	///
	/// @param[in] aVar The optimizer variables
	/// @param[in] aTrace If set visualize the best result so far
	///
	/// @return The maximum Likelihood value
	///
	double computeLikelihood(const std::vector<double>& aVar, bool aTrace);
482
483

private:
484
	/// Disabled assignment operator to avoid warnings on Windows.
485
486
487
488
489
490
491
	///
	/// @fn BranchSiteModelAltHyp& operator=(const BranchSiteModelAltHyp& aObj)
	///
	/// @param[in] aObj The object to be assigned
	///
	/// @return The object receiving the assignment
	///
492
	BranchSiteModelAltHyp& operator=(const BranchSiteModelAltHyp& /*aObj*/);
493

494
495
496
497
498
499
	/// Combine the sites' various codon classes likelihoods into one log-likelihood value
	///
	/// @return The tree log-likelihood value
	///
	double combineSiteLikelihoods(void);

500
501

private:
502
503
504
505
506
507
508
509
510
511
512
	CheckpointableTransitionMatrix	mQw0;				///< Q matrix for the omega0 case
	TransitionMatrix				mQw2;				///< Q matrix for the omega2 case
	CheckpointableTransitionMatrix	mQ1;  				///< Q matrix for the omega1 == 1 case
	ProbabilityMatrixSetH1			mSet;				///< Set of matrices used for the tree visits
	ProbabilityMatrixSetH1			mSetForGradient;	///< Set of matrices used for the tree visits
	double							mPrevK;				///< Previous k value used to compute matrices
	double							mPrevOmega0;		///< Previous w0 value used to compute matrices
	double							mPrevOmega2;		///< Previous w2 value used to compute matrices
	double							mScaleQw0;			///< Scale value for Qw0
	double							mScaleQw2;			///< Scale value for Qw2
	double							mScaleQ1;			///< Scale value for Q1
513
514
515
};

#endif