Example Levenberg-Marquardt
Levenberg-Marquardt (LM) is a popular non-linear optimization algorithm. This example demonstrates how a basic implementation of Levenberg-Marquardt can be created using EJML's procedural interface. Unnecessary allocation of new memory is avoided by reshaping matrices. When a matrix is reshaped its width and height are changed, but new memory is not declared unless the new shape requires more memory than is already available.
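The snippet below is a small sketch of that reshaping behavior. The ReshapeDemo class name and the values are made up for illustration; DMatrixRMaj.reshape() and the public data array are part of EJML.

import org.ejml.data.DMatrixRMaj;

public class ReshapeDemo {
    public static void main( String[] args ) {
        DMatrixRMaj m = new DMatrixRMaj(10, 10); // backing array holds 100 elements
        double[] before = m.data;

        m.reshape(5, 3); // fits in the existing array, so nothing is allocated
        System.out.println(m.data == before); // prints: true

        m.reshape(20, 20); // 400 elements no longer fit, so a new array is declared
        System.out.println(m.data == before); // prints: false
    }
}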
LM works by being provided a function which computes the residual error. Residual error is defined as the difference between the predicted output and the actual observed output, i.e. f(x)-y. Optimization then finds the set of parameters which minimizes the magnitude of the residuals, measured using the Euclidean (L2) norm.
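To make the API concrete, here is a minimal sketch of how the class below could be used to fit the model f(x) = a*exp(b*x) to a handful of observations. The data values, initial guesses, and the FitExponential class name are made up for illustration; only LevenbergMarquardt and its nested ResidualFunction interface come from the example code on this page.

import org.ejml.data.DMatrixRMaj;

public class FitExponential {
    public static void main( String[] args ) {
        // made-up observations of y = a*exp(b*x)
        final double[] x = {0, 1, 2, 3, 4};
        final double[] y = {2.0, 3.3, 5.4, 9.0, 14.8};

        // residual function: residual[i] = f(x[i]) - y[i]
        LevenbergMarquardt.ResidualFunction func = new LevenbergMarquardt.ResidualFunction() {
            @Override
            public void compute( DMatrixRMaj param, DMatrixRMaj residual ) {
                double a = param.get(0);
                double b = param.get(1);
                for (int i = 0; i < x.length; i++) {
                    residual.set(i, 0, a*Math.exp(b*x[i]) - y[i]);
                }
            }

            @Override
            public int numFunctions() {
                return x.length; // one residual per observation
            }
        };

        // initial dampening of 1, as suggested by the constructor's documentation
        LevenbergMarquardt lm = new LevenbergMarquardt(1.0);

        // initial guess for (a, b); refined in place by optimize()
        DMatrixRMaj parameters = new DMatrixRMaj(2, 1, true, 1.0, 0.1);

        if (lm.optimize(func, parameters)) {
            System.out.println("a = " + parameters.get(0) + "  b = " + parameters.get(1));
            System.out.println("cost: " + lm.getInitialCost() + " -> " + lm.getFinalCost());
        }
    }
}

Each call to compute() must fill the M-by-1 residual vector for the given parameters; optimize() overwrites the parameter vector with the best estimate found.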
Note: This is a simple, straightforward implementation of Levenberg-Marquardt and is not as robust as Minpack's implementation. If you are looking for a robust non-linear least-squares minimization library in Java, check out DDogleg.
External Resources:
- LevenbergMarquardt.java code
Example Code
import org.ejml.data.DMatrixRMaj;
import org.ejml.dense.row.CommonOps_DDRM;
import org.ejml.dense.row.NormOps_DDRM;

/**
* <p>
 * This is a straightforward implementation of the Levenberg-Marquardt (LM) algorithm. LM is used to minimize
* non-linear cost functions:<br>
* <br>
* S(P) = Sum{ i=1:m , [y<sub>i</sub> - f(x<sub>i</sub>,P)]<sup>2</sup>}<br>
* <br>
* where P is the set of parameters being optimized.
* </p>
*
* <p>
* In each iteration the parameters are updated using the following equations:<br>
* <br>
 * P<sub>i+1</sub> = P<sub>i</sub> - (H + λ I)<sup>-1</sup> d <br>
 * d = J<sup>T</sup> (f(x;P<sub>i</sub>) - y) <br>
 * H = J<sup>T</sup> J <br>
 * <br>
 * where J is the Jacobian of the residual function with respect to the parameters, evaluated at P<sub>i</sub>.
* </p>
* <p>
* Whenever possible the allocation of new memory is avoided. This is accomplished by reshaping matrices.
* A matrix that is reshaped won't grow unless the new shape requires more memory than it has available.
* </p>
*
* @author Peter Abeles
*/
public class LevenbergMarquardt {
// Convergence criteria
private int maxIterations = 100;
private double ftol = 1e-12;
private double gtol = 1e-12;
    // how much the numerical jacobian calculation perturbs the parameters by.
    // Better implementations compute this delta adaptively. See Numerical Recipes.
private final static double DELTA = 1e-8;
    // Dampening parameter. Larger values make the update behave more like gradient descent
private double initialLambda;
// the function that is optimized
private ResidualFunction function;
// the optimized parameters and associated costs
private DMatrixRMaj candidateParameters = new DMatrixRMaj(1, 1);
private double initialCost;
private double finalCost;
// used by matrix operations
private DMatrixRMaj g = new DMatrixRMaj(1, 1); // gradient
private DMatrixRMaj H = new DMatrixRMaj(1, 1); // Hessian approximation
private DMatrixRMaj Hdiag = new DMatrixRMaj(1, 1);
private DMatrixRMaj negativeStep = new DMatrixRMaj(1, 1);
// variables used by the numerical jacobian algorithm
private DMatrixRMaj temp0 = new DMatrixRMaj(1, 1);
private DMatrixRMaj temp1 = new DMatrixRMaj(1, 1);
// used when computing d and H variables
private DMatrixRMaj residuals = new DMatrixRMaj(1, 1);
// Where the numerical Jacobian is stored.
private DMatrixRMaj jacobian = new DMatrixRMaj(1, 1);
public double getInitialCost() {
return initialCost;
}
public double getFinalCost() {
return finalCost;
}
/**
* @param initialLambda Initial value of dampening parameter. Try 1 to start
*/
public LevenbergMarquardt( double initialLambda ) {
this.initialLambda = initialLambda;
}
/**
* Specifies convergence criteria
*
* @param maxIterations Maximum number of iterations
     * @param ftol convergence based on change in function value. Try 1e-12
     * @param gtol convergence based on gradient magnitude. Try 1e-12
*/
public void setConvergence( int maxIterations, double ftol, double gtol ) {
this.maxIterations = maxIterations;
this.ftol = ftol;
this.gtol = gtol;
}
/**
* Finds the best fit parameters.
*
* @param function The function being optimized
* @param parameters (Input/Output) initial parameter estimate and storage for optimized parameters
* @return true if it succeeded and false if it did not.
*/
public boolean optimize( ResidualFunction function, DMatrixRMaj parameters ) {
configure(function, parameters.getNumElements());
// save the cost of the initial parameters so that it knows if it improves or not
double previousCost = initialCost = cost(parameters);
// iterate until the difference between the costs is insignificant
double lambda = initialLambda;
// if it should recompute the Jacobian in this iteration or not
boolean computeHessian = true;
for (int iter = 0; iter < maxIterations; iter++) {
if (computeHessian) {
            // compute the gradient and Hessian approximation at the current parameters
computeGradientAndHessian(parameters);
computeHessian = false;
// check for convergence using gradient test
boolean converged = true;
for (int i = 0; i < g.getNumElements(); i++) {
if (Math.abs(g.data[i]) > gtol) {
converged = false;
break;
}
}
            if (converged) {
                finalCost = previousCost;
                return true;
            }
}
// H = H + lambda*I
for (int i = 0; i < H.numRows; i++) {
H.set(i, i, Hdiag.get(i) + lambda);
}
// In robust implementations failure to solve is handled much better
if (!CommonOps_DDRM.solve(H, g, negativeStep)) {
return false;
}
        // compute the candidate parameters. negativeStep points uphill, hence the subtraction
CommonOps_DDRM.subtract(parameters, negativeStep, candidateParameters);
double cost = cost(candidateParameters);
if (cost <= previousCost) {
// the candidate parameters produced better results so use it
computeHessian = true;
parameters.setTo(candidateParameters);
// check for convergence
            // converged when (cost(k) - cost(k+1))/cost(k) <= ftol
boolean converged = ftol*previousCost >= previousCost - cost;
previousCost = cost;
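                // the step improved the cost: relax the dampening so the next step is more Gauss-Newton-like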
lambda /= 10.0;
                if (converged) {
                    finalCost = previousCost;
                    return true;
                }
} else {
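                // the step made the cost worse: reject it and increase the dampening,
                // making the next attempt smaller and more like gradient descent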
lambda *= 10.0;
}
}
finalCost = previousCost;
return true;
}
/**
* Performs sanity checks on the input data and reshapes internal matrices. By reshaping
* a matrix it will only declare new memory when needed.
*/
protected void configure( ResidualFunction function, int numParam ) {
this.function = function;
int numFunctions = function.numFunctions();
// reshaping a matrix means that new memory is only declared when needed
candidateParameters.reshape(numParam, 1);
g.reshape(numParam, 1);
H.reshape(numParam, numParam);
negativeStep.reshape(numParam, 1);
// Normally these variables are thought of as row vectors, but it works out easier if they are column
temp0.reshape(numFunctions, 1);
temp1.reshape(numFunctions, 1);
residuals.reshape(numFunctions, 1);
jacobian.reshape(numFunctions, numParam);
}
/**
* Computes the d and H parameters.
*
* d = J'*(f(x)-y) <--- that's also the gradient
* H = J'*J
*/
private void computeGradientAndHessian( DMatrixRMaj param ) {
// residuals = f(x) - y
function.compute(param, residuals);
computeNumericalJacobian(param, jacobian);
CommonOps_DDRM.multTransA(jacobian, residuals, g);
CommonOps_DDRM.multTransA(jacobian, jacobian, H);
CommonOps_DDRM.extractDiag(H, Hdiag);
}
/**
* Computes the "cost" for the parameters given.
*
* cost = (1/N) Sum (f(x) - y)^2
*/
private double cost( DMatrixRMaj param ) {
function.compute(param, residuals);
double error = NormOps_DDRM.normF(residuals);
return error*error/(double)residuals.numRows;
}
/**
* Computes a simple numerical Jacobian.
*
* @param param (input) The set of parameters that the Jacobian is to be computed at.
* @param jacobian (output) Where the jacobian will be stored
*/
protected void computeNumericalJacobian( DMatrixRMaj param,
DMatrixRMaj jacobian ) {
double invDelta = 1.0/DELTA;
function.compute(param, temp0);
// compute the jacobian by perturbing the parameters slightly
        // then seeing how it affects the results.
for (int i = 0; i < param.getNumElements(); i++) {
param.data[i] += DELTA;
function.compute(param, temp1);
// compute the difference between the two parameters and divide by the delta
// temp1 = (temp1 - temp0)/delta
CommonOps_DDRM.add(invDelta, temp1, -invDelta, temp0, temp1);
            // copy the results into column i of the jacobian matrix
            // J(:,i) = temp1
CommonOps_DDRM.insert(temp1, jacobian, 0, i);
param.data[i] -= DELTA;
}
}
/**
     * The function that is being optimized. Computes the residual, f(x) - y.
*/
public interface ResidualFunction {
/**
         * Computes the residual vector given the set of input parameters.
         * The function maps N input parameters to M output residuals.
*
* @param param (Input) N by 1 parameter vector
* @param residual (Output) M by 1 output vector to store the residual = f(x)-y
*/
void compute( DMatrixRMaj param, DMatrixRMaj residual );
/**
         * The number of functions in the output, i.e. the number of elements in the residual vector.
         *
         * @return number of functions
*/
int numFunctions();
}
}