//   SVM, version 1.03

/*
    Copyright Vladimir Kolmogorov vnk@ist.ac.at 2014

    This file is part of SVM.

    SVM is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SVM is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SVM.  If not, see <http://www.gnu.org/licenses/>.
*/

/*
	Implements the MP-BCFW algorithm for training structural SVMs described in

		Neel Shah, Vladimir Kolmogorov, Christoph H. Lampert
		A Multi-Plane Block-Coordinate Frank-Wolfe Algorithm for Structural SVMs with a Costly max-Oracle
		Technical report arXiv:1408.6804, August 2014

	With parameters cp_max = approx_max = 0 reduces to the BCFW algorithm described in

		S. Lacoste-Julien, M. Jaggi, M. Schmidt, P. Pletscher
		Block-Coordinate Frank-Wolfe Optimization for Structural SVMs
		ICML 2013, Atlanta, USA, June 2013

	If you use this software for research purposes you should cite the aforementioned paper(s) in any resulting publication.

	///////////////////////////////////////////////////////////////////////////////////////////////

	Solves the following problem:
		min  1/2 \lambda ||w||^2 + \mu \sum_{i=1}^n H_i(w)
	where \lambda, \mu > 0 and
		H_i(w) = \max_y <a^{iy},[w 1]>

	It is assumed H_i(w) can be evaluated efficiently, i.e. the problem
		\max_y <a^{iy},[w 1]>
	can be solved efficiently for a given i and w.
*/

#ifndef OAISJNHFOASFASFASFASFNVASF
#define OAISJNHFOASFASFASFASFNVASF

#include <stdio.h>

#include "block.h"

// Type of a function that solves the maximization problem (should be provided by the user).
//
// INPUT:
//   i        - index of the term H_i to maximize
//   w        - current weight vector, array of size 'd' (already allocated)
//   user_arg - the opaque pointer that was passed to the SVM constructor
// OUTPUT:
//   a        - vector a=a^{iy}  where  y \in \argmax_y <a^{iy},[w 1]>,
//              written into an array of size 'd+1' (already allocated).
//              The last element in 'a' corresponds to '1', i.e. it is the constant term of the plane.
typedef void (*MaxFn)(int i, double* w, double* a, void* user_arg);

// Solver for the problem
//     min  1/2 \lambda ||w||^2 + \mu \sum_{i=1}^n H_i(w),    H_i(w) = \max_y <a^{iy},[w 1]>
// where each H_i is accessed only through a user-supplied max-oracle (see MaxFn).
// Tunable parameters live in the public member 'options' and should be set before calling Solve().
class SVM
{
public:
	// d = dimension of w, n = # of examples.
	// lambda, mu = weights of the two parts of the objective (both > 0).
	// max_fn = user-supplied oracle, user_arg = opaque pointer that is handed to the oracle.
	//
	// If zero_lower_bound is true then H_i(w)\ge 0 for all i and w
	// (this is the case for the SVM objective - this lower bound is given by the ground truth labeling y^i).
	// If this flag is on then the algorithm is initialized with the plane (0,...,0,0) for each term,
	// otherwise the initial lower bound plane is obtained by calling the oracle for some w.
	//
	// Internally, the terms are partitioned into groups of size 'group_size'. group_size=n corresponds to (non-block coordinate) Frank-Wolfe.
	SVM(int d, int n, double lambda, double mu, MaxFn max_fn, void* user_arg, bool zero_lower_bound, int group_size=1);
	~SVM();

	double* Solve(); // returns a pointer to an array of size 'd' containing solution (vector w).
	                 // For options to Solve(), see SVM::options below

	void GetBounds(double& lower_bound, double& upper_bound); // of internally stored solution. Expensive - calls n oracles!

	double Evaluate(double* w); // returns the value of the objective function for given w. Expensive - calls n oracles!

	void* GetUserArg() { return user_arg; } // the pointer that was passed to the constructor

	// To get block-coordinate Frank-Wolfe, set cp_max = approx_max = 0.
	// ('cp' stands for 'cutting plane')
	struct Options
	{
		// Sets the default value of every option.
		// The initializers are listed in member declaration order, which is the order
		// in which C++ actually initializes them.
		Options() :
			randomize_method(2),
			avg_flag(3),

			iter_max(300),
			approx_max(1000), // <--- probably will not be reached (due to the param below)
			approx_limit_ratio(1.0),
			kernel_max(1),

			cp_max(100), // <--- probably will not be reached (due to the param below). Can be decreased if memory is an issue
			cp_inactive_iter_max(10), // <--- PERHAPS THE MOST IMPORTANT PARAMETER:
			                          //      for how many iterations inactive planes are kept in memory

			callback_fn(default_callback_fn),
			callback_freq(5),
			gap_threshold(1e-10),
			print_flag(2),
			exclude_callback_time(false)
		{
		}

		int randomize_method; // 0: use default order for every iteration (0,1,...,n-1)
		                      // 1: generate a random permutation, use it for every iteration
		                      // 2: generate a new random permutation at every iteration
		                      // 3: generate a new random permutation at every exact & approximate pass
		                      // 4: for every step sample example in {0,1,...,n-1} uniformly at random

		int avg_flag; // 0: don't use averaging
		              // 1: compute weighted average of vectors after each update, as described in (Lacoste-Julien et al. ICML'13)
		              // 2: compute weighted average of vectors after each exact update, as described in (Lacoste-Julien et al. ICML'13)
		              // 3: compute two vectors: avg_exact - weighted average of vectors after each exact update
		              //                         avg_approx - weighted average of vectors after each approx update
		              //    return their best interpolation

		int iter_max;   // upper limit on the number of outer iterations (NOTE(review): inferred from the name - confirm in Solve())
		int approx_max; // >= 0. Each iter first performs one pass with calls to the 'real' oracle,
		                //       and then up to 'approx_max' passes with calls to the 'approximate' oracle
		                //       It is recommended to set it to a large number and rely on the criterion below.
		double approx_limit_ratio; // extra stopping criterion: approx. pass is stopped if
		                           //    approx_limit_ratio * (increase of the lower bound during B) / (time of B)
		                           //                       < (increase of the lower bound during A) / (time of A)
		                           // where B corresponds to the last approx. pass and A corresponds to the sequence of steps
		                           // from the beginning of the current iter (including the exact pass) until B

		int kernel_max; // >= 1. Could be helpful if 'd' is very large.
		                // During approximate passes each term is processed 'kernel_max' times.
		                // If >1 then a specialized implementation with kernels is used, where the inner products between different planes are cached.

		///////////////////////////////
		// cutting planes parameters //
		///////////////////////////////

		// If there are more than 'cp_max' planes then remove the plane that has been inactive the longest time.
		// (A plane is active when it is added or when it is returned by the approximate oracle.)
		// Also after each approximate oracle call remove a plane if it hasn't been active during the last 'cp_inactive_iter_max' outer iterations (including the current one)
		int cp_max; // >= 0.
		int cp_inactive_iter_max;  // if == 0 then this option is not used (so 0 corresponds to +\infty)

		///////////////////////////////////////////////
		// stopping criteria and printed information //
		///////////////////////////////////////////////

		// if callback_fn != NULL then this function will be called after every 'callback_freq' iterations (=callback_freq*n calls to max_fn).
		// If this function returns false then Solve() will terminate.
		// The default function checks the duality gap and prints all bounds.
		bool (*callback_fn)(SVM* svm);

		int callback_freq;
		double gap_threshold; // default_callback_fn() requests termination once the duality gap drops below this value
		int print_flag; // 0: don't print anything
		                // 1: print bounds and gap
		                // 2: print bounds and gap + average # of planes per term + # approx passes in the last outer iter
		bool exclude_callback_time; // when printing time in default_callback_fn().

		// Default callback: computes the current bounds, optionally prints one progress line
		// (controlled by print_flag) and returns false - i.e. asks Solve() to stop - as soon as
		// the duality gap is below gap_threshold; returns true otherwise.
		static bool default_callback_fn(SVM* svm)
		{
			double lower_bound, upper_bound;
			svm->GetBounds(lower_bound, upper_bound); // this is expensive! (calls real oracles)
			if (lower_bound < svm->lower_bound_last) lower_bound = svm->lower_bound_last; // can happen if averaging is used
			double dual_gap_bound = upper_bound - lower_bound;

			double t = svm->time_from_start;
			if (svm->options.exclude_callback_time) t -= svm->callback_time;

			// Everything, including the terminating newline, is printed only when
			// printing is enabled, so that print_flag==0 really prints nothing.
			if (svm->options.print_flag>0)
			{
				printf("%d, %.2f sec:   %f %f, gap %f", svm->iter, t, lower_bound, upper_bound, dual_gap_bound);
				if (svm->options.print_flag>1) printf("   %.1f cp, %d it.", (double)svm->total_plane_num / svm->n, svm->approx_pass);
				printf("\n");
			}

			if (dual_gap_bound < svm->options.gap_threshold) return false;
			return true;
		}
	} options;


//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
//                    implementation details - not part of the API                  //
//////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////

private:
	class Term // represents term H(w) = \max_y <a,[w 1]> and a 'current' linear lower bound
	{
	public:

		Term(int d, SVM* svm, Buffer* buf, bool maintain_products);
		~Term();

		int num; // number of planes 'y'
		double* current; // array of size d+1
		double** a; // a[t] points to an array of size d+1, 0<=t<num.
		float* last_accessed;  // timestamps, of size num

#define NOT_YET_COMPUTED (3e103) // some random number
		double** products; // products[t1][t2]=dot product of vectors a[t1] and a[t2], ignoring the last (d+1)-th coordinate (0<=t1,t2<num).
						   // valid only if maintain_products was true in the constructor. Computed on demand.

		SVM* svm;

		///////////////////////////////////////////////////////////////////////////////////////

		bool isDuplicate(double* a);
		int AddPlane(double* a, int cp_max); // if num>=cp_max then the plane with the lowest 'counter' will be deleted
		                                     // and the new plane 'a' will be inserted instead.
		                                     // returns id of the added plane
		                                     // NOTE(review): no 'counter' member is declared here; presumably the
		                                     // 'last_accessed' timestamps play that role - confirm in the implementation.
		void DeletePlane(int t); // plane 'num-1' is moved to position 't'.

		int Maximize(double* w); // returns id of the cutting plane 'a' that maximizes <[w 1],a>.

		void UpdateStats(int t); // increases 'counter' for 't' and decreases it for other planes, with parameter 'cp_history_size' (see implementation)
		                         // NOTE(review): neither 'counter' nor 'cp_history_size' appears in this header;
		                         // this comment may be out of date - confirm in the implementation.

		void RemoveUnusedPlanes();


		////////////////////////////////////
	private:
		int d, num_max;
		Buffer* buf;
		char* my_buf;

		void Allocate(int num_max_new, bool maintain_products);
	};

	int d, n, n0, group_size;
	double lambda_mu; // = lambda / mu
	double lambda_mu_inv; // = mu / lambda
	double mu;
	MaxFn max_fn0; // user-supplied oracle
	void* user_arg; // opaque user pointer, exposed through GetUserArg()
	Buffer buf;
	ReusableBuffer rbuf_SolveWithKernel;

	double* w; // of size d
	double* current_sum; // of size d+1
	Term** terms; // of size n

	float timestamp, timestamp_threshold;
	int total_plane_num;
	int iter, approx_pass, total_pass;
	double lower_bound_last;
	double time_start, time_from_start, callback_time;

	void max_fn(int i, double* a); // calls max_fn0 for current w
	bool zero_lower_bound;
	double* max_fn_buf; // of size n+1
	                    // NOTE(review): oracle outputs have size d+1 (see MaxFn) - confirm that n+1 is intended.

	void InitSolver();
	void AddCuttingPlane(int i, double* a);
	void SolveWithKernel(int i, int iter_max);
	void InterpolateBest(double* current_sum1, double* current_sum2, double* current_sum_best);
};



#endif
