QpBoxLinear.h
//===========================================================================
/*!
 *
 *
 * \brief Quadratic programming solver for linear SVM training without bias
 *
 *
 *
 * \author T. Glasmachers
 * \date -
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_QP_QPBOXLINEAR_H
#define SHARK_ALGORITHMS_QP_QPBOXLINEAR_H

#include <shark/Core/Timer.h>
#include <shark/Algorithms/QP/QuadraticProgram.h>
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/LinAlg/Base.h>
#include <cmath>
#include <iostream>


namespace shark {


// strategy constants
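// A short note on these tuning constants (as used below): CHANGE_RATE
// bounds how fast the per-variable preferences adapt in a single step,
// while PREF_MIN and PREF_MAX clamp the preferences to a fixed range.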
#define CHANGE_RATE 0.2
#define PREF_MIN 0.05
#define PREF_MAX 20.0


///
/// \brief Quadratic program solver for box-constrained problems with linear kernel
///
/// \par
/// The QpBoxLinear class is a decomposition-based solver for linear
/// support vector machine classifiers trained with the hinge loss.
/// Its optimization is largely based on the paper<br>
/// "A Dual Coordinate Descent Method for Large-scale Linear SVM"
/// by Hsieh, Chang, and Lin, ICML, 2008.
/// However, the present algorithm differs quite a bit, since it
/// learns variable preferences as a replacement for the missing
/// working set selection. At the same time, this method replaces
/// the shrinking heuristic.
///
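/// \par
/// A minimal usage sketch, not from the original sources (the dataset
/// `data` of type LabeledData<RealVector, unsigned int> and its input
/// dimension `dim` are assumed to be provided by the caller):
/// \code
/// QpBoxLinear<RealVector> solver(data, dim);
/// QpStoppingCondition stop;
/// stop.minAccuracy = 0.001;    // target precision for the KKT violation
/// QpSolutionProperties props;
/// RealVector w = solver.solve(1.0, stop, &props);    // C = 1.0
/// \endcode
///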
template <class InputT>
class QpBoxLinear
{
public:
	typedef LabeledData<InputT, unsigned int> DatasetType;
	typedef typename DatasetType::element_type ElementType;

	///
	/// \brief Constructor
	///
	/// \param dataset training data
	/// \param dim problem dimension
	///
	QpBoxLinear(const DatasetType& dataset, std::size_t dim)
	: m_data(dataset)
	, m_xSquared(dataset.numberOfElements())
	, m_dim(dim)
	{
		SHARK_ASSERT(dim > 0);

		// pre-compute squared norms
		for (std::size_t i=0; i<m_data.size(); i++)
		{
			ElementType x_i = m_data[i];
			m_xSquared(i) = inner_prod(x_i.input, x_i.input);
		}
	}

	///
	/// \brief Solve the SVM training problem.
	///
	/// \param C regularization constant of the SVM
	/// \param stop stopping condition(s)
	/// \param prop solution properties
	/// \param verbose if true, the solver prints status information and solution statistics
	///
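	/// \return the weight vector w of the trained machine
	///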
	RealVector solve(
			double C,
			QpStoppingCondition& stop,
			QpSolutionProperties* prop = NULL,
			bool verbose = false)
	{
		// sanity checks
		SHARK_ASSERT(C > 0.0);

		// measure training time
		Timer timer;
		timer.start();

		// prepare dimensions and vectors
		std::size_t ell = m_data.size();
		RealVector alpha(ell, 0.0);
		RealVector w(m_dim, 0.0);
		RealVector pref(ell, 1.0);    // measure of success of individual steps
		double prefsum = ell;         // normalization constant
		std::vector<std::size_t> schedule(ell);

		// prepare counters
		std::size_t epoch = 0;
		std::size_t steps = 0;

		// prepare performance monitoring for self-adaptation
		double max_violation = 0.0;
		const double gain_learning_rate = 1.0 / ell;
		double average_gain = 0.0;
		bool canstop = true;

		// outer optimization loop
		while (true)
		{
			// define schedule
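			// Each variable i is assigned a number of slots in the
			// schedule (roughly) proportional to its preference pref[i],
			// with the fractional part resolved at random; afterwards the
			// schedule is shuffled. This replaces explicit working set
			// selection: promising coordinates are simply visited more often.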
			double psum = prefsum;
			prefsum = 0.0;
			std::size_t pos = 0;
			for (std::size_t i=0; i<ell; i++)
			{
				double p = pref[i];
				double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
				std::size_t n = (std::size_t)std::floor(num);
				double prob = num - n;
				if (Rng::uni() < prob) n++;
				for (std::size_t j=0; j<n; j++)
				{
					schedule[pos] = i;
					pos++;
				}
				psum -= p;
				prefsum += p;
			}
			SHARK_ASSERT(pos == ell);
			for (std::size_t i=0; i<ell; i++) std::swap(schedule[i], schedule[Rng::discrete(0, ell - 1)]);

			// inner loop
			max_violation = 0.0;
			for (std::size_t j=0; j<ell; j++)
			{
				// active variable
				std::size_t i = schedule[j];
				ElementType e_i = m_data[i];
				double y_i = (e_i.label > 0) ? +1.0 : -1.0;

				// compute gradient and projected gradient
				double a = alpha(i);
				double wyx = y_i * inner_prod(w, e_i.input);
				double g = 1.0 - wyx;
				double pg = (a == 0.0 && g < 0.0) ? 0.0 : (a == C && g > 0.0 ? 0.0 : g);

				// update maximal KKT violation over the epoch
				max_violation = std::max(max_violation, std::abs(pg));
				double gain = 0.0;

				// perform the step
				if (pg != 0.0)
				{
					// SMO-style coordinate descent step
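					// The one-dimensional dual sub-problem in the step mu is
					//     maximize  g*mu - (q/2)*mu^2   s.t.  0 <= a+mu <= C,
					// whose unconstrained optimum is mu = g/q; the clipping
					// to the box follows below.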
					double q = m_xSquared(i);
					double mu = g / q;
					double new_a = a + mu;

					// numerically stable update
					if (new_a <= 0.0)
					{
						mu = -a;
						new_a = 0.0;
					}
					else if (new_a >= C)
					{
						mu = C - a;
						new_a = C;
					}

					// update both representations of the weight vector: alpha and w
					alpha(i) = new_a;
					w += (mu * y_i) * e_i.input;
					gain = mu * (g - 0.5 * q * mu);

					steps++;
				}

				// update gain-based preferences
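				// Steps with above-average gain raise the variable's
				// preference, below-average steps lower it; the first epoch
				// only bootstraps the running average.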
				{
					if (epoch == 0) average_gain += gain / (double)ell;
					else
					{
						double change = CHANGE_RATE * (gain / average_gain - 1.0);
						double newpref = std::min(PREF_MAX, std::max(PREF_MIN, pref(i) * std::exp(change)));
						prefsum += newpref - pref(i);
						pref[i] = newpref;
						average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
					}
				}
			}

			epoch++;

			// stopping criteria
			if (stop.maxIterations > 0 && ell * epoch >= stop.maxIterations)
			{
				if (prop != NULL) prop->type = QpMaxIterationsReached;
				break;
			}

			if (timer.stop() >= stop.maxSeconds)
			{
				if (prop != NULL) prop->type = QpTimeout;
				break;
			}

			if (max_violation < stop.minAccuracy)
			{
				if (verbose) std::cout << "#" << std::flush;
				if (canstop)
				{
					if (prop != NULL) prop->type = QpAccuracyReached;
					break;
				}
				else
				{
					// prepare a full sweep for a reliable check of the stopping criterion
					canstop = true;
					for (std::size_t i=0; i<ell; i++) pref[i] = 1.0;
					prefsum = ell;
				}
			}
			else
			{
				if (verbose) std::cout << "." << std::flush;
				canstop = false;
			}
		}

		timer.stop();

		// compute solution statistics
		std::size_t free_SV = 0;
		std::size_t bounded_SV = 0;
		double objective = -0.5 * shark::blas::inner_prod(w, w);
		for (std::size_t i=0; i<ell; i++)
		{
			double a = alpha(i);
			objective += a;
			if (a > 0.0)
			{
				if (a == C) bounded_SV++;
				else free_SV++;
			}
		}

		// return solution statistics
		if (prop != NULL)
		{
			prop->accuracy = max_violation;    // this is approximate, but a good guess
			prop->iterations = ell * epoch;
			prop->value = objective;
			prop->seconds = timer.lastLap();
		}

		// output solution statistics
		if (verbose)
		{
			std::cout << std::endl;
			std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
			std::cout << "number of epochs: " << epoch << std::endl;
			std::cout << "number of iterations: " << (ell * epoch) << std::endl;
			std::cout << "number of non-zero steps: " << steps << std::endl;
			std::cout << "dual accuracy: " << max_violation << std::endl;
			std::cout << "dual objective value: " << objective << std::endl;
			std::cout << "number of free support vectors: " << free_SV << std::endl;
			std::cout << "number of bounded support vectors: " << bounded_SV << std::endl;
		}

		// return the solution
		return w;
	}

protected:
	DataView<const DatasetType> m_data;   ///< view on the training data
	RealVector m_xSquared;                ///< diagonal entries of the quadratic matrix
	std::size_t m_dim;                    ///< input space dimension
};


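///
/// \brief Specialization of QpBoxLinear for sparse inputs.
///
/// \par
/// This specialization stores the training points in a compact,
/// sentinel-terminated sparse format, since iterating over ublas
/// compressed vectors in the inner loop is too slow. The solver logic
/// itself mirrors the generic template above.
///
/// \par
/// A hedged usage sketch (the sparse dataset `data` and its input
/// dimension `dim` are assumptions, not part of this file):
/// \code
/// QpBoxLinear<CompressedRealVector> solver(data, dim);
/// QpStoppingCondition stop;
/// RealVector w = solver.solve(1.0, stop);
/// \endcode
///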
template < >
class QpBoxLinear<CompressedRealVector>
{
public:
	typedef LabeledData<CompressedRealVector, unsigned int> DatasetType;

	///
	/// \brief Constructor
	///
	/// \param dataset training data
	/// \param dim problem dimension
	///
	QpBoxLinear(const DatasetType& dataset, std::size_t dim)
	: x(dataset.numberOfElements())
	, y(dataset.numberOfElements())
	, diagonal(dataset.numberOfElements())
	, m_dim(dim)
	{
		SHARK_ASSERT(dim > 0);

		// transform ublas sparse vectors into a fast format
		// (yes, ublas is slow...), and compute the diagonal
		// elements of the quadratic matrix
		SparseVector sparse;
		for (std::size_t b=0, j=0; b<dataset.numberOfBatches(); b++)
		{
			DatasetType::const_batch_reference batch = dataset.batch(b);
			for (std::size_t i=0; i<batch.size(); i++)
			{
				CompressedRealVector x_i = shark::get(batch, i).input;
				if (x_i.nnz() == 0) continue;

				unsigned int y_i = shark::get(batch, i).label;
				y[j] = 2.0 * y_i - 1.0;
				double d = 0.0;
				for (CompressedRealVector::const_iterator it=x_i.begin(); it != x_i.end(); ++it)
				{
					double v = *it;
					sparse.index = it.index();
					sparse.value = v;
					storage.push_back(sparse);
					d += v * v;
				}
				sparse.index = (std::size_t)-1;
				storage.push_back(sparse);
				diagonal(j) = d;
				j++;
			}
		}
		for (std::size_t i=0, j=0, k=0; i<x.size(); i++)
		{
			CompressedRealVector x_i = dataset.element(i).input;
			if (x_i.nnz() == 0) continue;

			x[j] = &storage[k];
			for (; storage[k].index != (std::size_t)-1; k++);
			k++;
			j++;
		}
	}

	///
	/// \brief Solve the SVM training problem.
	///
	/// \param C regularization constant of the SVM
	/// \param stop stopping condition(s)
	/// \param prop solution properties
	/// \param verbose if true, the solver prints status information and solution statistics
	///
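	/// \return the weight vector w of the trained machine
	///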
	RealVector solve(
			double C,
			QpStoppingCondition& stop,
			QpSolutionProperties* prop = NULL,
			bool verbose = false)
	{
		// sanity checks
		SHARK_ASSERT(C > 0.0);

		// measure training time
		Timer timer;
		timer.start();

		// prepare dimensions and vectors
		std::size_t ell = x.size();
		RealVector alpha(ell, 0.0);
		RealVector w(m_dim, 0.0);
		RealVector pref(ell, 1.0);    // measure of success of individual steps
		double prefsum = ell;         // normalization constant
		std::vector<std::size_t> schedule(ell);

		// prepare counters
		std::size_t epoch = 0;
		std::size_t steps = 0;

		// prepare performance monitoring for self-adaptation
		double max_violation = 0.0;
		const double gain_learning_rate = 1.0 / ell;
		double average_gain = 0.0;
		bool canstop = true;

		// outer optimization loop
		while (true)
		{
			// define schedule (see the generic template above for details)
			double psum = prefsum;
			prefsum = 0.0;
			std::size_t pos = 0;
			for (std::size_t i=0; i<ell; i++)
			{
				double p = pref[i];
				double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
				std::size_t n = (std::size_t)std::floor(num);
				double prob = num - n;
				if (Rng::uni() < prob) n++;
				for (std::size_t j=0; j<n; j++)
				{
					schedule[pos] = i;
					pos++;
				}
				psum -= p;
				prefsum += p;
			}
			SHARK_ASSERT(pos == ell);
			for (std::size_t i=0; i<ell; i++) std::swap(schedule[i], schedule[Rng::discrete(0, ell - 1)]);

			// inner loop
			max_violation = 0.0;
			for (std::size_t j=0; j<ell; j++)
			{
				// active variable
				std::size_t i = schedule[j];
				const SparseVector* x_i = x[i];

				// compute gradient and projected gradient
				double a = alpha(i);
				double wyx = y(i) * inner_prod(w, x_i);
				double g = 1.0 - wyx;
				double pg = (a == 0.0 && g < 0.0) ? 0.0 : (a == C && g > 0.0 ? 0.0 : g);

				// update maximal KKT violation over the epoch
				max_violation = std::max(max_violation, std::abs(pg));
				double gain = 0.0;

				// perform the step
				if (pg != 0.0)
				{
					// SMO-style coordinate descent step
					double q = diagonal(i);
					double mu = g / q;
					double new_a = a + mu;

					// numerically stable update
					if (new_a <= 0.0)
					{
						mu = -a;
						new_a = 0.0;
					}
					else if (new_a >= C)
					{
						mu = C - a;
						new_a = C;
					}

					// update both representations of the weight vector: alpha and w
					alpha(i) = new_a;
					// w += (mu * y(i)) * x_i;
					axpy(w, mu * y(i), x_i);
					gain = mu * (g - 0.5 * q * mu);

					steps++;
				}

				// update gain-based preferences
				{
					if (epoch == 0) average_gain += gain / (double)ell;
					else
					{
						double change = CHANGE_RATE * (gain / average_gain - 1.0);
						double newpref = std::min(PREF_MAX, std::max(PREF_MIN, pref(i) * std::exp(change)));
						prefsum += newpref - pref(i);
						pref[i] = newpref;
						average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
					}
				}
			}

			epoch++;

			// stopping criteria
			if (stop.maxIterations > 0 && ell * epoch >= stop.maxIterations)
			{
				if (prop != NULL) prop->type = QpMaxIterationsReached;
				break;
			}

			if (timer.stop() >= stop.maxSeconds)
			{
				if (prop != NULL) prop->type = QpTimeout;
				break;
			}

			if (max_violation < stop.minAccuracy)
			{
				if (verbose) std::cout << "#" << std::flush;
				if (canstop)
				{
					if (prop != NULL) prop->type = QpAccuracyReached;
					break;
				}
				else
				{
					// prepare a full sweep for a reliable check of the stopping criterion
					canstop = true;
					for (std::size_t i=0; i<ell; i++) pref[i] = 1.0;
					prefsum = ell;
				}
			}
			else
			{
				if (verbose) std::cout << "." << std::flush;
				canstop = false;
			}
		}

		timer.stop();

		// compute solution statistics
		std::size_t free_SV = 0;
		std::size_t bounded_SV = 0;
		double objective = -0.5 * shark::blas::inner_prod(w, w);
		for (std::size_t i=0; i<ell; i++)
		{
			double a = alpha(i);
			objective += a;
			if (a > 0.0)
			{
				if (a == C) bounded_SV++;
				else free_SV++;
			}
		}

		// return solution statistics
		if (prop != NULL)
		{
			prop->accuracy = max_violation;    // this is approximate, but a good guess
			prop->iterations = ell * epoch;
			prop->value = objective;
			prop->seconds = timer.lastLap();
		}

		// output solution statistics
		if (verbose)
		{
			std::cout << std::endl;
			std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
			std::cout << "number of epochs: " << epoch << std::endl;
			std::cout << "number of iterations: " << (ell * epoch) << std::endl;
			std::cout << "number of non-zero steps: " << steps << std::endl;
			std::cout << "dual accuracy: " << max_violation << std::endl;
			std::cout << "dual objective value: " << objective << std::endl;
			std::cout << "number of free support vectors: " << free_SV << std::endl;
			std::cout << "number of bounded support vectors: " << bounded_SV << std::endl;
		}

		// return the solution
		return w;
	}

protected:
	/// \brief Data structure for sparse vectors.
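	///
	/// Each vector is stored as a contiguous run of (index, value)
	/// pairs terminated by a sentinel entry with
	/// index == (std::size_t)-1.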
	struct SparseVector
	{
		std::size_t index;
		double value;
	};
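	// For illustration: two vectors v0 = (index 2: 1.5) and
	// v1 = (index 0: 2.0, index 7: -1.0) are laid out in `storage` as
	//     {2,1.5} {END} {0,2.0} {7,-1.0} {END}
	// with x[0] = &storage[0] and x[1] = &storage[2], where END denotes
	// the sentinel index (std::size_t)-1.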

	/// \brief Famous "axpy" product, here adding a multiple of a sparse vector to a dense one.
	static inline void axpy(RealVector& w, double alpha, const SparseVector* xi)
	{
		while (true)
		{
			if (xi->index == (std::size_t)-1) return;
			w[xi->index] += alpha * xi->value;
			xi++;
		}
	}

	/// \brief Inner product between a dense and a sparse vector.
	static inline double inner_prod(RealVector const& w, const SparseVector* xi)
	{
		double ret = 0.0;
		while (true)
		{
			if (xi->index == (std::size_t)-1) return ret;
			ret += w[xi->index] * xi->value;
			xi++;
		}
	}

	std::vector<SparseVector> storage;    ///< storage for sparse vectors
	std::vector<SparseVector*> x;         ///< sparse vectors
	RealVector y;                         ///< +1/-1 labels
	RealVector diagonal;                  ///< diagonal entries of the quadratic matrix
	std::size_t m_dim;                    ///< input space dimension
};


}
#endif