### 2002

Herbrich, Ralf; Graepel, Thore

A PAC-Bayesian Margin Bound for Linear Classifiers Journal Article

In: IEEE Transactions on Information Theory, vol. 48, no. 12, pp. 3140–3150, 2002.

@article{herbrich2002marginbound,

title = {A PAC-Bayesian Margin Bound for Linear Classifiers},

author = {Ralf Herbrich and Thore Graepel},

url = {https://www.herbrich.me/papers/ieee-pacbayes.pdf},

year = {2002},

date = {2002-01-01},

journal = {IEEE Transactions on Information Theory},

volume = {48},

number = {12},

pages = {3140–3150},

abstract = {We present a bound on the generalisation error of linear classifiers in terms of a refined margin quantity on the training sample. The result is obtained in a PAC-Bayesian framework and is based on geometrical arguments in the space of linear classifiers. The new bound constitutes an exponential improvement of the so far tightest margin bound, which was developed in the luckiness framework, and scales logarithmically in the inverse margin. Even in the case of less training examples than input dimensions sufficiently large margins lead to non-trivial bound values and â€'' for maximum margins â€'' to a vanishing complexity term. In contrast to previous results, however, the new bound does depend on the dimensionality of feature space. The analysis shows that the classical margin is too coarse a measure for the essential quantity that controls the generalisation error: the fraction of hypothesis space consistent with the training sample. The practical relevance of the result lies in the fact that the well-known support vector machine is optimal with respect to the new bound only if the feature vectors in the training sample are all of the same length. As a consequence we recommend to use SVMs on normalised feature vectors only. Numerical simulations support this recommendation and demonstrate that the new error bound can be used for the purpose of model selection.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Herbrich, Ralf; Williamson, Robert C

Learning and Generalization: Theoretical Bounds Book Section

In: Handbook of Brain Theory and Neural Networks, pp. 619–623, The MIT Press, 2002.

@incollection{herbrich2002learning,

title = {Learning and Generalization: Theoretical Bounds},

author = {Ralf Herbrich and Robert C Williamson},

url = {https://www.herbrich.me/papers/HerWil02.pdf},

year = {2002},

date = {2002-01-01},

booktitle = {Handbook of Brain Theory and Neural Networks},

pages = {619–623},

publisher = {The MIT Press},

edition = {2nd edition},

keywords = {},

pubstate = {published},

tppubtype = {incollection}

}

Herbrich, Ralf; Williamson, Robert C

Algorithmic Luckiness Journal Article

In: Journal of Machine Learning Research, vol. 3, pp. 175–212, 2002.

@article{herbrich2002algorithmicluckiness,

title = {Algorithmic Luckiness},

author = {Ralf Herbrich and Robert C Williamson},

url = {https://www.herbrich.me/papers/algoluck.pdf},

year = {2002},

date = {2002-01-01},

journal = {Journal of Machine Learning Research},

volume = {3},

pages = {175–212},

abstract = {Classical statistical learning theory studies the generalisation performance of machine learning algorithms rather indirectly. One of the main detours is that algorithms are studied in terms of the hypothesis class that they draw their hypotheses from. In this paper, motivated by the luckiness framework of Shawe-Taylor et al. (1998), we study learning algorithms more directly and in a way that allows us to exploit the serendipity of the training sample. The main difference to previous approaches lies in the complexity measure; rather than covering all hypotheses in a given hypothesis space it is only necessary to cover the functions which could have been learned using the fixed learning algorithm. We show how the resulting framework relates to the VC, luckiness and compression frameworks. Finally, we present an application of this framework to the maximum margin algorithm for linear classifiers which results in a bound that exploits the margin, the sparsity of the resultant weight vector, and the degree of clustering of the training data in feature space.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Hill, Simon; Zaragoza, Hugo; Herbrich, Ralf; Rayner, Peter

Average Precision and the Problem of Generalisation Proceedings Article

In: Proceedings of the ACM SIGIR Workshop on Mathematical and Formal Methods in Information Retrieval, 2002.

@inproceedings{hill2002ap,

title = {Average Precision and the Problem of Generalisation},

author = {Simon Hill and Hugo Zaragoza and Ralf Herbrich and Peter Rayner},

url = {https://www.herbrich.me/papers/hill.pdf},

year = {2002},

date = {2002-01-01},

booktitle = {Proceedings of the ACM SIGIR Workshop on Mathematical and Formal Methods in Information Retrieval},

abstract = {In this paper we study the problem of generalisation in information retrieval. In particular we study precision-recall curves and the average precision value. We provide two types of bounds: large-deviation bounds of the average precision and maximum deviation bounds with respect to a given point of the precision recall curve. The first type of bounds are useful to answer the question: how far can true average precision be from the value observed on a test collection? The second is useful for obtaining bounds on average precision when tight bounds on a particular point of the curve can be established, as is the case when training SVMs or Perceptrons for document categorisation.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Lawrence, Neil; Seeger, Matthias; Herbrich, Ralf

Fast Sparse Gaussian Process Methods: The Informative Vector Machine Proceedings Article

In: Advances in Neural Information Processing Systems 15, pp. 609–616, 2002.

@inproceedings{lawrence2002ivm,

title = {Fast Sparse Gaussian Process Methods: The Informative Vector Machine},

author = {Neil Lawrence and Matthias Seeger and Ralf Herbrich},

url = {https://www.herbrich.me/papers/ivm.pdf},

year = {2002},

date = {2002-01-01},

booktitle = {Advances in Neural Information Processing Systems 15},

pages = {609–616},

abstract = {We present a framework for sparse Gaussian process (GP) methods which uses forward selection with criteria based on information-theoretical principles, previously suggested for active learning. In contrast to most previous work on sparse GPs, our goal is not only to learn sparse predictors (which can be evaluated in $O(d)$ rather than $O(n)$, $d łl n$, $n$ the number of training points), but also to perform training under strong restrictions on time and memory requirements. The scaling of our method is at most $O(n cdot d^2)$, and in large real-world classification experiments we show that it can match prediction performance of the popular support vector machine (SVM), yet it requires only a fraction of the training time. In contrast to the SVM, our approximation produces estimates of predictive probabilities ("error bars"), allows for Bayesian model selection and is less complex in implementation.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Li, Yaoyong; Zaragoza, Hugo; Herbrich, Ralf; Shawe-Taylor, John; Kandola, Jasvinder

The Perceptron Algorithm with Uneven Margins Proceedings Article

In: Proceedings of the 19th International Conference of Machine Learning, pp. 379–386, 2002.

@inproceedings{li2002unevenmargins,

title = {The Perceptron Algorithm with Uneven Margins},

author = {Yaoyong Li and Hugo Zaragoza and Ralf Herbrich and John Shawe-Taylor and Jasvinder Kandola},

url = {https://www.herbrich.me/papers/paum.pdf},

year = {2002},

date = {2002-01-01},

booktitle = {Proceedings of the 19th International Conference of Machine Learning},

pages = {379–386},

abstract = {The perceptron algorithm with margins is a simple, fast and effective learning algorithm for linear classifiers; it produces decision hyperplanes within some constant ratio of the maximal margin. In this paper we study this algorithm and a new variant: the perceptron algorithm with uneven margins, tailored for document categorisation problems (i.e. problems where classes are highly unbalanced and performance depends on the ranking of patterns). We discuss the interest of these algorithms from a theoretical point of view, provide a generalisation of Novikoff's theorem for uneven margins, give a geometrically description of these algorithms and show experimentally that both algorithms yield equal or better performances than support vector machines, while reducing training time and sparsity, in classification (USPS) and document categorisation (Reuters) problems.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Robertson, Stephen E.; Walker, Stephen; Zaragoza, Hugo; Herbrich, Ralf

Microsoft Cambridge at TREC 2002: Filtering Track Proceedings Article

In: Proceedings of the 9th Text Retrieval Conference (TREC-9), pp. 361–368, 2002.

@inproceedings{robertson2002trec,

title = {Microsoft Cambridge at TREC 2002: Filtering Track},

author = {Stephen E. Robertson and Stephen Walker and Hugo Zaragoza and Ralf Herbrich},

url = {https://www.herbrich.me/papers/trec2002.pdf},

year = {2002},

date = {2002-01-01},

booktitle = {Proceedings of the 9th Text Retrieval Conference (TREC-9)},

pages = {361–368},

abstract = {Six runs were submitted for the Adaptive Filtering track, four on the adaptive filtering task (ok11af??), and two on the routing task (msPUM?). The adaptive filtering system has been somewhat modified from the one used for TREC-10, largely for efficiently and flexibility reasons; the basic filtering algorithms remain similar to those used in recent TRECs. For the routing task, a completely new system based on perceptrons with uneven margins was used.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

### 2001

Graepel, Thore; Goutrié, Mike; Krüger, Marco; Herbrich, Ralf

Learning on Graphs in the Game of Go Proceedings Article

In: Proceedings of the International Conference on Artifical Neural Networks, pp. 347–352, 2001.

@inproceedings{graepel2001go,

title = {Learning on Graphs in the Game of Go},

author = {Thore Graepel and Mike Goutrié and Marco Krüger and Ralf Herbrich},

url = {https://www.herbrich.me/papers/go.pdf},

year = {2001},

date = {2001-01-01},

booktitle = {Proceedings of the International Conference on Artifical Neural Networks},

pages = {347–352},

abstract = {We consider the game of Go from the point of view of machine learning and as a well-defined domain for learning on graph representations. We discuss the representation of both board positions and candidate moves and introduce the common fate graph (CFG) as an adequate representation of board positions for learning. Single candidate moves are represented as feature vectors with features given by subgraphs relative to the given move in the CFG. Using this representation we train a support vector machine (SVM) and a kernel perceptron to discriminate good moves from bad moves on a collection of life-and-death problems and on 9x9 game records. We thus obtain kernel machines that solve Go problems and play 9x9 Go.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Gretton, Arthur; Doucet, Arnaud; Herbrich, Ralf; Rayner, Peter; Schölkopf, Bernhard

Support Vector Regression for Black-Box System Identification Proceedings Article

In: Proceedings of the 11th IEEE Workshop on Statistical Signal Processing, pp. 341–344, 2001.

@inproceedings{gretton2001svr,

title = {Support Vector Regression for Black-Box System Identification},

author = {Arthur Gretton and Arnaud Doucet and Ralf Herbrich and Peter Rayner and Bernhard Schölkopf},

year = {2001},

date = {2001-01-01},

booktitle = {Proceedings of the 11th IEEE Workshop on Statistical Signal Processing},

pages = {341–344},

abstract = {In this paper, we demonstrate the use of support vector regression (SVR) techniques for black-box system identification. There methods derive from statistical learning theory, and are of great theoretical and practical interest. We briefly describe the theory underpinning SVR, and compare support vector methods with other approaches using radial basis networks. Finally, we apply SVR to modeling the behaviour of a hydralic robot arm, and show that SVR improves on previously published results.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Herbrich, Ralf; Graepel, Thore; Campbell, Colin

Bayes Point Machines Journal Article

In: Journal of Machine Learning Research, vol. 1, pp. 245–279, 2001.

@article{herbrich2001bpm,

title = {Bayes Point Machines},

author = {Ralf Herbrich and Thore Graepel and Colin Campbell},

url = {https://www.herbrich.me/papers/bpm.pdf},

year = {2001},

date = {2001-01-01},

journal = {Journal of Machine Learning Research},

volume = {1},

pages = {245–279},

abstract = {Kernel-classifiers comprise a powerful class of non-linear decision functions for binary classification. The support vector machine is an example of a learning algorithm for kernel classifiers that singles out the consistent classifier with the largest margin, i.e. minimal real-valued output on the training sample, within the set of consistent hypotheses, the so-called version space. We suggest the Bayes point machine as a well-founded improvement which approximates the Bayes-optimal decision by the centre of mass of version space. We present two algorithms to stochastically approximate the centre of mass of version space: a billiard sampling algorithm and a sampling algorithm based on the well known perceptron algorithm. It is shown how both algorithms can be extended to allow for soft-boundaries in order to admit training errors. Experimentally, we find that — for the zero training error case — Bayes point machines consistently outperform support vector machines on both surrogate data and real-world benchmark data sets. In the soft-boundary/soft-margin case, the improvement over support vector machines is shown to be reduced. Finally, we demonstrate that the real-valued output of single Bayes points on novel test points is a valid confidence measure and leads to a steady decrease in generalisation error when used as a rejection criterion.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Herbrich, Ralf; Williamson, Robert C

Algorithmic Luckiness Proceedings Article

In: Advances in Neural Information Processing Systems 14, pp. 391–397, 2001.

@inproceedings{herbrich2001algoluck,

title = {Algorithmic Luckiness},

author = {Ralf Herbrich and Robert C Williamson},

url = {https://www.herbrich.me/papers/nips01_algoluck.pdf},

year = {2001},

date = {2001-01-01},

booktitle = {Advances in Neural Information Processing Systems 14},

pages = {391–397},

abstract = {In contrast to standard statistical learning theory which studies uniform bounds on the expected error we present a framework that exploits the specific learning algorithm used. Motivated by the luckiness framework [Taylor et al., 1998] we are also able to exploit the serendipity of the training sample. The main difference to previous approaches lies in the complexity measure; rather than covering all hypotheses in a given hypothesis space it is only necessary to cover the functions which could have been learned using the fixed learning algorithm. We show how the resulting framework relates to the VC, luckiness and compression frameworks. Finally, we present an application of this framework to the maximum margin algorithm for linear classifiers which results in a bound that exploits both the margin and the distribution of the data in feature space.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Schölkopf, Bernhard; Herbrich, Ralf; Smola, Alexander

A Generalized Representer Theorem Proceedings Article

In: Proceedings of the Fourteenth Annual Conference on Computational Learning Theory, pp. 416–426, 2001.

@inproceedings{scholkopf2001representer,

title = {A Generalized Representer Theorem},

author = {Bernhard Schölkopf and Ralf Herbrich and Alexander Smola},

url = {https://www.herbrich.me/papers/colt2001.pdf},

year = {2001},

date = {2001-01-01},

booktitle = {Proceedings of the Fourteenth Annual Conference on Computational Learning Theory},

pages = {416–426},

abstract = {Wahba's classical representer theorem states that the solutions of certain risk minimization problems involving an empirical risk term and a quadratic regularizer can be written as expansions in terms of the training examples. We generalize the theorem to a larger class of regularizers and empirical risk terms, and give a self-contained proof utilizing the feature space associated with a kernel. The result shows that a wide range of problems have optimal solutions that live in the finite dimensional span of the training examples mapped into feature space, thus enabling us to carry out kernel algorithms independent of the (potentially infinite) dimensionality of the feature space.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

### 2000

Graepel, Thore; Herbrich, Ralf

The Kernel Gibbs Sampler Proceedings Article

In: Advances in Neural Information Processing Systems 13, pp. 514–520, The MIT Press, 2000.

@inproceedings{herbrich2000kernelgibbs,

title = {The Kernel Gibbs Sampler},

author = {Thore Graepel and Ralf Herbrich},

url = {https://www.herbrich.me/papers/kernel-gibbs-sampler.pdf},

year = {2000},

date = {2000-01-01},

booktitle = {Advances in Neural Information Processing Systems 13},

pages = {514–520},

publisher = {The MIT Press},

abstract = {We present an algorithm that samples the hypothesis space of kernel classifiers. Given a uniform prior over normalised weight vectors and a likelihood based on a model of label noise leads to a piecewise constant posterior that can be sampled by the kernel Gibbs sampler (KGS). The KGS is a Markov Chain Monte Carlo method that chooses a random direction in parameter space and samples from the resulting piecewise constant density along the line chosen. The KGS can be used as an analytical tool for the exploration of Bayesian transduction, Bayes point machines, active learning, and evidence-based model selection on small data sets that are contaminated with label noise. For a simple toy example we demonstrate experimentally how a Bayes point machine based on the KGS outperforms an SVM that is incapable of taking into account label noise.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Graepel, Thore; Herbrich, Ralf; Shawe-Taylor, John

Generalisation Error Bounds for Sparse Linear Classifiers Proceedings Article

In: Proceedings of the 13th Annual Conference on Computational Learning Theory, pp. 298–303, 2000.

@inproceedings{graepel2000sparse,

title = {Generalisation Error Bounds for Sparse Linear Classifiers},

author = {Thore Graepel and Ralf Herbrich and John Shawe-Taylor},

url = {https://www.herbrich.me/papers/colt00_sparse.pdf},

year = {2000},

date = {2000-01-01},

booktitle = {Proceedings of the 13th Annual Conference on Computational Learning Theory},

pages = {298–303},

abstract = {We provide small sample size bounds on the generalisation error of linear classiers that are sparse in their dual representation given by the expansion coecients of the weight vector in terms of the training data. These results theoretically justify algorithms like the Support Vector Machine, the Relevance Vector Machine and K-nearest-neighbour. The

bounds are a-posteriori bounds to be evaluated after learning when the attained level of sparsity is known. In a PAC-Bayesian style prior knowledge about the expected sparsity is incorporated into the bounds. The proofs avoid the use of double sample arguments by taking into account the sparsity that leaves unused training points for the evaluation of classiers. We furthermore give a PAC-Bayesian bound on the average generalisation error over subsets of parameter space that may pave the way combining sparsity in the expansion coefficients and margin in a single bound. Finally, reinterpreting a mistake bound for the classical perceptron algorithm due to Noviko we demonstrate that our new results put classifiers found by this algorithm on a firm theoretical basis.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

bounds are a-posteriori bounds to be evaluated after learning when the attained level of sparsity is known. In a PAC-Bayesian style prior knowledge about the expected sparsity is incorporated into the bounds. The proofs avoid the use of double sample arguments by taking into account the sparsity that leaves unused training points for the evaluation of classiers. We furthermore give a PAC-Bayesian bound on the average generalisation error over subsets of parameter space that may pave the way combining sparsity in the expansion coefficients and margin in a single bound. Finally, reinterpreting a mistake bound for the classical perceptron algorithm due to Noviko we demonstrate that our new results put classifiers found by this algorithm on a firm theoretical basis.

Graepel, Thore; Herbrich, Ralf; Williamson, Robert C

From Margin to Sparsity Proceedings Article

In: Advances in Neural Information Processing Systems 13, pp. 210–216, The MIT Press, 2000.

@inproceedings{graepel2000marginsparsity,

title = {From Margin to Sparsity},

author = {Thore Graepel and Ralf Herbrich and Robert C Williamson},

url = {https://www.herbrich.me/papers/perc.pdf},

year = {2000},

date = {2000-01-01},

booktitle = {Advances in Neural Information Processing Systems 13},

pages = {210–216},

publisher = {The MIT Press},

abstract = {We present an improvement of Novikoff's perceptron convergence theorem. Reinterpreting this mistake bound as a margin dependent sparsity guarantee allows us to give a PAC-style generalisation error bound for the classifier learned by the dual perceptron learning algorithm. The bound value crucially depends on the margin a support vector machine would achieve on the same data set using the same kernel. Ironically, the bound yields better guarantees than are currently available for the support vector solution itself.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Herbrich, Ralf

Learning Linear Classifiers - Theory and Algorithms PhD Thesis

Technical University of Berlin, 2000.

@phdthesis{herbrich2000phd,

title = {Learning Linear Classifiers - Theory and Algorithms},

author = {Ralf Herbrich},

year = {2000},

date = {2000-01-01},

school = {Technical University of Berlin},

keywords = {},

pubstate = {published},

tppubtype = {phdthesis}

}

Herbrich, Ralf; Graepel, Thore

Large Scale Bayes Point Machines Proceedings Article

In: Advances in Neural Information Processing Systems 13, pp. 528–534, The MIT Press, 2000.

@inproceedings{herbrich2000largebpm,

title = {Large Scale Bayes Point Machines},

author = {Ralf Herbrich and Thore Graepel},

url = {https://www.herbrich.me/papers/mnist.pdf},

year = {2000},

date = {2000-01-01},

booktitle = {Advances in Neural Information Processing Systems 13},

pages = {528–534},

publisher = {The MIT Press},

abstract = {The concept of averaging over classifiers is fundamental to the Bayesian analysis of learning. Based on this viewpoint, it has recently been demonstrated for linear classifiers that the centre of mass of version space (the set of all classifiers consistent with the training set) - also known as the Bayes point - exhibits excellent generalisation abilities. However, the billiard algorithm as presented in [Herbrich et al., 2000] is restricted to small sample size because it requires O(m*m) of memory and O(N*m*m) computational steps where m is the number of training patterns and N is the number of random draws from the posterior distribution. In this paper we present a method based on the simple perceptron learning algorithm which allows to overcome this algorithmic drawback. The method is algorithmically simple and is easily extended to the multi-class case. We present experimental results on the MNIST data set of handwritten digits which show that Bayes Point Machines are competitive with the current world champion, the support vector machine. In addition, the computational complexity of BPMs can be tuned by varying the number of samples from the posterior. Finally, rejecting test points on the basis of their (approximative) posterior probability leads to a rapid decrease in generalisation error, e.g. 0.1% generalisation error for a given rejection rate of 10%.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Herbrich, Ralf; Graepel, Thore

A PAC-Bayesian Margin Bound for Linear Classifiers: Why SVMs work Proceedings Article

In: Advances in Neural Information Processing Systems 13, pp. 224–230, 2000.

@inproceedings{herbrich2000pacbayesiansvm,

title = {A PAC-Bayesian Margin Bound for Linear Classifiers: Why SVMs work},

author = {Ralf Herbrich and Thore Graepel},

url = {https://www.herbrich.me/papers/pacbayes.pdf},

year = {2000},

date = {2000-01-01},

booktitle = {Advances in Neural Information Processing Systems 13},

pages = {224–230},

abstract = {We present a bound on the generalisation error of linear classifiers in terms of a refined margin quantity on the training set. The result is obtained in a PAC-Bayesian framework and is based on geometrical arguments in the space of linear classifiers. The new bound constitutes an exponential improvement of the so far tightest margin bound by Shawe-Taylor et al. and scales logarithmically in the inverse margin. Even in the case of less training examples than input dimensions sufficiently large margins lead to non-trivial bound values and - for maximum margins - to a vanishing complexity term. Furthermore, the classical margin is too coarse a measure for the essential quantity that controls the generalisation error: the volume ratio between the whole hypothesis space and the subset of consistent hypotheses. The practical relevance of the result lies in the fact that the well-known support vector machine is optimal w.r.t. the new bound only if the feature vectors are all of the same length. As a consequence we recommend to use SVMs on normalised feature vectors only - a recommendation that is well supported by our numerical experiments on two benchmark data sets.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Herbrich, Ralf; Graepel, Thore; Campbell, Colin

Robust Bayes Point Machines Proceedings Article

In: Proceedings of European Symposium on Artificial Neural Networks, pp. 49–54, 2000.

@inproceedings{herbrich2000robustbpm,

title = {Robust Bayes Point Machines},

author = {Ralf Herbrich and Thore Graepel and Colin Campbell},

url = {https://www.herbrich.me/papers/esann00.pdf},

year = {2000},

date = {2000-01-01},

booktitle = {Proceedings of European Symposium on Artificial Neural Networks},

pages = {49–54},

abstract = {Support Vector Machines choose the hypothesis corresponding to the centre of the largest hypersphere that can be inscribed in version space. If version space is elongated or irregularly shaped a potentially superior approach is take into account the whole of version space. We propose to construct the Bayes point which is approximated by the centre of mass. Our implementation of a Bayes Point Machine (BPM) uses an ergodic billiard to estimate this point in the kernel space. We show that BPMs outperform hard margin Support Vector Machines (SVMs) on real world datasets. We introduce a technique that allows the BPM to construct hypotheses with non-zero training error similar to soft margin SVMs with quadratic penelisation of the margin slacks. An experimental study reveals that with decreasing penelisation of training error the improvement of BPMs over SVMs decays, a finding that is explained by geometrical considerations.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Herbrich, Ralf; Graepel, Thore; Shawe-Taylor, John

Sparsity vs. Large Margins for Linear Classifiers Proceedings Article

In: Proceedings of the 13th1 Annual Conference on Computational Learning Theory, pp. 304–308, 2000.

@inproceedings{herbrich2000sparsitymarginlinear,

title = {Sparsity vs. Large Margins for Linear Classifiers},

author = {Ralf Herbrich and Thore Graepel and John Shawe-Taylor},

url = {https://www.herbrich.me/papers/colt00_sparsemargin.pdf},

year = {2000},

date = {2000-01-01},

booktitle = {Proceedings of the 13th1 Annual Conference on Computational Learning Theory},

pages = {304–308},

abstract = {We provide small sample size bounds on the generalisation error of linear classifiers that take advantage of large observed margins on the training set and sparsity in the data dependent expansion coefficients. It is already known from results in the luckiness framework that both criteria independently have a large impact on the generalisation error. Our new results show that they can be combined which theoretically justifies learning algorithms like the Support Vector Machine or the Relevance Vector Machine. In contrast to previous studies we avoid using the classical technique of symmetrisation by a ghost sample but directly using the sparsity for the estimation of the generalisation error. We demonstrate that our result leads to practical useful results even in case of small sample size if the training set witnesses our prior belief in sparsity and large margins.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

### 1999

Graepel, Thore; Herbrich, Ralf; Obermayer, Klaus

Bayesian Transduction Proceedings Article

In: Advances in Neural Information Processing Systems 12, pp. 456–462, The MIT Press, 1999.

@inproceedings{graepel1999bayesiantransduction,

title = {Bayesian Transduction},

author = {Thore Graepel and Ralf Herbrich and Klaus Obermayer},

url = {https://www.herbrich.me/papers/trans.pdf},

year = {1999},

date = {1999-01-01},

booktitle = {Advances in Neural Information Processing Systems 12},

pages = {456–462},

publisher = {The MIT Press},

abstract = {Transduction is an inference principle that takes a training sample and aims at estimating the values of a function at given points contained in the so-called working sample. Hence, transduction is a less ambitious task than induction which aims at inferring a functional dependency on the whole of input space. As a consequence, however, transduction provides a confidence measure on single predictions rather than classifiers, a feature particularly important for risk-sensitive applications. We consider the case of binary classification by linear discriminant functions (perceptrons) in kernel space. From the transductive point of view, the infinite number of perceptrons is boiled down to a finite number of equivalence classes on the working sample each of which corresponds to a polyhedron in parameter space. In the Bayesian spirit the posteriori probability of a labelling of the working sample is determined as the ratio between the volume of the corresponding polyhedron and the volume of version space. Then the maximum posteriori scheme recommends to choose the labelling of maximum volume. We suggest to sample version space by an ergodic billiard in kernel space. Experimental results on real world data indicate that Bayesian Transduction compares favourably to the well-known Support Vector Machine, in particular if the posteriori probability of labellings is used as a confidence measure to exclude test points of low confidence.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Graepel, Thore; Herbrich, Ralf; Schölkopf, Bernhard; Smola, Alex; Bartlett, Peter; Müller, Klaus Robert; Obermayer, Klaus; Williamson, Robert C

Classification on Proximity Data with LP-Machines Proceedings Article

In: Proceedings of the 9th International Conference on Artificial Neural Networks, pp. 304–309, 1999.

@inproceedings{graepel1999proximity,

title = {Classification on Proximity Data with LP-Machines},

author = {Thore Graepel and Ralf Herbrich and Bernhard Schölkopf and Alex Smola and Peter Bartlett and Klaus Robert Müller and Klaus Obermayer and Robert C Williamson},

url = {https://www.herbrich.me/papers/icann99_proxy.pdf},

year = {1999},

date = {1999-01-01},

booktitle = {Proceedings of the 9th International Conference on Artificial Neural Networks},

pages = {304–309},

abstract = {We provide a new linear program to deal with classification of data in the case of data given in terms of pairwise proximities. This allows to avoid the problems inherent in using feature spaces with indefinite metric in Support Vector Machines, since the notion of a margin is purely needed in input space where the classification actually occurs. Moreover in our approach we can enforce sparsity in the proximity representation by sacrificing training error. This turns out to be favorable for proximity data. Similar to nu-SV methods, the only parameter needed in the algorithm is the (asymptotical) number of data points being classified with a margin. Finally, the algorithm is successfully compared with nu-SV learning in proximity space and K-nearest-neighbors on real world data from Neuroscience and molecular biology.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Herbrich, Ralf; Graepel, Thore; Campbell, Colin

Bayes Point Machines : Estimating the Bayes Point in Kernel Space Proceedings Article

In: Proceedings of IJCAI Workshop Support Vector Machines, pp. 23–27, 1999.

@inproceedings{herbrich1999bpm,

title = {Bayes Point Machines : Estimating the Bayes Point in Kernel Space},

author = {Ralf Herbrich and Thore Graepel and Colin Campbell},

url = {https://www.herbrich.me/papers/ijcai99.pdf},

year = {1999},

date = {1999-01-01},

booktitle = {Proceedings of IJCAI Workshop Support Vector Machines},

pages = {23–27},

abstract = {From a Bayesian perspective Support Vector Machines choose the hypothesis corresponding to the largest possible hypersphere that can be inscribed in version space, i.e. in the space of all consistent hypotheses given a training set. Those boundaries of version space which are tangent to the hypersphere define the support vectors. An alternative and potentially better approach is to construct the hypothesis using the whole of version space. This is achieved by using a Bayes Point Machine which finds the midpoint of the region of intersection of all hyperplanes bisecting version space into two halves of equal volume (the Bayes point). It is known that the center of mass of version space approximates the Bayes point. We suggest estimating the center of mass by averaging over the trajectory of a billiard ball bouncing in version space. Experimental results are presented indicating that Bayes Point Machines consistently outperform Support Vector Machines.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Herbrich, Ralf; Graepel, Thore; Obermayer, Klaus

Support Vector Learning for Ordinal Regression Proceedings Article

In: Proceedings of the 9th International Conference on Artificial Neural Networks, pp. 97–102, 1999.

@inproceedings{herbrich1999ordinalregression,

title = {Support Vector Learning for Ordinal Regression},

author = {Ralf Herbrich and Thore Graepel and Klaus Obermayer},

url = {https://www.herbrich.me/papers/icann99_ordinal.pdf},

year = {1999},

date = {1999-01-01},

booktitle = {Proceedings of the 9th International Conference on Artificial Neural Networks},

pages = {97–102},

abstract = {We investigate the problem of predicting variables of ordinal scale. This task is referred to as ordinal regression and is complementary to the standard machine learning tasks of classification and metric regression. In contrast to statistical models we present a distribution independent formulation of the problem together with uniform bounds of the risk functional. The approach presented is based on a mapping from objects to scalar utility values. Similar to Support Vector methods we derive a new learning algorithm for the task of ordinal regression based on large margin rank boundaries. We give experimental results for an information retrieval task: learning the order of documents w.r.t. an initial query. Experimental results indicate that the presented algorithm outperforms more naive approaches to ordinal regression such as Support Vector classification and Support Vector regression in the case of more than two ranks.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Herbrich, Ralf; Graepel, Thore; Obermayer, Klause

Large Margin Rank Boundaries for Ordinal Regression Book Section

In: Advances in Large Margin Classifiers, pp. 115–132, The MIT Press, 1999.

@incollection{herbrich1999largemarginrank,

title = {Large Margin Rank Boundaries for Ordinal Regression},

author = {Ralf Herbrich and Thore Graepel and Klause Obermayer},

url = {https://www.herbrich.me/papers/nips98_ordinal.pdf},

year = {1999},

date = {1999-01-01},

booktitle = {Advances in Large Margin Classifiers},

pages = {115–132},

publisher = {The MIT Press},

chapter = {7},

abstract = {In contrast to the standard machine learning tasks of classification and metric regression we investigate the problem of predicting variables of ordinal scale, a setting referred to as ordinal regression. This problem arises frequently in the social sciences and in information retrieval where human preferences play a major role. Whilst approaches proposed in statistics rely on a probability model of a latent (unobserved) variable we present a distribution independent risk formulation of ordinal regression which allows us to derive a uniform convergence bound. Applying this bound we present a large margin algorithm that is based on a mapping from objects to scalar utility values thus classifying pairs of objects. We give experimental results for an information retrieval task which show that our algorithm outperforms more naive approaches to ordinal regression such as Support Vector Classification and Support Vector Regression in the case of more than two ranks.},

keywords = {},

pubstate = {published},

tppubtype = {incollection}

}

Herbrich, Ralf; Keilbach, Max; Bollmann-Sdorra, Peter; Obermayer, Klaus

Neural Networks in Economics : Background, Applications and New Developments Journal Article

In: Advances in Computational Economics, vol. 11, pp. 169–196, 1999.

@article{herbrich1999neuralnetworks,

title = {Neural Networks in Economics : Background, Applications and New Developments},

author = {Ralf Herbrich and Max Keilbach and Peter Bollmann-Sdorra and Klaus Obermayer},

url = {https://www.herbrich.me/papers/herkeilgraebollober99.pdf},

year = {1999},

date = {1999-01-01},

journal = {Advances in Computational Economics},

volume = {11},

pages = {169–196},

abstract = {Neural Networks were developed in the sixties as devices for classification and regression. The approach was originally inspired from Neuroscience. Its attractiveness lies in the ability to 'learn', i.e. to generalize to as yet unseen observations. One aim of this paper is to give an introduction to the technique of Neural Networks and an overview of the most popular architectures. We start from statistical learning theory to introduce the basics of learning. Then, we give an overview of the general principles of neural networks and of their use in the field of Economics. A second purpose is to introduce a recently developed Neural Network Learning technique, so called Support Vector Network Learning, which is an application of ideas from statistical learning theory. This approach has shown very promising results on problems with a limited amount of training examples. Moreover, utilizing a technique that is known as the kernel trick, Support Vector Networks can easily be adapted to nonlinear models. Finally, we present an economic application of this approach from the field of preference learning.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Herbrich, Ralf; Weston, Jason

Adaptive Margin Support Vector Machines for Classification Proceedings Article

In: Proceedings of the 9th International Conference on Artificial Neural Networks, pp. 97–102, 1999.

@inproceedings{herbrich1999ann,

title = {Adaptive Margin Support Vector Machines for Classification},

author = {Ralf Herbrich and Jason Weston},

url = {https://www.herbrich.me/papers/icann99_ann.pdf},

year = {1999},

date = {1999-01-01},

booktitle = {Proceedings of the 9th International Conference on Artificial Neural Networks},

pages = {97–102},

abstract = {In this paper we propose a new learning algorithm for classification learning based on the Support Vector Machine (SVM) approach. Existing approaches for constructing SVMs are based on minimization of a regularized margin loss where the margin is treated equivalently for each train- ing pattern. We propose a reformulation of the minimization problem such that adaptive margins for each training pattern are utilized, which we call the Adaptive Margin (AM)-SVM. We give bounds on the generalization error of AM-SVMs which justify their robustness against outliers, and show experimentally that the generalization error of AM-SVMs is comparable to classical SVMs on benchmark datasets from the UCI repository.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Weston, Jason; Herbrich, Ralf

Adaptive Margin Support Vector Machines Book Section

In: Advances in Large Margin Classifiers, pp. 281–296, The MIT Press, 1999.

@incollection{weston1999ann,

title = {Adaptive Margin Support Vector Machines},

author = {Jason Weston and Ralf Herbrich},

url = {https://www.herbrich.me/papers/nips98_ann.pdf},

year = {1999},

date = {1999-01-01},

booktitle = {Advances in Large Margin Classifiers},

pages = {281–296},

publisher = {The MIT Press},

chapter = {15},

abstract = {In this chapter we present a new learning algorithm, Leave-One-Out (LOO -) SVMs and its generalization Adaptive Margin (AM-) SVMs, inspired by a recent upper bound on the leave-one-out error proved for kernel classifiers by Jaakkola and Haussler. The new approach minimizes the expression given by the bound in an attempt to minimize the leave-one-out error. This gives a convex optimization problem which constructs a sparse linear classifier in feature space using the kernel technique. As such the algorithm possesses many of the same properties as SVMs and Linear Programming (LP-) SVMs. These former techniques are based on the minimization of a regularized margin loss, where the margin is treated equivalently for each training pattern. We propose a minimization problem such that adaptive margins for each training pattern are utilized. Furthermore, we give bounds on the generalization error of the approach which justifies its robustness against outliers. We show experimentally that the generalization error of AM-SVMs is comparable to SVMs and LP-SVMs on benchmark datasets from the UCI repository.},

keywords = {},

pubstate = {published},

tppubtype = {incollection}

}

### 1998

Graepel, Thore; Herbrich, Ralf; Bollmann-Sdorra, Peter; Obermayer, Klaus

Classification on Pairwise Proximity Data Proceedings Article

In: Advances in Neural Information Processing Systems 11, pp. 438–444, The MIT Press, 1998.

@inproceedings{graepel1998classificationpairwise,

title = {Classification on Pairwise Proximity Data},

author = {Thore Graepel and Ralf Herbrich and Peter Bollmann-Sdorra and Klaus Obermayer},

url = {https://www.herbrich.me/papers/graeherbollober99.pdf},

year = {1998},

date = {1998-01-01},

booktitle = {Advances in Neural Information Processing Systems 11},

pages = {438–444},

publisher = {The MIT Press},

abstract = {We investigate the problem of learning a classification task on data represented in terms of their pairwise proximities. This representation does not refer to an explicit feature representation of the data items and is thus more general than the standard approach of using Euclidean feature vectors, from which pairwise proximities can always be calculated. Our first approach is based on a combined linear embedding and classification procedure resulting in an extension of the Optimal Hyperplane algorithm to pseudo-Euclidean data. As an alternative we present another approach based on a linear threshold model in the proximity values themselves, which is optimized using Structural Risk Minimization. We show that prior knowledge about the problem can be incorporated by the choice of distance measures and examine different metrics w.r.t. their generalization. Finally, the algorithms are successfully applied to protein structure data and to data from the cat's cerebral cortex. They show better performance than K-nearest-neighbor classification.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Herbrich, Ralf; Graepel, Thore; Bollmann-Sdorra, Peter; Obermayer, Klaus

Learning Preference Relations for Information Retrieval Proceedings Article

In: Proceedings of the International Conference on Machine Learning Workshop: Text Categorization and Machine learning, pp. 80–84, 1998.

@inproceedings{herbrich1998preference,

title = {Learning Preference Relations for Information Retrieval},

author = {Ralf Herbrich and Thore Graepel and Peter Bollmann-Sdorra and Klaus Obermayer},

url = {https://www.herbrich.me/papers/hergraebollober98.pdf},

year = {1998},

date = {1998-01-01},

booktitle = {Proceedings of the International Conference on Machine Learning Workshop: Text Categorization and Machine learning},

pages = {80–84},

abstract = {In this paper we investigate the problem of learning a preference relation from a given set of ranked documents. We show that the Bayes's optimal decision function, when applied to learning a preference relation, may violate transitivity. This is undesirable for information retrieval, because it is in conflict with a document ranking based on the user's preferences. To overcome this problem we present a vector space based method that performs a linear mapping from documents to scalar utility values and thus guarantees transitivity. The learning of the relation between documents is formulated as a classification problem on pairs of documents and is solved using the principle of structural risk minimization for good generalization. The approach is extended to polynomial utility functions by using the potential function method (the so called "kernel trick"), which allows to incorporate higher order correlations of features into the utility function at minimal computational costs. The resulting algorithm is tested on an example with artificial data. The algorithm successfully learns the utility function underlying the training examples and shows good classification performance.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

### 1997

Herbrich, Ralf

Segmentierung mit Gaborfiltern zur Induktion struktureller Klassifikatoren auf Bilddaten Masters Thesis

Technical University Berlin, 1997.

@mastersthesis{herbrich1997master,

title = {Segmentierung mit Gaborfiltern zur Induktion struktureller Klassifikatoren auf Bilddaten},

author = {Ralf Herbrich},

url = {https://www.herbrich.me/papers/diplom.pdf},

year = {1997},

date = {1997-01-01},

school = {Technical University Berlin},

keywords = {},

pubstate = {published},

tppubtype = {mastersthesis}

}

Herbrich, Ralf; Scheffer, Tobias

Generation of Task-Specific Segmentation Procedures as a Model Selection Task Proceedings Article

In: Proceedings of Workshop on Visual Information Processing, pp. 11–21, 1997.

@inproceedings{herbrich1997segmentation,

title = {Generation of Task-Specific Segmentation Procedures as a Model Selection Task},

author = {Ralf Herbrich and Tobias Scheffer},

url = {https://www.herbrich.me/papers/herbrich97.pdf},

year = {1997},

date = {1997-01-01},

booktitle = {Proceedings of Workshop on Visual Information Processing},

pages = {11–21},

abstract = {In image segmentation problems, there is usually a vast amount of filter operations available, a subset of which has to be selected and instantiated in order to obtain a satisfactory segmentation procedure for a particular do- main. In supervised segmentation, a mapping from features, such as filter outputs for individual pixels, to classes is induced automatically. However, since the sample size required for supervised learning grows exponentially in the number of features it is not feasible to learn a segmentation procedure from a large amount of possible filters. But we argue that automatic model selection methods are able to select a region model in terms of some filters.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Scheffer, Tobias; Herbrich, Ralf

Unbiased Assesment of Learning Algorithms Proceedings Article

In: Proceedings of the International Joint Conference on Artificial Intelligence, pp. 798–803, 1997.

@inproceedings{scheffer1997assessment,

title = {Unbiased Assesment of Learning Algorithms},

author = {Tobias Scheffer and Ralf Herbrich},

url = {https://www.herbrich.me/papers/scheffer97.pdf},

year = {1997},

date = {1997-01-01},

booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence},

pages = {798–803},

abstract = {In order to rank the performance of machine learning algorithms, many researchs conduct experiments on benchmark datasets. Since most learning algorithms have domain-specific parameters, it is a popular custom to adapt these parameters to obtain a minimal error rate on the test set. The same rate is used to rank the algorithm which causes an optimistic bias. We quantify this bias, showing in particular that an algorithm with more parameters will probably be ranked higher than an equally good algorithm with fewer parameters. We demonstrate this result, showing the number of parameters and trials required in order to pretend to outperform C4.5 or FOIL, respectively, for various benchmark problems. We then describe how unbiased ranking experiments should be conducted.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

### 1996

Scheffer, Tobias; Herbrich, Ralf; Wysotzki, Fritz

Efficient Θ-Subsumption Based on Graph Algorithms Proceedings Article

In: Lecture Notes in Artifical Intelligence: 6th International Workshop on Inductive Logic Programming,, pp. 212–228, 1996.

@inproceedings{scheffer1996subsumption,

title = {Efficient Θ-Subsumption Based on Graph Algorithms},

author = {Tobias Scheffer and Ralf Herbrich and Fritz Wysotzki},

url = {https://www.herbrich.me/papers/scheffer96.pdf},

year = {1996},

date = {1996-01-01},

booktitle = {Lecture Notes in Artifical Intelligence: 6th International Workshop on Inductive Logic Programming,},

volume = {1314},

pages = {212–228},

abstract = {The θ-subsumption problem is crucial to the efficiency of ILP learning systems. We discuss two θ-subsumption algorithms based on strategies for preselecting suitable matching literals. The class of clauses, for which subsumption becomes polynomial, is a superset of the deterministic clauses. We further map the general problem of θ-subsumption to a certain problem of finding a clique of fixed size in a graph, and in return show that a specialization of the pruning strategy of the Carraghan and Pardalos clique algorithm provides a dramatic reduction of the subsumption search space. We also present empirical results for the mesh design data set.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}