2024
Alder, Nicolas; Herbrich, Ralf
Energy-Efficient Gaussian Processes Using Low-Precision Arithmetic Proceedings Article
In: Proceedings of the 41st International Conference on Machine Learning, 2024.
@inproceedings{AldHer2024a,
title = {Energy-Efficient Gaussian Processes Using Low-Precision Arithmetic},
author = {Nicolas Alder and Ralf Herbrich},
url = {https://www.herbrich.me/papers/low_precision_gp_cameraReady_final.pdf},
year = {2024},
date = {2024-01-01},
booktitle = {Proceedings of the 41st International Conference on Machine Learning},
abstract = {The widespread use of artificial intelligence requires finding energy-efficient paradigms for the field. We propose to reduce the energy consumption of Gaussian process regression using low-precision floating-point representations. We explore how low-precision representations impact the results of Gaussian process regression and how data set properties, implementation approach, model performance, and energy consumption interact. Our findings show that a well-conditioned kernel matrix allows reducing the energy consumption by up to 89.01% for 98.08% of arithmetic operations with little to no impact on model performance. Our findings are relevant whenever one needs to invert a symmetric full-rank matrix.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Mattes, Paul; Schlosser, Rainer; Herbrich, Ralf
Hieros: Hierarchical Imagination on Structured State Space Sequence World Models Proceedings Article
In: Proceedings of the 41st International Conference on Machine Learning, 2024.
@inproceedings{MatSchHer2024a,
title = {Hieros: Hierarchical Imagination on Structured State Space Sequence World Models},
author = {Paul Mattes and Rainer Schlosser and Ralf Herbrich},
url = {https://herbrich.me/papers/hieros_icml-8.pdf},
year = {2024},
date = {2024-01-01},
booktitle = {Proceedings of the 41st International Conference on Machine Learning},
abstract = {One of the biggest challenges to modern deep reinforcement learning (DRL) algorithms is sample efficiency. Many approaches learn a world model in order to train an agent entirely in imagination, eliminating the need for direct environ- ment interaction during training. However, these methods often suffer from either a lack of imagi- nation accuracy, exploration capabilities, or run- time efficiency. We propose HIEROS, a hierar- chical policy that learns time abstracted world representations and imagines trajectories at mul- tiple time scales in latent space. HIEROS uses an S5 layer-based world model, which predicts next world states in parallel during training and itera- tively during environment interaction. Due to the special properties of S5 layers, our method can train in parallel and predict next world states iter- atively during imagination. This allows for more efficient training than RNN-based world models and more efficient imagination than Transformer- based world models. We show that our approach outperforms the state of the art in terms of mean and median normalized human score on the Atari 100k benchmark, and that our proposed world model is able to predict complex dynamics very accurately. We also show that HIEROS displays superior exploration capabilities compared to existing approaches.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2022
Herbrich, Ralf
On Noisy Additions Unpublished
2022, (Draft Technical Report).
@unpublished{herbrich2022noisyadditions,
title = {On Noisy Additions},
author = {Ralf Herbrich},
url = {https://herbrich.me/papers/noisy_additions.pdf},
year = {2022},
date = {2022-07-01},
abstract = {So far, most of algorithmic development has relied on the concept of correct computing, in particular at the level of the basic compute unit: the arithmetic logical unit (ALU). With the rapid rise of machine learning (ML) and artificial intelligence (AI) systems, correct computing is no longer strictly required as almost all ML and AI algorithm are based on numerical approximations of (cost) minimization problems. On the other hand, energy-efficiency is becoming increasingly important. While current research into energy-efficient ML/AI focuses on em approximate representations of numbers (e.g., k-bit deep neural networks), we will explore the idea of approximate computing resulting out of under-powering the ALU of a processor. In this note, we will analytically derive the distribution over integers resulting from under-powering an ALU using message-passing to efficiently compute these distributions. One application of knowing this characterization is to use under-powered ALUs to directly implement probabilistic programs where the distribution parameters are directly controlled by the CMOS supply voltage.},
note = {Draft Technical Report},
keywords = {},
pubstate = {published},
tppubtype = {unpublished}
}
Kurle, Richard; Herbrich, Ralf; Januschowski, Tim; Wang, Yuyang; Gasthaus, Jan
On the detrimental effect of invariances in the likelihood for variational inference Proceedings Article
In: Advances in Neural Information Procesing Systems 36, 2022.
@inproceedings{kurle2022vbnn,
title = {On the detrimental effect of invariances in the likelihood for variational inference},
author = {Richard Kurle and Ralf Herbrich and Tim Januschowski and Yuyang Wang and Jan Gasthaus},
url = {https://arxiv.org/pdf/2209.07157.pdf},
year = {2022},
date = {2022-01-01},
booktitle = {Advances in Neural Information Procesing Systems 36},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2020
Herbrich, Ralf; Rastogi, Rajeev; Vollgraf, Roland
CRISP: A Probabilistic Model for Individual-Level COVID-19 Infection Risk Estimation Based on Contact Data Journal Article
In: arXiv:2006.04942, 2020.
@article{herbrich2020crisp,
title = {CRISP: A Probabilistic Model for Individual-Level COVID-19 Infection Risk Estimation Based on Contact Data},
author = {Ralf Herbrich and Rajeev Rastogi and Roland Vollgraf},
url = {https://arxiv.org/pdf/2006.04942.pdf},
year = {2020},
date = {2020-01-01},
journal = {arXiv:2006.04942},
abstract = {We present CRISP (COVID-19 Risk Score Prediction), a probabilistic graphical model for COVID-19 infection spread through a population based on the SEIR model where we assume access to (1) mutual contacts between pairs of individuals across time across various channels (e.g., Bluetooth contact traces), as well as (2) test outcomes at given times for infection, exposure and immunity tests. Our micro-level model keeps track of the infection state for each individual at every point in time, ranging from susceptible, exposed, infectious to recovered. We develop a Monte Carlo EM algorithm to infer contact-channel specific infection transmission probabilities. Our algorithm uses Gibbs sampling to draw samples of the latent infection status of each individual over the entire time period of analysis, given the latent infection status of all contacts and test outcome data. Experimental results with simulated data demonstrate our CRISP model can be parametrized by the reproduction factor R0 and exhibits population-level infectiousness and recovery time series similar to those of the classical SEIR model. However, due to the individual contact data, this model allows fine grained control and inference for a wide range of COVID-19 mitigation and suppression policy measures. Moreover, the algorithm is able to support efficient testing in a test-trace-isolate approach to contain COVID-19 infection spread. To the best of our knowledge, this is the first model with efficient inference for COVID-19 infection spread based on individual-level contact data; most epidemic models are macro-level models that reason over entire populations.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2014
He, Xinran; Pan, Junfeng; Jin, Ou; Xu, Tianbing; Liu, Bo; Xu, Tao; Shi, Yanxin; Atallah, Antoine; Herbrich, Ralf; Bowers, Stuart; Candela, Joaquin Quiñonero
Practical Lessons from Predicting Clicks on Ads at Facebook Proceedings Article
In: Proceedings of 20th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp. 1–9, ACM 2014.
@inproceedings{he2014advertising,
title = {Practical Lessons from Predicting Clicks on Ads at Facebook},
author = {Xinran He and Junfeng Pan and Ou Jin and Tianbing Xu and Bo Liu and Tao Xu and Yanxin Shi and Antoine Atallah and Ralf Herbrich and Stuart Bowers and Joaquin Quiñonero Candela},
url = {https://www.herbrich.me/papers/adclicksfacebook.pdf},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of 20th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
pages = {1–9},
organization = {ACM},
abstract = {Online advertising allows advertisers to only bid and pay for measurable user responses, such as clicks on ads. As a consequence, click prediction systems are central to most on-line advertising systems. With over 750 million daily active users and over 1 million active advertisers, predicting clicks on Facebook ads is a challenging machine learning task. In this paper we introduce a model which combines decision trees with logistic regression, outperforming either of these methods on its own by over 3%, an improvement with significant impact to the overall system performance. We then explore how a number of fundamental parameters impact the final prediction performance of our system. Not surprisingly, the most important thing is to have the right features: those capturing historical information about the user or ad dominate other types of features. Once we have the right features and the right model (decisions trees plus logistic regression), other factors play small roles (though even small improvements are important at scale). Picking the optimal handling for data freshness, learning rate schema and data sampling improve the model slightly, though much less than adding a high-value feature, or picking the right model to begin with.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2013
Chakrabarti, Deepayan; Herbrich, Ralf
Speeding Up Large-Scale Learning with a Social Prior Proceedings Article
In: Proceedings of the 19th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 650–658, ACM 2013.
@inproceedings{chakrabarti2013speeding,
title = {Speeding Up Large-Scale Learning with a Social Prior},
author = {Deepayan Chakrabarti and Ralf Herbrich},
url = {https://www.herbrich.me/papers/kdd13-socialprior.pdf},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the 19th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
pages = {650–658},
organization = {ACM},
abstract = {Slow convergence and poor initial accuracy are two problems that plague efforts to use very large feature sets in online learning. This is especially true when only a few features are active in any training example, and the frequency of activations of different features is skewed. We show how these problems can be mitigated if a graph of relationships between features is known. We study this problem in a fully Bayesian setting, focusing on the problem of using Facebook user-IDs as features, with the social network giving the relationship structure. Our analysis uncovers significant problems with the obvious regularizations, and motivates a two-component mixture-model social prior that is provably better. Empirical results on large-scale click prediction problems show that our algorithm can learn as well as the baseline with 12M fewer training examples, and continuously outperforms it for over 60M examples. On a second problem using binned features, our model outperforms the baseline even after the latter sees 5x as much data.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2012
Gartrell, Mike; Paquet, Ulrich; Herbrich, Ralf
A Bayesian Treatment of Social Links in Recommender Systems Technical Report
CU Technical Report CU-CS-1092-12 2012.
@techreport{gartrell2012bayesian,
title = {A Bayesian Treatment of Social Links in Recommender Systems},
author = {Mike Gartrell and Ulrich Paquet and Ralf Herbrich},
url = {https://www.herbrich.me/papers/ucb511101092internet.pdf},
year = {2012},
date = {2012-01-01},
institution = {CU Technical Report CU-CS-1092-12},
abstract = {Recommender systems are increasingly driving user experiences on the Internet. This personalization is often achieved through the factorization of a large but sparse observation matrix of user-item feedback signals. In instances where the user's social network is known, its inclusion can significantly improve recommendations for cold start users. There are numerous ways in which the network can be incorporated into a probabilistic graphical model. We propose and investigate two ways for including a social network, either as a Markov Random Field that describes a user similarity in the prior over user features, or an explicit model that treats social links as observations. State of the art performance is reported on the Flixster online social network dataset.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
Dietz, Laura; Gamari, Ben; Guiver, John; Snelson, Edward; Herbrich, Ralf
De-Layering Social Networks by Shared Tastes of Friendships Proceedings Article
In: International AAAI Conference on Web and Social Media, 2012.
@inproceedings{dietz2012layering,
title = {De-Layering Social Networks by Shared Tastes of Friendships},
author = {Laura Dietz and Ben Gamari and John Guiver and Edward Snelson and Ralf Herbrich},
url = {https://www.herbrich.me/papers/icwsm2012.pdf},
year = {2012},
date = {2012-01-01},
booktitle = {International AAAI Conference on Web and Social Media},
abstract = {Traditionally, social network analyses are applied to data from a particular social domain. With the advent of online social networks such as Facebook, we observe an aggregate of various social domains resulting in a layered mix of professional contacts, family ties, and different circles. These aggregates dilute the community structure. We provide a method for de-layering social networks according to shared interests. Instead of relying on changes in the edge density, our shared taste model uses content of users to disambiguate the underlying shared interest of each friendship. We successfully de-layer real world networks from LibraryThing and Boards.ie, obtaining topics that significantly outperform LDA on unsupervised prediction of group membership.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
El-Arini, Khalid; Paquet, Ulrich; Herbrich, Ralf; Gael, Jurgen Van; y Arcas, Blaise Agüera
Transparent User Models for Personalization Proceedings Article
In: Proceedings of the 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 678–686, ACM 2012.
@inproceedings{elarini2012transparent,
title = {Transparent User Models for Personalization},
author = {Khalid El-Arini and Ulrich Paquet and Ralf Herbrich and Jurgen Van Gael and Blaise Agüera y Arcas},
url = {https://www.herbrich.me/papers/kdd2012-personalization.pdf},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
pages = {678–686},
organization = {ACM},
abstract = {Personalization is a ubiquitous phenomenon in our daily online experience. While such technology is critical for helping us combat the overload of information we face, in many cases, we may not even realize that our results are being tailored to our personal tastes and preferences. Worse yet, when such a system makes a mistake, we have little recourse to correct it. In this work, we propose a framework for addressing this problem by developing a new user-interpretable feature set upon which to base personalized recommendations. These features, which we call badges, represent fundamental traits of users (e.g., ``vegetarian'' or ``Apple fanboy'') inferred by modeling the interplay between a user's behavior and self-reported identity. Specifically, we consider the microblogging site Twitter, where users provide short descriptions of themselves in their profiles, as well as perform actions such as tweeting and retweeting. Our approach is based on the insight that we can define badges using high precision, low recall rules (e.g., ``Twitter profile contains the phrase `Apple fanboy'''), and with enough data, generalize to other users by observing shared behavior. We develop a fully Bayesian, generative model that describes this interaction, while allowing us to avoid the pitfalls associated with having positive-only data. Experiments on real Twitter data demonstrate the effectiveness of our model at capturing rich and interpretable user traits that can be used to provide transparency for personalization.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Herbrich, Ralf
Distributed, Real-time Bayesian Learning in Online Services Proceedings Article
In: Proceedings of the 6th ACM Conference on Recommender Systems, pp. 203–204, ACM 2012.
@inproceedings{herbrich2012distributed,
title = {Distributed, Real-time Bayesian Learning in Online Services},
author = {Ralf Herbrich},
url = {https://www.herbrich.me/papers/recsys2012.pdf},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the 6th ACM Conference on Recommender Systems},
pages = {203–204},
organization = {ACM},
abstract = {The last ten years have seen a tremendous growth in Internet-based online services such as search, advertising, gaming and social networking. Today, it is important to analyze large collections of user interaction data as a first step in building predictive models for these services as well as learn these models in real-time. One of the biggest challenges in this setting is scale: not only does the sheer scale of data necessitate parallel processing but it also necessitates distributed models; with over 900 million active users at Facebook, any user-specific sets of features in a linear or non-linear model yields models of a size bigger than can be stored in a single system. In this talk, I will give a hands-on introduction to one of the most versatile tools for handling large collections of data with distributed probabilistic models: the sum-product algorithm for approximate message passing in factor graphs. I will discuss the application of this algorithm for the specific case of generalized linear models and outline the challenges of both approximate and distributed message passing including an in-depth discussion of expectation propagation and Map-Reduce.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Hennig, Philipp; Stern, David; Herbrich, Ralf; Graepel, Thore
Kernel Topic Models Proceedings Article
In: Proceedings of the 15th International Conference on Artificial Intelligence and Statistics (AISTATS), pp. 511–519, 2012.
@inproceedings{henning2012kerneltopic,
title = {Kernel Topic Models},
author = {Philipp Hennig and David Stern and Ralf Herbrich and Thore Graepel},
url = {https://www.herbrich.me/papers/ktm.pdf},
year = {2012},
date = {2012-01-01},
booktitle = {Proceedings of the 15th International Conference on Artificial Intelligence and Statistics (AISTATS)},
pages = {511–519},
abstract = {Latent Dirichlet Allocation models discrete data as a mixture of discrete distributions, using Dirichlet beliefs over the mixture weights. We study a variation of this concept, in which the documents' mixture weight beliefs are replaced with squashed Gaussian distributions. This allows documents to be associated with elements of a Hilbert space, admitting kernel topic models (KTM), modelling temporal, spatial, hierarchical, social and other structure between documents. The main challenge is efficient approximate inference on the latent Gaussian. We present an approximate algorithm cast around a Laplace approximation in a transformed basis. The KTM can also be interpreted as a type of Gaussian process latent variable model, or as a topic model conditional on document features, uncovering links between earlier work in these areas.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2011
Passos, Alexandre; Gael, Juergen Van; Herbrich, Ralf; Paquet, Ulrich
A Penny for Your Thoughts? The Value of Information in Recommendation Systems Proceedings Article
In: NIPS Workshop on Bayesian Optimization, Experimental Design, and Bandits, pp. 9–14, 2011.
@inproceedings{passos2011informationvalue,
title = {A Penny for Your Thoughts? The Value of Information in Recommendation Systems},
author = {Alexandre Passos and Juergen Van Gael and Ralf Herbrich and Ulrich Paquet},
url = {https://www.herbrich.me/papers/passos11penny.pdf},
year = {2011},
date = {2011-01-01},
booktitle = {NIPS Workshop on Bayesian Optimization, Experimental Design, and Bandits},
pages = {9–14},
abstract = {Most recommendation systems are trained to predict behavioral data and then used to generate more such data by recommending items and receiving feedback on the quality of these recommendations. This data in then fed back into the training process. This creates a feedback loop: as long as the low-cost way to interact with the service is through the recommender, the recommender will only ever see behavioral data on the items it chooses. This process can lead to hidden biases, as it effectively limits how much information the recommender system will ever see. On the other hand, there is a cost to making exploratory recommendations, as they should, myopically, be worse than the best bets of a recommendation system. In this paper we explore the notion that recommender systems are a special kind of active learning agents, with the peculiarity that the cost of asking for the label of an instance depends on its true label, as the cost of showing a bad recommendation when exploring is higher than the cost of showing a good recommendation.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Cheng, Weiwei; Kasneci, Gjergji; Graepel, Thore; Stern, David H; Herbrich, Ralf
Automated Feature Generation From Structured Knowledge Proceedings Article
In: Proceedings of the 20th ACM Conference on Information and Knowledge Management, pp. 1395–1404, 2011.
@inproceedings{cheng2011featuresfromknowledge,
title = {Automated Feature Generation From Structured Knowledge},
author = {Weiwei Cheng and Gjergji Kasneci and Thore Graepel and David H Stern and Ralf Herbrich},
url = {https://www.herbrich.me/papers/cikmfp0337-cheng.pdf},
year = {2011},
date = {2011-01-01},
booktitle = {Proceedings of the 20th ACM Conference on Information and Knowledge Management},
pages = {1395–1404},
abstract = {The prediction accuracy of any learning algorithm highly depends on the quality of the selected features; but often, the task of feature construction and selection is tedious and non-scalable. In recent years, however, there have been numerous projects with the goal of constructing general-purpose or domain-specific knowledge bases with entity-relationship-entity triples extracted from various Web sources or collected from user communities, e.g., YAGO, DBpedia, Free- base, UMLS, etc. This paper advocates the simple and yet far-reaching idea that the structured knowledge contained in such knowledge bases can be exploited to automatically extract features for general learning tasks. We introduce an expressive graph-based language for extracting features from such knowledge bases and a theoretical framework for constructing feature vectors from the extracted features. Our experimental evaluation on different learning scenarios provides evidence that the features derived through our framework can considerably improve the prediction accuracy, especially when the labeled data at hand is sparse.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Kohli, Pushmeet; Bachrach, Yoram; Graepel, Thore; Smyth, Gavin; Armstrong, Michael; Stillwell, David; Kearns, Michael
Behavioral Game Theory on Online Social Networks: Colonel Blotto is on Facebook Proceedings Article
In: Proceedings of the 2nd Workshop on Computational Social Science and the Wisdom of Crowds, 2011.
@inproceedings{kohli2011colonelblotto,
title = {Behavioral Game Theory on Online Social Networks: Colonel Blotto is on Facebook},
author = {Pushmeet Kohli and Yoram Bachrach and Thore Graepel and Gavin Smyth and Michael Armstrong and David Stillwell and Michael Kearns},
url = {https://www.herbrich.me/papers/nips_blotto_2011.pdf},
year = {2011},
date = {2011-01-01},
booktitle = {Proceedings of the 2nd Workshop on Computational Social Science and the Wisdom of Crowds},
abstract = {We show how online social networks such as Facebook can be used in Behavioral Game Theory research. We report the deployment of a Facebook application Project Waterloo that allows users to play the Colonel Blotto game against their friends and strangers. Unlike conventional studies performed in the laboratory environment, which rely on monetary incentives to attract human subjects to play games, our framework does not use money and instead relies on reputation and entertainment incentives. We describe the Facebook application we created for conducting this experiment, and perform a preliminary analysis of the data collected in the game. We conclude by discussing the advantages of our approach and list some ideas for future work.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Xu, Yan; Cao, Xiang; Sellen, Abigail; Herbrich, Ralf; Graepel, Thore
Sociable Killers: Understanding Social Relationships in an Online First-Person Shooter Game Proceedings Article
In: Proceedings of the 2011 ACM Conference on Computer Supported Cooperative Work, pp. 197–206, 2011.
@inproceedings{xu2011sociablekillers,
title = {Sociable Killers: Understanding Social Relationships in an Online First-Person Shooter Game},
author = {Yan Xu and Xiang Cao and Abigail Sellen and Ralf Herbrich and Thore Graepel},
url = {https://www.herbrich.me/papers/CSCW11.pdf},
year = {2011},
date = {2011-01-01},
booktitle = {Proceedings of the 2011 ACM Conference on Computer Supported Cooperative Work},
pages = {197–206},
abstract = {Online video games can be seen as medium for the formation and maintenance of social relationships. In this paper, we explore what social relationships mean under the context of online First-Person Shooter (FPS) games, how these relationships influence game experience, and how players manage them. We combine qualitative interview and quantitative game log data, and find that despite the gap between the non-persistent game world and potentially persistent social relationships, a diversity of social relationships emerge and play a central role in the enjoyment of online FPS games. We report the forms, development, and impact of such relationships, and discuss our findings in light of design implications and comparison with other game genres.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2010
Bachrach, Yoram; Herbrich, Ralf
Fingerprinting Ratings for Collaborative Filtering - Theoretical and Empirical Analysis Proceedings Article
In: Proceedings of 17th International Symposium on String Processing and Information Retrieval, pp. 25–36, 2010.
@inproceedings{bachrach2010fingerprinting,
title = {Fingerprinting Ratings for Collaborative Filtering - Theoretical and Empirical Analysis},
author = {Yoram Bachrach and Ralf Herbrich},
url = {https://www.herbrich.me/papers/spire10.pdf},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of 17th International Symposium on String Processing and Information Retrieval},
pages = {25–36},
abstract = {We consider fingerprinting methods for collaborative filtering (CF) systems. In general, CF systems show their real strength when supplied with enormous data sets. Earlier work already suggests sketching techniques to handle massive amounts of information, but most prior analysis has so far been limited to non-ranking application scenarios and has focused mainly on a theoretical analysis. We demonstrate how to use fingerprinting methods to compute a family of rank correlation coefficients. Our methods allow identifying users who have similar rankings over a certain set of items, a problem that lies at the heart of CF applications. We show that our method allows approximating rank correlations with high accuracy and confidence. We examine the suggested methods empirically through a recommender system for the Netflix dataset, showing that the required fingerprint sizes are even smaller than the theoretical analysis suggests. We also explore the of use standard hash functions rather than min-wise independent hashes and the relation between the quality of the final recommendations and the fingerprint size.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Graepel, Thore; Candela, Joaquin Quiñonero; Borchert, Thomas; Herbrich, Ralf
Web-Scale Bayesian Click-Through Rate Prediction for Sponsored Search Advertising in Microsoft's Bing Search Engine Proceedings Article
In: Proceedings of the 27th International Conference on Machine Learning, pp. 13–20, 2010.
@inproceedings{graepel2010bayesianctr,
title = {Web-Scale Bayesian Click-Through Rate Prediction for Sponsored Search Advertising in Microsoft's Bing Search Engine},
author = {Thore Graepel and Joaquin Quiñonero Candela and Thomas Borchert and Ralf Herbrich},
url = {https://www.herbrich.me/papers/adpredictor.pdf},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of the 27th International Conference on Machine Learning},
pages = {13–20},
abstract = {We describe a new Bayesian click-through rate (CTR) prediction algorithm used for Sponsored Search in Microsoft's Bing search engine. The algorithm is based on a probit regression model that maps discrete or real-valued input features to probabilities. It maintains Gaussian beliefs over weights of the model and performs Gaussian online updates derived from approximate message passing. Scalability of the algorithm is ensured through a principled weight pruning procedure and an approximate parallel implementation. We discuss the challenges arising from evaluating and tuning the predictor as part of the complex system of sponsored search where the predictions made by the algorithm decide about future training sample composition. Finally, we show experimental results from the production system and compare to a calibrated Naive Bayes algorithm.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Kasneci, Gjergji; Gael, Jurgen Van; Herbrich, Ralf; Graepel, Thore
Bayesian Knowledge Corroboration with Logical Rules and User Feedback Proceedings Article
In: Proceedings of European Conference on Machine Learning and Knowledge Discovery in Databases, pp. 1–18, 2010.
@inproceedings{kasneci2010knowledgecorroboration,
title = {Bayesian Knowledge Corroboration with Logical Rules and User Feedback},
author = {Gjergji Kasneci and Jurgen Van Gael and Ralf Herbrich and Thore Graepel},
url = {https://www.herbrich.me/papers/ecml10.pdf},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of European Conference on Machine Learning and Knowledge Discovery in Databases},
pages = {1–18},
abstract = {Current knowledge bases suffer from either low coverage or low accuracy. The underlying hypothesis of this work is that user feedback can greatly improve the quality of automatically extracted knowledge bases. The feedback could help quantify the uncertainty associated with the stored statements and would enable mechanisms for searching, ranking and reasoning at entity-relationship level. Most importantly, a principled model for exploiting user feedback to learn the truth values of statements in the knowledge base would be a major step forward in addressing the issue of knowledge base curation. We present a family of probabilistic graphical models that builds on user feedback and logical inference rules derived from the popular Semantic-Web formalism of RDFS. Through internal inference and belief propagation, these models can learn both, the truth values of the statements in the knowledge base and the reliabilities of the users who give feedback. We demonstrate the viability of our approach in extensive experiments on real-world datasets, with feedback collected from Amazon Mechanical Turk.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Paquet, Ulrich; Gael, Jurgen Van; Stern, David; Kasneci, Gjergji; Herbrich, Ralf; Graepel, Thore
Vuvuzelas & Active Learning for Online Classification Proceedings Article
In: Proceedings of Computational Social Science and the Wisdom of Crowds Workshop, 2010.
@inproceedings{paquet2010,
title = {Vuvuzelas & Active Learning for Online Classification},
author = {Ulrich Paquet and Jurgen Van Gael and David Stern and Gjergji Kasneci and Ralf Herbrich and Thore Graepel},
url = {https://www.herbrich.me/papers/vuvuzela.pdf},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of Computational Social Science and the Wisdom of Crowds Workshop},
abstract = {Many online service systems leverage user-generated content from Web 2.0 style platforms such as Wikipedia, Twitter, Facebook, and many more. Often, the value lies in the freshness of this information (e.g. tweets, event-based articles, blog posts, etc.). This freshness poses a challenge for supervised learning models as they frequently have to deal with previously unseen features. In this paper we address the problem of online classification for tweets, namely, how can a classifier be updated in an online manner, so that it can correctly classify the latest hype on Twitter? We propose a two-step strategy to solve this problem. The first step follows an active learning strategy that enables the selection of tweets for which a label would be most useful; the selected tweet is then forwarded to Amazon Mechanical Turk where it is labeled by multiple users. The second step builds on a Bayesian corroboration model that aggregates the noisy labels provided by the users by taking their reliabilities into account.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Stern, David; Samulowitz, Horst; Herbrich, Ralf; Graepel, Thore; Pulina, Luca; Tacchella, Armando
Collaborative Expert Portfolio Management Proceedings Article
In: Proceedings of the 24th AAAI Conference on Artificial Intelligence, 2010.
@inproceedings{stern2010collaborativeexperts,
title = {Collaborative Expert Portfolio Management},
author = {David Stern and Horst Samulowitz and Ralf Herbrich and Thore Graepel and Luca Pulina and Armando Tacchella},
url = {https://www.herbrich.me/papers/matchboxqbf.pdf},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of the 24th AAAI Conference on Artificial Intelligence},
abstract = {We consider the task of assigning experts from a portfolio of specialists in order to solve a set of tasks. We apply a Bayesian model which combines collaborative filtering with a feature-based description of tasks and experts to yield a general framework for managing a portfolio of experts. The model learns an embedding of tasks and problems into a latent space in which affinity is measured by the inner product. The model can be trained incrementally and can track non-stationary data, tracking potentially changing expert and task characteristics. The approach allows us to use a principled decision theoretic framework for expert selection, allowing the user to choose a utility function that best suits their objectives. The model component for taking into account the performance feedback data is pluggable, allowing flexibility. We apply the model to manage a portfolio of algorithms to solve hard combinatorial problems. This is a well studied area and we demonstrate a large improvement on the state of the art in one domain (constraint solving) and in a second domain (combinatorial auctions) created a portfolio that performed significantly better than any single algorithm.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Zaman, Tauhid R; Herbrich, Ralf; Gael, Jurgen Van; Stern, David
Predicting Information Spreading in Twitter Proceedings Article
In: Proceedings of Computational Social Science and the Wisdom of Crowds Workshop, 2010.
@inproceedings{zaman2010informationspreading,
title = {Predicting Information Spreading in Twitter},
author = {Tauhid R Zaman and Ralf Herbrich and Jurgen Van Gael and David Stern},
url = {https://www.herbrich.me/papers/nips10_twitter.pdf},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of Computational Social Science and the Wisdom of Crowds Workshop},
abstract = {We present a new methodology for predicting the spread of information in a social network. We focus on the Twitter network, where information is in the form of 140 character messages called tweets, and information is spread by users forwarding tweets, a practice known as retweeting. Using data of who and what was retweeted, we train a probabilistic collaborative filter model to predict future retweets. We find that the most important features for prediction are the identity of the source of the tweet and retweeter. Our methodology is quite flexible and be used as a basis for other prediction models in social networks.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Zhang, Xinhua; Graepel, Thore; Herbrich, Ralf
Bayesian Online Learning for Multi-label and Multi-variate Performance Measures Proceedings Article
In: Proceedings of the 13th International Conference on Artificial Intelligence and Statistics (AISTATS), pp. 956–963, 2010.
@inproceedings{zhang2010bayesian,
title = {Bayesian Online Learning for Multi-label and Multi-variate Performance Measures},
author = {Xinhua Zhang and Thore Graepel and Ralf Herbrich},
url = {https://www.herbrich.me/papers/zhang10b.pdf},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of the 13th International Conference on Artificial Intelligence and Statistics (AISTATS)},
pages = {956–963},
abstract = {Many real world applications employ multi-variate performance measures and each example can belong to multiple classes. The currently most popular approaches train an SVM for each class, followed by ad-hoc thresholding. Probabilistic models using Bayesian decision theory are also commonly adopted. In this paper, we propose a Bayesian online multi-label classification framework (BOMC) which learns a probabilistic linear classifier. The likelihood is modeled by a graphical model similar to TrueSkill, and inference is based on Gaussian density filtering with expectation propagation. Using samples from the posterior, we label the testing data by maximizing the expected F1-score. Our experiments on Reuters1-v2 dataset show BOMC compares favorably to the state-of-the-art online learners in macro-averaged F1-score and training time.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2009
Bachrach, Yoram; Herbrich, Ralf; Porat, Ely
Sketching Algorithms for Approximating Rank Correlations in Collaborative Filtering Systems Proceedings Article
In: Proceedings of 16th International Symposium String Processing and Information Retrieval, pp. 344–352, 2009.
@inproceedings{bachrach2009sketching,
title = {Sketching Algorithms for Approximating Rank Correlations in Collaborative Filtering Systems},
author = {Yoram Bachrach and Ralf Herbrich and Ely Porat},
url = {https://www.herbrich.me/papers/spire09.pdf},
year = {2009},
date = {2009-01-01},
booktitle = {Proceedings of 16th International Symposium String Processing and Information Retrieval},
pages = {344–352},
abstract = {Collaborative filtering (CF) shares information between users to provide each with recommendations. Previous work suggests using sketching techniques to handle massive data sets in CF systems, but only allows testing whether users have a high proportion of items they have both ranked. We show how to determine the correlation between the rankings of two users, using concise sketches of the rankings. The sketches allow approximating Kendall's Tau, a known rank correlation, with high accuracy ε and high confidence $1-delta$. The required sketch size is logarithmic in the confidence and polynomial in the accuracy.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Flach, Peter; Spiegler, Sebastian; Golenia, Bruno; Price, Simon; Guiver, John; Herbrich, Ralf; Graepel, Thore; Zaki, Mohammed
Novel Tools to Streamline the Conference Review Process: Experiences from SIGKDD'09 Journal Article
In: SIGKDD Explorations, vol. 11, no. 2, pp. 63–67, 2009.
@article{flach2009conferencereview,
title = {Novel Tools to Streamline the Conference Review Process: Experiences from SIGKDD'09},
author = {Peter Flach and Sebastian Spiegler and Bruno Golenia and Simon Price and John Guiver and Ralf Herbrich and Thore Graepel and Mohammed Zaki},
url = {https://www.herbrich.me/papers/ReviewerCalibration.pdf},
year = {2009},
date = {2009-01-01},
journal = {SIGKDD Explorations},
volume = {11},
number = {2},
pages = {63–67},
abstract = {The SIGKDD'09 Research Track received 537 paper submissions, which were reviewed by a Program Committee of 199 members, and a Senior Program Committee of 22 members. We used techniques from artificial intelligence and data mining to streamline and support this complicated process at three crucial stages: bidding by PC members on papers, assigning papers to reviewers, and calibrating scores obtained from the reviews. In this paper we report on the approaches taken, evaluate how well they worked, and describe some further work done after the conference.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Schwaighofer, Anton; Candela, Joaquin Quiñonero; Borchert, Thomas; Graepel, Thore; Herbrich, Ralf
Scalable Clustering and Keyword Suggestion for Online Advertisements Proceedings Article
In: Proceedings of 3rd Annual International Workshop on Data Mining and Audience Intelligence for Advertising, pp. 27–36, 2009.
@inproceedings{schwaighofer2009clustering,
title = {Scalable Clustering and Keyword Suggestion for Online Advertisements},
author = {Anton Schwaighofer and Joaquin Quiñonero Candela and Thomas Borchert and Thore Graepel and Ralf Herbrich},
url = {https://www.herbrich.me/papers/kdd2009.pdf},
year = {2009},
date = {2009-01-01},
booktitle = {Proceedings of 3rd Annual International Workshop on Data Mining and Audience Intelligence for Advertising},
pages = {27–36},
abstract = {We present an efficient Bayesian online learning algorithm for clustering vectors of binary values based on a well known model, the mixture of Bernoulli profiles. The model includes conjugate Beta priors over the success probabilities and maintains discrete probability distributions for cluster assignments. Clustering is then formulated as inference in a factor graph which is solved efficiently using online approximate message passing. The resulting algorithm has three key features: a) it requires only a single pass across the data and can hence be used on data streams, b) it maintains the uncertainty of parameters and cluster assignments, and c) it implements an automatic step size adaptation based on the current model uncertainty. The model is tested on an artificially generated toy dataset and applied to a large scale real-world data set from online advertising, the data being online ads characterized by the set of keywords to which they have been subscribed. The proposed approach scales well for large datasets, and compares favorably to other clustering algorithms on the ads dataset. As a concrete application to online advertising we show how the learned model can be used to recommend new keywords for given ads.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Stern, David; Herbrich, Ralf; Graepel, Thore
Matchbox: Large Scale Online Bayesian Recommendations Proceedings Article
In: Proceedings of the 18th International Conference on World Wide Web, pp. 111–120, 2009.
@inproceedings{stern2009matchbox,
title = {Matchbox: Large Scale Online Bayesian Recommendations},
author = {David Stern and Ralf Herbrich and Thore Graepel},
url = {https://www.herbrich.me/papers/www09.pdf},
year = {2009},
date = {2009-01-01},
booktitle = {Proceedings of the 18th International Conference on World Wide Web},
pages = {111–120},
abstract = {We present a probabilistic model for generating personalised recommendations of items to users of a web service. The Matchbox system makes use of content information in the form of user and item meta data in combination with collaborative filtering information from previous user behavior in order to predict the value of an item for a user. Users and items are represented by feature vectors which are mapped into a low-dimensional trait space in which similarity is measured in terms of inner products. The model can be trained from different types of feedback in order to learn user-item preferences. Here we present three alternatives: direct observation of an absolute rating each user gives to some items, observation of a binary preference (like/ don't like) and observation of a set of ordinal ratings on a userspecific scale. Efficient inference is achieved by approximate message passing involving a combination of Expectation Propagation (EP) and Variational Message Passing. We also include a dynamics model which allows an item's popularity, a user's taste or a user's personal rating scale to drift over time. By using Assumed-Density Filtering (ADF) for training, the model requires only a single pass through the training data. This is an on-line learning algorithm capable of incrementally taking account of new data so the system can immediately reflect the latest user preferences. We evaluate the performance of the algorithm on the MovieLens and Netflix data sets consisting of approximately 1,000,000 and 100,000,000 ratings respectively. This demonstrates that training the model using the on-line ADF approach yields state-of-the-art performance with the option of improving performance further if computational resources are available by performing multiple EP passes over the training data.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2008
Graepel, Thore; Herbrich, Ralf
Large Scale Data Analysis and Modelling in Online Services and Advertising Proceedings Article
In: Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 2, 2008.
@inproceedings{graepel2008kdd,
title = {Large Scale Data Analysis and Modelling in Online Services and Advertising},
author = {Thore Graepel and Ralf Herbrich},
url = {https://www.herbrich.me/papers/kdd2008.pdf},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
pages = {2},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2007
Dangauthier, Pierre; Herbrich, Ralf; Minka, Tom; Graepel, Thore
TrueSkill Through Time: Revisiting the History of Chess Proceedings Article
In: Advances in Neural Information Processing Systems 20, pp. 931–938, The MIT Press, 2007.
@inproceedings{dangauthier2007ttt,
title = {TrueSkill Through Time: Revisiting the History of Chess},
author = {Pierre Dangauthier and Ralf Herbrich and Tom Minka and Thore Graepel},
url = {https://www.herbrich.me/papers/ttt.pdf},
year = {2007},
date = {2007-01-01},
booktitle = {Advances in Neural Information Processing Systems 20},
pages = {931–938},
publisher = {The MIT Press},
abstract = {We extend the Bayesian skill rating system TrueSkill to infer entire time series of skills of players by smoothing through time instead of filtering. The skill of each participating player, say, every year is represented by a latent skill variable which is affected by the relevant game outcomes that year, and coupled with the skill variables of the previous and subsequent year. Inference in the resulting factor graph is carried out by approximate message passing (EP) along the time series of skills. As before the system tracks the uncertainty about player skills, explicitly models draws, can deal with any number of competing entities and can infer individual skills from team results. We extend the system to estimate player-specific draw margins. Based on these models we present an analysis of the skill curves of important players in the history of chess over the past 150 years. Results include plots of players' lifetime skill development as well as the ability to compare the skills of different players across time. Our results indicate that a) the overall playing strength has increased over the past 150 years, and b) that modelling a player's ability to force a draw provides significantly better predictive power.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Herbrich, Ralf; Graepel, Thore; Murphy, Brendan
Structure From Failure Proceedings Article
In: Proceedings of 2nd Workshop on Tackling Computer Systems Problems with Machine Learning Techniques, USENIX, 2007.
@inproceedings{herbrich2007failurestructure,
title = {Structure From Failure},
author = {Ralf Herbrich and Thore Graepel and Brendan Murphy},
url = {https://www.herbrich.me/papers/sysml2007.pdf},
year = {2007},
date = {2007-01-01},
booktitle = {Proceedings of 2nd Workshop on Tackling Computer Systems Problems with Machine Learning Techniques},
publisher = {USENIX},
abstract = {We investigate the problem of learning the dependencies among servers in large networks based on failure patterns in their up-time behaviour. We model up-times in terms of exponential distributions whose inverse lifetime parameters lmay vary with the state of other servers. Based on a conjugate Gamma prior over inverse lifetimes we identify the most likely network configuration given that any node has at most one parent. The method can be viewed as a special case of learning a continuous time Bayesian network. Our approach enables us to easily incorporate existing expert prior knowledge. Furthermore our method enjoys advantages over a state-of-the-art rule-based approach. We validate the approach on synthetic data and apply it to five year data for a set of over 500 servers at a server farm of a major Microsoft web site.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Stern, David; Herbrich, Ralf; Graepel, Thore
Learning to Solve Game Trees Proceedings Article
In: Proceedings of the 24th International Conference on Machine Learning, pp. 839–846, 2007.
@inproceedings{stern2007gametrees,
title = {Learning to Solve Game Trees},
author = {David Stern and Ralf Herbrich and Thore Graepel},
url = {https://www.herbrich.me/papers/p839-stern.pdf},
year = {2007},
date = {2007-01-01},
booktitle = {Proceedings of the 24th International Conference on Machine Learning},
pages = {839–846},
abstract = {We apply probability theory to the task of proving whether a goal can be achieved by a player in an adversarial game. Such problems are solved by searching the game tree. We view this tree as a graphical model which yields a distribution over the (Boolean) outcome of the search before it terminates. Experiments show that a best-first search algorithm guided by this distribution explores a similar number of nodes as Proof-Number Search to solve Go problems. Knowledge is incorporated into search by using domain-specific models to provide prior distributions over the values of leaf nodes of the game tree. These are surrogate for the unexplored parts of the tree. The parameters of these models can be learned from previous search trees. Experiments on Go show that the speed of problem solving can be increased by orders of magnitude by this technique but care must be taken to avoid over-fitting.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2006
Herbrich, Ralf; Minka, Tom; Graepel, Thore
TrueSkill(TM): A Bayesian Skill Rating System Proceedings Article
In: Advances in Neural Information Procesing Systems 19, pp. 569–576, The MIT Press, 2006.
@inproceedings{herbrich2006trueskill,
title = {TrueSkill(TM): A Bayesian Skill Rating System},
author = {Ralf Herbrich and Tom Minka and Thore Graepel},
url = {https://www.herbrich.me/papers/trueskill.pdf},
year = {2006},
date = {2006-01-01},
booktitle = {Advances in Neural Information Procesing Systems 19},
pages = {569–576},
publisher = {The MIT Press},
abstract = {We present a new Bayesian skill rating system which can be viewed as a generalisation of the Elo system used in Chess. The new system tracks the uncertainty about player skills, explicitly models draws, can deal with any number of competing entities and can infer individual skills from team results. Inference is performed by approximate message passing on a factor graph representation of the model. We present experimental evidence on the increased accuracy and convergence speed of the system compared to Elo and report on our experience with the new rating system running in a large-scale commercial online gaming service under the name of TrueSkill.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Graepel, Thore; Herbrich, Ralf
Ranking and Matchmaking Journal Article
In: Game Developer Magazine, no. 10, 2006.
@article{graepel2006ranking,
title = {Ranking and Matchmaking},
author = {Thore Graepel and Ralf Herbrich},
url = {https://www.herbrich.me/papers/Game-Developer-Feature-Article-Graepel-Herbrich.pdf},
year = {2006},
date = {2006-01-01},
journal = {Game Developer Magazine},
number = {10},
abstract = {Online game modes are gradually becoming the standard for console games, rather than an added bonus. But how do you get the right players playing against the right people, close enough in skill to be challenging, but not so difficult as to be frustrating?},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Stern, David; Herbrich, Ralf; Graepel, Thore
Bayesian Pattern Ranking for Move Prediction in the Game of Go Proceedings Article
In: Proceedings of the 23rd International Conference on Machine Learning, pp. 873–880, 2006.
@inproceedings{stern2006bayesianpatternranking,
title = {Bayesian Pattern Ranking for Move Prediction in the Game of Go},
author = {David Stern and Ralf Herbrich and Thore Graepel},
url = {https://www.herbrich.me/papers/p873-stern.pdf},
year = {2006},
date = {2006-01-01},
booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
pages = {873–880},
abstract = {We investigate the problem of learning to predict moves in the board game of Go from game records of expert players. In particular, we obtain a probability distribution over legal moves for professional play in a given position. This distribution has numerous applications in computer Go, including serving as an efficient stand-alone Go player. It would also be effective as a move selector and move sorter for game tree search and as a training tool for Go players. Our method has two major components: a) a pattern extraction scheme for efficiently harvesting patterns of given size and shape from expert game records and b) a Bayesian learning algorithm (in two variants) that learns a distribution over the values of a move given a board position based on the local pattern context. The system is trained on 181,000 expert games and shows excellent prediction performance as indicated by its ability to perfectly predict the moves made by professional Go players in 34% of test positions.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2005
Agarwal, Shivani; Graepel, Thore; Herbrich, Ralf; Har-Peled, Sariel; Roth, Dan
Generalization Bounds for the Area Under the ROC Curve Journal Article
In: Journal of Machine Learning Research, vol. 6, pp. 393–425, 2005.
@article{agarwal2005roc,
title = {Generalization Bounds for the Area Under the ROC Curve},
author = {Shivani Agarwal and Thore Graepel and Ralf Herbrich and Sariel Har-Peled and Dan Roth},
url = {https://www.herbrich.me/papers/auc.pdf},
year = {2005},
date = {2005-01-01},
journal = {Journal of Machine Learning Research},
volume = {6},
pages = {393–425},
abstract = {We study generalization properties of the area under the ROC curve (AUC), a quantity that has been advocated as an evaluation criterion for the bipartite ranking problem. The AUC is a different term than the error rate used for evaluation in classification problems; consequently, existing generalization bounds for the classification error rate cannot be used to draw conclusions about the AUC. In this paper, we define the expected accuracy of a ranking function (analogous to the expected error rate of a classification function), and derive distribution-free probabilistic bounds on the deviation of the empirical AUC of a ranking function (observed on a finite data sequence) from its expected accuracy. We derive both a large deviation bound, which serves to bound the expected accuracy of a ranking function in terms of its empirical AUC on a test sequence, and a uniform convergence bound, which serves to bound the expected accuracy of a learned ranking function in terms of its empirical AUC on a training sequence. Our uniform convergence bound is expressed in terms of a new set of combinatorial parameters that we term the bipartite rank-shatter coefficients; these play the same role in our result as do the standard VC-dimension related shatter coefficients (also known as the growth function) in uniform convergence results for the classification error rate. A comparison of our result with a recent uniform convergence result derived by Freund et al. (2003) for a quantity closely related to the AUC shows that the bound provided by our result can be considerably tighter.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Graepel, Thore; Herbrich, Ralf; Shawe-Taylor, John
PAC-Bayesian Compression Bounds on the Prediction Error of Learning Algorithms for Classification Journal Article
In: Machine Learning, vol. 59, pp. 55–76, 2005.
@article{graepel2005pacbayesian,
title = {PAC-Bayesian Compression Bounds on the Prediction Error of Learning Algorithms for Classification},
author = {Thore Graepel and Ralf Herbrich and John Shawe-Taylor},
url = {https://www.herbrich.me/papers/graehertay05.pdf},
year = {2005},
date = {2005-01-01},
journal = {Machine Learning},
volume = {59},
pages = {55–76},
abstract = {We consider bounds on the prediction error of classification algorithms based on sample compression. We refine the notion of a compression scheme to distinguish permutation and repetition invariant and non-permutation and repetition invariant compression schemes leading to different prediction error bounds. Also, we extend known results on compression to the case of non-zero empirical risk. We provide bounds on the prediction error of classifiers returned by mistakedriven online learning algorithms by interpreting mistake bounds as bounds on the size of the respective compression scheme of the algorithm. This leads to a bound on the prediction error of perceptron solutions that depends on the margin a support vector machine would achieve on the same training sample. Furthermore, using the property of compression we derive bounds on the average prediction error of kernel classifiers in the PAC-Bayesian framework. These bounds assume a prior measure over the expansion coefficients in the data-dependent kernel expansion and bound the average prediction error uniformly over subsets of the space of expansion coefficients.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Gretton, Arthur; Herbrich, Ralf; Smola, Alexander J; Bousquet, Olivier; Schölkopf, Bernhard
Kernel Methods for Measuring Independence Journal Article
In: Journal of Machine Learning Research, vol. 6, pp. 2075–2129, 2005.
@article{gretton2005kernelindependence,
title = {Kernel Methods for Measuring Independence},
author = {Arthur Gretton and Ralf Herbrich and Alexander J Smola and Olivier Bousquet and Bernhard Schölkopf},
url = {https://www.herbrich.me/papers/gretton05a.pdf},
year = {2005},
date = {2005-01-01},
journal = {Journal of Machine Learning Research},
volume = {6},
pages = {2075–2129},
abstract = {We introduce two new functionals, the constrained covariance and the kernel mutual information, to measure the degree of independence of random variables. These quantities are both based on the covariance between functions of the random variables in reproducing kernel Hilbert spaces (RKHSs). We prove that when the RKHSs are universal, both functionals are zero if and only if the random variables are pairwise independent. We also show that the kernel mutual information is an upper bound near independence on the Parzen window estimate of the mutual information. Analogous results apply for two correlation-based dependence functionals introduced earlier: we show the kernel canonical correlation and the kernel generalised variance to be independence measures for universal kernels, and prove the latter to be an upper bound on the mutual information near independence. The performance of the kernel dependence functionals in measuring independence is verified in the context of independent component analysis.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Gretton, Arthur; Smola, Alexander; Bousquet, Olivier; Herbrich, Ralf; Belitski, Andrei; Augath, Mark; Murayama, Yusuke; Pauls, Jon; Schölkopf, Bernhard; Logothetis, Nikos
Kernel Constrained Covariance for Dependence Measurement Proceedings Article
In: Proceedings of the 10th International Workshop on Artificial Intelligence and Statistics (AISTATS), pp. 112–119, 2005.
@inproceedings{gretton2005kernelcoco,
title = {Kernel Constrained Covariance for Dependence Measurement},
author = {Arthur Gretton and Alexander Smola and Olivier Bousquet and Ralf Herbrich and Andrei Belitski and Mark Augath and Yusuke Murayama and Jon Pauls and Bernhard Schölkopf and Nikos Logothetis},
url = {https://www.herbrich.me/papers/kcc.pdf},
year = {2005},
date = {2005-01-01},
booktitle = {Proceedings of the 10th International Workshop on Artificial Intelligence and Statistics (AISTATS)},
pages = {112–119},
abstract = {We discuss reproducing kernel Hilbert space (RKHS)-based measures of statistical dependence, with emphasis on constrained covariance (COCO), a novel criterion to test dependence of random variables. We show that COCO is a test for independence if and only if the associated RKHSs are universal. That said, no independence test exists that can distinguish dependent and independent random variables in all circumstances. Dependent random variables can result in a COCO which is arbitrarily close to zero when the source densities are highly non-smooth. All current kernel-based independence tests share this behaviour. We demonstrate exponential convergence between the population and empirical COCO. Finally, we use COCO as a measure of joint neural activity between voxels in MRI recordings of the macaque monkey, and compare the results to the mutual information and the correlation. We also show the effect of removing breathing artefacts from the MRI recording.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Herbrich, Ralf
On Gaussian Expectation Propagation Technical Report
Microsoft Research 2005.
@techreport{herbrich2005gaussianep,
title = {On Gaussian Expectation Propagation},
author = {Ralf Herbrich},
url = {https://www.herbrich.me/papers/EP.pdf},
year = {2005},
date = {2005-01-01},
institution = {Microsoft Research},
abstract = {In this short note we will re-derive the Gaussian expectation propagation (Gaussian EP) algorithm as presented in Minka (2001) and demonstrate an application of Gaussian EP to approximate multi-dimensional truncated Gaussians.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
Herbrich, Ralf
Minimising the Kullback-Leibler Divergence Technical Report
Microsoft Research 2005.
@techreport{herbrich2005minimizingkl,
title = {Minimising the Kullback-Leibler Divergence},
author = {Ralf Herbrich},
url = {https://www.herbrich.me/papers/KL.pdf},
year = {2005},
date = {2005-01-01},
institution = {Microsoft Research},
abstract = {In this note we show that minimising the Kullback–Leibler divergence over a family in the class of exponential distributions is achieved by matching the expected natural statistic. We will also give an explicit update formula for distributions with only one likelihood term.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
Herbrich, Ralf; Graepel, Thore; Williamson, Robert C
The Structure of Version Space Book Section
In: Innovations in Machine Learning: Theory and Applications, pp. 257–274, Springer, 2005.
@incollection{herbrich2005versionspace,
title = {The Structure of Version Space},
author = {Ralf Herbrich and Thore Graepel and Robert C Williamson},
url = {https://www.herbrich.me/papers/holmes_mod2.pdf},
year = {2005},
date = {2005-01-01},
booktitle = {Innovations in Machine Learning: Theory and Applications},
pages = {257–274},
publisher = {Springer},
chapter = {9},
abstract = {We investigate the generalisation performance of consistent classifiers, i.e. classifiers that are contained in the so-called version space, both from a theoretical and experimental angle. In contrast to classical VC analysis - where no single classifier within version space is singled out on grounds of a generalisation error bound - the data dependent structural risk minimisation framework suggests that there exists one particular classifier that is to be preferred because it minimises the generalisation error bound. This is usually taken to provide a theoretical justification for learning algorithms such as the well known support vector machine. A reinterpretation of a recent PAC-Bayesian result, however, reveals that given a suitably chosen hypothesis space there exists a large fraction of classifiers with small generalisation error although we cannot readily identify them for a specific learning task. In the particular case of linear classifiers we show that classifiers found by the classical perceptron algorithm have guarantees bounded by the size of version space. These results are complemented with an empirical study for kernel classifiers on the task of handwritten digit recognition which demonstrates that even classifiers with a small margin may exhibit excellent generalisation. In order to perform this analysis we introduce the kernel Gibbs sampler - an algorithm which can be used to sample consistent kernel classifiers.},
keywords = {},
pubstate = {published},
tppubtype = {incollection}
}
2004
Agarwal, Shivani; Graepel, Thore; Herbrich, Ralf; Roth, Dan
A Large Deviation Bound for the Area Under the ROC Curve Proceedings Article
In: Advances in Neural Information Processing Systems 17, pp. 9–16, 2004.
@inproceedings{agarwal2004roc,
title = {A Large Deviation Bound for the Area Under the ROC Curve},
author = {Shivani Agarwal and Thore Graepel and Ralf Herbrich and Dan Roth},
url = {https://www.herbrich.me/papers/nips04-auc.pdf},
year = {2004},
date = {2004-01-01},
booktitle = {Advances in Neural Information Processing Systems 17},
pages = {9–16},
abstract = {The area under the ROC curve (AUC) has been advocated as an evaluation criterion for the bipartite ranking problem. We study large deviation properties of the AUC; in particular, we derive a distribution-free large deviation bound for the AUC which serves to bound the expected accuracy of a ranking function in terms of its empirical AUC on an independent test sequence. A comparison of our result with a corresponding large deviation result for the classification error rate suggests that the test sample size required to obtain an epsilon-accurate estimate of the expected accuracy of a ranking function with delta-confidence is larger than that required to obtain an epsilon-accurate estimate of the expected error rate of a classification function with the same confidence. A simple application of the union bound allows the large deviation bound to be extended to learned ranking functions chosen from finite function classes.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Graepel, Thore; Herbrich, Ralf; Gold, Julian
Learning to Fight Proceedings Article
In: Proceedings of the International Conference on Computer Games: Artificial Intelligence, Design and Education, pp. 193–200, 2004.
@inproceedings{graepel2004learningtofight,
title = {Learning to Fight},
author = {Thore Graepel and Ralf Herbrich and Julian Gold},
url = {https://www.herbrich.me/papers/graehergol04.pdf},
year = {2004},
date = {2004-01-01},
booktitle = {Proceedings of the International Conference on Computer Games: Artificial Intelligence, Design and Education},
pages = {193–200},
abstract = {We apply reinforcement learning to the problem of finding good policies for a fighting agent in a commercial computer game. The learning agent is trained using the SARSA algorithm for on-policy learning of an action-value function represented by linear and neural network function approximators. We discuss the selection and construction of features, actions, and rewards as well as other design choices necessary to integrate the learning process into the game. The learning agent is trained against the built-in AI of the game with different rewards encouraging aggressive or defensive behaviour. We show that the learning agent finds interesting (and partly near optimal) policies in accordance with the reward functions provided. We also discuss the particular challenges arising in the application of reinforcement learning to the domain of computer games.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Rajaram, Shyamsundar; Graepel, Thore; Herbrich, Ralf
Poisson-Networks : A Model for Structured Poisson Processes Proceedings Article
In: Proceedings of the 10th International Workshop on Artificial Intelligence and Statistics, pp. 277–284, 2004.
@inproceedings{rajaram2004poissonnetworks,
title = {Poisson-Networks : A Model for Structured Poisson Processes},
author = {Shyamsundar Rajaram and Thore Graepel and Ralf Herbrich},
url = {https://www.herbrich.me/papers/spikenets.pdf},
year = {2004},
date = {2004-01-01},
booktitle = {Proceedings of the 10th International Workshop on Artificial Intelligence and Statistics},
pages = {277–284},
abstract = {Modelling structured multivariate point process data has wide ranging applications like understanding neural activity, developing faster file access systems and learning dependencies among servers in large networks. In this paper, we develop the Poisson network model for representing multivariate structured Poisson processes. In our model each node of the network represents a Poisson process. The novelty of our work is that waiting times of a process are modelled by an exponential distribution with a piecewise constant rate function that depends on the event counts of its parents in the network in a generalised linear way. Our choice of model allows to perform exact sampling from arbitrary structures. We adopt a Bayesian approach for learning the network structure. Further, we discuss fixed point and sampling based approximations for performing inference of rate functions in Poisson networks.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2003
Graepel, Thore; Herbrich, Ralf
Invariant Pattern Recognition by Semidefinite Programming Machines Proceedings Article
In: Advances in Neural Information Processing Systems 16, pp. 33–40, 2003.
@inproceedings{graepel2003sdm,
title = {Invariant Pattern Recognition by Semidefinite Programming Machines},
author = {Thore Graepel and Ralf Herbrich},
url = {https://www.herbrich.me/papers/sdpm.pdf},
year = {2003},
date = {2003-01-01},
booktitle = {Advances in Neural Information Processing Systems 16},
pages = {33–40},
abstract = {Knowledge about local invariances with respect to given pattern transformations can greatly improve the accuracy of classification. Previous approaches are either based on regularisation or on the generation of virtual (transformed) examples. We develop a new framework for learning linear classifiers under known transformations based on semidefinite programming. We present a new learning algorithm - the Semidefinite Programming Machine (SDPM) - which is able to find a maximum margin hyperplane when the training examples are polynomial trajectories instead of single points. The solution is found to be sparse in dual variables and allows to identify those points on the trajectory with minimal real-valued output as virtual support vectors. Extensions to segments of trajectories, to more than one transformation parameter, and to learning with kernels are discussed. In experiments we use a Taylor expansion to locally approximate rotational invariance in pixel images from USPS and find improvements over known methods.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Graepel, Thore; Herbrich, Ralf; Kharechko, Andriy; Shawe-Taylor, John
Semi-Definite Programming by Perceptron Learning Proceedings Article
In: Advances in Neural Information Processing Systems 16, pp. 457–464, The MIT Press, 2003.
@inproceedings{graepel2003sdpwithperceptron,
title = {Semi-Definite Programming by Perceptron Learning},
author = {Thore Graepel and Ralf Herbrich and Andriy Kharechko and John Shawe-Taylor},
url = {https://www.herbrich.me/papers/sdpm-pla.pdf},
year = {2003},
date = {2003-01-01},
booktitle = {Advances in Neural Information Processing Systems 16},
pages = {457–464},
publisher = {The MIT Press},
abstract = {We present a modified version of the perceptron learning algorithm (PLA) which solves semidefinite programs (SDPs) in polynomial time. The algorithm is based on the following three observations: (i) Semidefinite programs are linear programs with infinitely many (linear) constraints; (ii) every linear program can be solved by a sequence of constraint satisfaction problems with linear constraints; (iii) in general, the perceptron learning algorithm solves a constraint satisfaction problem with linear constraints in finitely many updates. Combining the PLA with a probabilistic rescaling algorithm (which, on average, increases the size of the feasable region) results in a probabilistic algorithm for solving SDPs that runs in polynomial time. We present preliminary results which demonstrate that the algorithm works, but is not competitive with state-of-the-art interior point methods.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Gretton, Arthur; Herbrich, Ralf; Smola, Alexander
The Kernel Mutual Information Proceedings Article
In: Proceedings of IEEE Internaltional Conference on Acoustics, Speech and Signal Processing, pp. 880–883, 2003.
@inproceedings{gretton2003kmi,
title = {The Kernel Mutual Information},
author = {Arthur Gretton and Ralf Herbrich and Alexander Smola},
url = {https://www.herbrich.me/papers/GreHerSmo03.pdf},
year = {2003},
date = {2003-01-01},
booktitle = {Proceedings of IEEE Internaltional Conference on Acoustics, Speech and Signal Processing},
pages = {880–883},
abstract = {We introduce a new contrast function, the kernel mutual information (KMI), to measure the degree of independence of continuous random variables. This contrast function provides an approximate upper bound on the mutual information, as measured near independence, and is based on a kernel density estimate of the mutual information between a discretised approximation of the continuous random variables. We show that Bach and Jordan's kernel generalised variance (KGV) is also an upper bound on the same kernel density estimate, but is looser. Finally, we suggest that the addition of a regularising term in the KGV causes it to approach the KMI, which motivates the introduction of this regularisation.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Harrington, Edward; Herbrich, Ralf; Kivinen, Jyrki; Platt, John C; Williamson, Robert C
Online Bayes Point Machines Proceedings Article
In: Proceedings of Advances in Knowledge Discovery and Data Mining, pp. 241–252, 2003.
@inproceedings{harrington2003onlinebpm,
title = {Online Bayes Point Machines},
author = {Edward Harrington and Ralf Herbrich and Jyrki Kivinen and John C Platt and Robert C Williamson},
url = {https://www.herbrich.me/papers/OBPM.pdf},
year = {2003},
date = {2003-01-01},
booktitle = {Proceedings of Advances in Knowledge Discovery and Data Mining},
pages = {241–252},
abstract = {We present a new and simple algorithm for learning large margin classifiers that works in a truly online manner. The algorithm generates a linear classifier by averaging the weights associated with several perceptron-like algorithms run in parallel in order to approximate the Bayes point. A random subsample of the incoming data stream is used to ensure diversity in the perceptron solutions. We experimentally study the algorithm's performance on online and batch learning settings. The online experiments showed that our algorithm produces a low prediction error on the training sequence and tracks the presence of concept drift. On the batch problems its performance is comparable to the maximum margin algorithm which explicitly maximises the margin.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Herbrich, Ralf; Graepel, Thore
Introduction to the Special Issue on Learning Theory Journal Article
In: Journal of Machine Learning Research, vol. 4, pp. 755–757, 2003.
@article{herbrich2003intro,
title = {Introduction to the Special Issue on Learning Theory},
author = {Ralf Herbrich and Thore Graepel},
url = {https://www.herbrich.me/papers/lt.pdf},
year = {2003},
date = {2003-01-01},
journal = {Journal of Machine Learning Research},
volume = {4},
pages = {755–757},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
2002
Herbrich, Ralf
Learning Kernel Classifiers: Theory and Algorithms Book
2nd edition, The MIT Press, 2002.
@book{herbrich2002learningkernelclassifiers,
title = {Learning Kernel Classifiers: Theory and Algorithms},
author = {Ralf Herbrich},
year = {2002},
date = {2002-01-01},
publisher = {The MIT Press},
edition = {2nd edition},
keywords = {},
pubstate = {published},
tppubtype = {book}
}