% ECCV 2024 conference paper, published in the Springer LNCS proceedings series.
% Names are stored in "Last, First" form; non-ASCII characters use LaTeX escapes
% so the entry stays usable with classic 8-bit BibTeX.
@inproceedings{c10ea249c87b4479bca357febde3311c,
  author    = {{Al Kader Hammoud}, Hasan Abed and Das, Tuhin and Pizzati, Fabio and Torr, Philip H. S. and Bibi, Adel and Ghanem, Bernard},
  title     = {On Pretraining Data Diversity for Self-Supervised Learning},
  booktitle = {Computer Vision -- {ECCV} 2024 - 18th European Conference, Proceedings},
  editor    = {Leonardis, Ale{\v s} and Ricci, Elisa and Roth, Stefan and Russakovsky, Olga and Sattler, Torsten and Varol, G{\"u}l},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  pages     = {54--71},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2025},
  doi       = {10.1007/978-3-031-72992-8_4},
  isbn      = {9783031729911},
  language  = {English (US)},
  keywords  = {Data diversity, Distribution shift, Self-supervised learning},
  abstract  = {We explore the impact of training with more diverse datasets, characterized by the number of unique samples, on the performance of self-supervised learning (SSL) under a fixed computational budget. Our findings demonstrate that increasing pretraining data diversity enhances SSL performance, albeit only when the distribution distance to the downstream data is minimal. Notably, even with an exceptionally large pretraining data diversity achieved through methods like web crawling or diffusion-generated data, among other ways, the distribution shift remains a challenge. Our experiments are comprehensive with seven SSL methods using large-scale datasets such as ImageNet and YFCC100M amounting to over 200 GPU days. The code and trained models will be available at https://github.com/hammoudhasan/DiversitySSL.},
  note      = {Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Switzerland AG 2025.; 18th European Conference on Computer Vision, ECCV 2024 ; Conference date: 29-09-2024 Through 04-10-2024},
}