@inproceedings{3c1f68fae64e49fc9da575cc506a35d5,
title = "OmNICCL: Zero-cost Sparse AllReduce with Direct Cache Access and SmartNICs",
abstract = "AllReduce is a collective communication pattern commonly used in Distributed Deep Learning (DDL) and High Performance Computing (HPC). Sparse AllReduce, which compresses the data transmitted, achieves significant acceleration on specific workloads. However, compression introduces a non-negligible performance overhead. Therefore, we propose the OmNICreduce algorithm, an efficient inter-node sparse AllReduce method, as well as its implementation, OmNICCL. It utilizes Direct Cache Access (DCA) to achieve zero-overhead lossless compression and employs SmartNICs for aggregation on the data plane. We demonstrate that our method can provide up to a 7.24× speedup over conventional dense AllReduce methods under a 100Gbps RoCEv2 network and 1.76-17.37× performance improvement over state-of-the-art implementations when performing sparse AllReduce.",
keywords = "Collective Communication, DCA, DPU, In-Network Aggregation, SmartNIC",
author = "Tongzhou Gu and Jiawei Fei and Marco Canini",
note = "Publisher Copyright: {\textcopyright} 2024 Owner/Author.; 1st Workshop on Networks for AI Computing, NAIC 2024 ; Conference date: 04-08-2024 Through 08-08-2024",
year = "2024",
month = aug,
day = "4",
doi = "10.1145/3672198.3673804",
language = "English (US)",
series = "NAIC 2024 - Proceedings of the 2024 SIGCOMM Workshop on Networks for AI Computing",
publisher = "Association for Computing Machinery, Inc",
pages = "75--83",
booktitle = "NAIC 2024 - Proceedings of the 2024 SIGCOMM Workshop on Networks for AI Computing",
}