@inproceedings{4422f2e09ea947a7996c93d6f88a4b13,
title = "RelTransformer: A Transformer-Based Long-Tail Visual Relationship Recognition",
abstract = "The visual relationship recognition (VRR) task aims at understanding the pairwise visual relationships between interacting objects in an image. These relationships typically have a long-tail distribution due to their compositional nature. This problem gets more severe when the vocabulary becomes large, rendering this task very challenging. This paper shows that modeling an effective message-passing flow through an attention mechanism can be critical to tackling the compositionality and long-tail challenges in VRR. The method, called RelTransformer, represents each image as a fully-connected scene graph and restructures the whole scene into the relation-triplet and global-scene contexts. It directly passes the message from each element in the relation-triplet and global-scene contexts to the target relation via self-attention. We also design a learnable memory to augment the long-tail relation representation learning. Through extensive experiments, we find that our model generalizes well on many VRR benchmarks. Our model outperforms the best-performing models on two large-scale long-tail VRR benchmarks, VG8K-LT (+2.0% overall acc) and GQA-LT (+26.0% overall acc), both having a highly skewed distribution towards the tail. It also achieves strong results on the VG200 relation detection task. Our code is available at https://github.com/Vision-CAIR/ReITransformer.",
keywords = "Scene analysis and understanding, Transfer/low-shot/long-tail learning",
author = "Jun Chen and Aniket Agarwal and Sherif Abdelkarim and Deyao Zhu and Mohamed Elhoseiny",
note = "Publisher Copyright: {\textcopyright} 2022 IEEE.; 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022 ; Conference date: 19-06-2022 Through 24-06-2022",
year = "2022",
doi = "10.1109/CVPR52688.2022.01890",
language = "English (US)",
series = "Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition",
publisher = "IEEE Computer Society",
pages = "19485--19495",
booktitle = "Proceedings - 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022",
address = "United States",
}