% Conference paper published in the DAGM GCPR 2019 proceedings (Springer LNCS).
% The citation key is kept unchanged so that existing citations keep resolving.
% Funding, copyright and event details that the original export packed into
% "note" now live in their own fields so they are not printed verbatim in the
% bibliography (classic BibTeX styles ignore fields they do not know).
% NOTE(review): the LNCS volume number is not recorded in this entry; add a
% "volume" field after confirming it on the publisher's page.
% NOTE(review): "address" is the publisher's city (Cham assumed from the
% Springer Nature Switzerland AG copyright line) -- confirm.
@inproceedings{b6ec815eff8a4869811b8bf74765ef3f,
  author     = {Zhu, Deyao and Munderloh, Marco and Rosenhahn, Bodo and St{\"u}ckler, J{\"o}rg},
  title      = {Learning to Disentangle Latent Physical Factors for Video Prediction},
  booktitle  = {Pattern Recognition -- 41st {DAGM} German Conference, {DAGM} {GCPR} 2019, Proceedings},
  editor     = {Fink, Gernot A. and Frintrop, Simone and Jiang, Xiaoyi},
  series     = {Lecture Notes in Computer Science},
  publisher  = {Springer},
  address    = {Cham},
  pages      = {595--608},
  year       = {2019},
  doi        = {10.1007/978-3-030-33676-9_42},
  isbn       = {9783030336752},
  language   = {English (US)},
  eventtitle = {41st {DAGM} German Conference on Pattern Recognition, {DAGM} {GCPR} 2019},
  eventdate  = {2019-09-10/2019-09-13},
  funding    = {This work has been supported through Cyber Valley.},
  copyright  = {{\textcopyright} Springer Nature Switzerland AG 2019},
  abstract   = {Physical scene understanding is a fundamental human ability. Empowering artificial systems with such understanding is an important step towards flexible and adaptive behavior in the real world. As a step in this direction, we propose a novel approach to physical scene understanding in video. We train a deep neural network for video prediction which embeds the video sequence in a low-dimensional recurrent latent space representation. We optimize the total correlation of the latent dimensions within a variational recurrent auto-encoder framework. This encourages the representation to disentangle the latent physical factors of variation in the training data. To train and evaluate our approach, we use synthetic video sequences in three different physical scenarios with various degrees of difficulty. Our experiments demonstrate that our model can disentangle several appearance-related properties in the unsupervised case. If we add supervision signals for the latent code, our model can further improve the disentanglement of dynamics-related properties.},
}