@inproceedings{682b7a5057a5474ca0af670b09179634,
title = "Can Gaze Inform Egocentric Action Recognition?",
abstract = "We investigate the hypothesis that gaze-signal can improve egocentric action recognition on the standard benchmark, EGTEA Gaze++ dataset. In contrast to prior work where gaze-signal was only used during training, we formulate a novel neural fusion approach, Cross-modality Attention Blocks (CMA), to leverage gaze-signal for action recognition during inference as well. CMA combines information from different modalities at different levels of abstraction to achieve state-of-the-art performance for egocentric action recognition. Specifically, fusing the video-stream with optical-flow with CMA outperforms the current state-of-the-art by 3%. However, when CMA is employed to fuse gaze-signal with video-stream data, no improvements are observed. Further investigation of this counter-intuitive finding indicates that small spatial overlap between the network's attention-map and gaze ground-truth renders the gaze-signal uninformative for this benchmark. Based on our empirical findings, we recommend improvements to the current benchmark to develop practical systems for egocentric video understanding with gaze-signal.",
keywords = "attention, deep neural networks, egocentric action recognition, gaze",
author = "Zehua Zhang and David Crandall and Michael Proulx and Sachin Talathi and Abhishek Sharma",
note = "Publisher Copyright: {\textcopyright} 2022 ACM.; 2022 ACM Symposium on Eye Tracking Research and Applications, ETRA 2022 ; Conference date: 08-06-2022 Through 11-06-2022",
year = "2022",
month = jun,
day = "8",
doi = "10.1145/3517031.3529628",
language = "English",
series = "Eye Tracking Research and Applications Symposium (ETRA)",
publisher = "Association for Computing Machinery",
editor = "Spencer, {Stephen N.}",
booktitle = "Proceedings - ETRA 2022",
address = "USA United States",
}