multi-modal

Gaze-assisted automatic captioning of fetal ultrasound videos using three-way multi-modal deep neural networks

BibTeX @article{ALSHARID2022102630, title = {Gaze-assisted automatic captioning of fetal ultrasound videos using three-way multi-modal deep neural networks}, journal = {Medical Image Analysis}, volume = {82}, pages = {102630}, year = {2022}, issn = {1361-8415}, doi = {https://doi.org/10.1016/j.media.2022.102630}, url = {https://www.sciencedirect.com/science/article/pii/S1361841522002584}, author = {Mohammad Alsharid and Yifan Cai and Harshita Sharma and Lior Drukker and Aris T. Papageorghiou and J. Alison Noble}, keywords = {Video captioning, Gaze tracking, Fetal ultrasound, Audio–visual, Multi-modal}, abstract = {In this work, we present a novel gaze-assisted natural language processing (NLP)-based video captioning model to describe routine second-trimester fetal ultrasound scan videos in a vocabulary of spoken sonography.