<?xml version="1.0" encoding="utf-8" standalone="yes"?><rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom"><channel><title>Yifan Cai | Harshita Sharma</title><link>https://www.drharshitasharma.com/authors/yifan-cai/</link><atom:link href="https://www.drharshitasharma.com/authors/yifan-cai/index.xml" rel="self" type="application/rss+xml"/><description>Yifan Cai</description><generator>Source Themes Academic (https://sourcethemes.com/academic/)</generator><language>en-us</language><copyright>&amp;copy; 2024 Dr. Harshita Sharma. All Rights Reserved.</copyright><lastBuildDate>Tue, 01 Nov 2022 00:00:00 +0000</lastBuildDate><item><title>Gaze-assisted automatic captioning of fetal ultrasound videos using three-way multi-modal deep neural networks</title><link>https://www.drharshitasharma.com/publication/alsharid_b_2022/</link><pubDate>Tue, 01 Nov 2022 00:00:00 +0000</pubDate><guid>https://www.drharshitasharma.com/publication/alsharid_b_2022/</guid><description>&lt;hr>
&lt;h1 id="bibtex">BibTeX&lt;/h1>
&lt;pre>&lt;code>@article{ALSHARID2022102630,
title = {Gaze-assisted automatic captioning of fetal ultrasound videos using three-way multi-modal deep neural networks},
journal = {Medical Image Analysis},
volume = {82},
pages = {102630},
year = {2022},
issn = {1361-8415},
doi = {10.1016/j.media.2022.102630},
url = {https://www.sciencedirect.com/science/article/pii/S1361841522002584},
author = {Mohammad Alsharid and Yifan Cai and Harshita Sharma and Lior Drukker and Aris T. Papageorghiou and J. Alison Noble},
keywords = {Video captioning, Gaze tracking, Fetal ultrasound, Audio–visual, Multi-modal},
abstract = {In this work, we present a novel gaze-assisted natural language processing (NLP)-based video captioning model to describe routine second-trimester fetal ultrasound scan videos in a vocabulary of spoken sonography. The primary novelty of our multi-modal approach is that the learned video captioning model is built using a combination of ultrasound video, tracked gaze and textual transcriptions from speech recordings. The textual captions that describe the spatio-temporal scan video content are learnt from sonographer speech recordings. The generation of captions is assisted by sonographer gaze-tracking information reflecting their visual attention while performing live-imaging and interpreting a frozen image. To evaluate the effect of adding, or withholding, different forms of gaze on the video model, we compare spatio-temporal deep networks trained using three multi-modal configurations, namely: (1) a gaze-less neural network with only text and video as input, (2) a neural network additionally using real sonographer gaze in the form of attention maps, and (3) a neural network using automatically-predicted gaze in the form of saliency maps instead. We assess algorithm performance through established general text-based metrics (BLEU, ROUGE-L, F1 score), a domain-specific metric (ARS), and metrics that consider the richness and efficiency of the generated captions with respect to the scan video. Results show that the proposed gaze-assisted models can generate richer and more diverse captions for clinical fetal ultrasound scan videos than those without gaze at the expense of the perceived sentence structure. The results also show that the generated captions are similar to sonographer speech in terms of discussing the visual content and the scanning actions performed.}
}
&lt;/code>&lt;/pre>
&lt;hr></description></item><item><title>Multimodal Continual Learning with Sonographer Eye-Tracking in Fetal Ultrasound</title><link>https://www.drharshitasharma.com/publication/patra-multimodal-2021/</link><pubDate>Thu, 30 Sep 2021 00:00:00 +0000</pubDate><guid>https://www.drharshitasharma.com/publication/patra-multimodal-2021/</guid><description>&lt;hr>
&lt;h1 id="bibtex">BibTeX&lt;/h1>
&lt;pre>&lt;code>@inproceedings{patra2021multimodal,
author = {Patra, Arijit and Cai, Yifan and Chatelain, Pierre and Sharma, Harshita and Drukker, Lior and Papageorghiou, Aris T. and Noble, J. Alison},
editor = {Noble, J. Alison and Aylward, Stephen and Grimwood, Alexander and Min, Zhe and Lee, Su-Lin and Hu, Yipeng},
title = {Multimodal Continual Learning with Sonographer Eye-Tracking in Fetal Ultrasound},
booktitle = {Simplifying Medical Ultrasound},
year = {2021},
publisher = {Springer International Publishing},
address = {Cham},
pages = {14--24},
abstract = {Deep networks have been shown to achieve impressive accuracy for some medical image analysis tasks where large datasets and annotations are available. However, tasks involving learning over new sets of classes arriving over extended time is a different and difficult challenge due to the tendency of reduction in performance over old classes while adapting to new ones. Controlling such a `forgetting' is vital for deployed algorithms to evolve with new arrivals of data incrementally. Usually, incremental learning approaches rely on expert knowledge in the form of manual annotations or active feedback. In this paper, we explore the role that other forms of expert knowledge might play in making deep networks in medical image analysis immune to forgetting over extended time. We introduce a novel framework for mitigation of this forgetting effect in deep networks considering the case of combining ultrasound video with point-of-gaze tracked for expert sonographers during model training. This is used along with a novel weighted distillation strategy to reduce the propagation of effects due to class imbalance.},
isbn = {978-3-030-87583-1}
}
&lt;/code>&lt;/pre>
&lt;hr></description></item><item><title>Spatio-Temporal Visual Attention Modelling of Standard Biometry Plane-Finding Navigation</title><link>https://www.drharshitasharma.com/publication/cai-spatio-2020/</link><pubDate>Sat, 20 Jun 2020 00:00:00 +0000</pubDate><guid>https://www.drharshitasharma.com/publication/cai-spatio-2020/</guid><description>&lt;hr>
&lt;h1 id="bibtex">BibTeX&lt;/h1>
&lt;pre>&lt;code>@article{cai2020spatio,
title = {Spatio-temporal visual attention modelling of standard biometry plane-finding navigation},
journal = {Medical Image Analysis},
volume = {65},
pages = {101762},
year = {2020},
issn = {1361-8415},
doi = {10.1016/j.media.2020.101762},
url = {https://www.sciencedirect.com/science/article/pii/S1361841520301262},
author = {Yifan Cai and Richard Droste and Harshita Sharma and Pierre Chatelain and Lior Drukker and Aris T. Papageorghiou and J. Alison Noble},
keywords = {Fetal ultrasound, Gaze tracking, Multi-task learning, Saliency prediction, Standard plane detection}
}
&lt;/code>&lt;/pre>
&lt;hr></description></item><item><title>Towards Capturing Sonographic Experience: Cognition-Inspired Ultrasound Video Saliency Prediction</title><link>https://www.drharshitasharma.com/publication/droste-towards-2020/</link><pubDate>Fri, 01 Nov 2019 00:00:00 +0000</pubDate><guid>https://www.drharshitasharma.com/publication/droste-towards-2020/</guid><description>&lt;hr>
&lt;h1 id="bibtex">BibTeX&lt;/h1>
&lt;pre>&lt;code>@inproceedings{droste_towards_2020,
title = {Towards Capturing Sonographic Experience: Cognition-Inspired Ultrasound Video Saliency Prediction},
author = {Droste, Richard and Cai, Yifan and Sharma, Harshita and Chatelain, Pierre and Papageorghiou, Aris T. and Noble, J. Alison},
booktitle = {Medical Image Understanding and Analysis},
doi = {10.1007/978-3-030-39343-4_15},
editor = {Zheng, Yalin and Williams, Bryan M. and Chen, Ke},
isbn = {978-3-030-39343-4},
keywords = {Convolutional neural networks, Fetal ultrasound, Gaze tracking, Video saliency prediction},
language = {en},
pages = {174--186},
publisher = {Springer International Publishing},
address = {Cham},
series = {Communications in Computer and Information Science},
shorttitle = {Towards Capturing Sonographic Experience},
year = {2020}
}
&lt;/code>&lt;/pre>
&lt;hr></description></item><item><title>Ultrasound Image Representation Learning by Modeling Sonographer Visual Attention</title><link>https://www.drharshitasharma.com/publication/droste-ultrasound-2019/</link><pubDate>Sat, 01 Jun 2019 00:00:00 +0000</pubDate><guid>https://www.drharshitasharma.com/publication/droste-ultrasound-2019/</guid><description>&lt;hr>
&lt;h1 id="bibtex">BibTeX&lt;/h1>
&lt;pre>&lt;code>@inproceedings{droste_ultrasound_2019,
author = {Droste, Richard and Cai, Yifan and Sharma, Harshita and Chatelain, Pierre and Drukker, Lior and Papageorghiou, Aris T. and Noble, J. Alison},
title = {Ultrasound Image Representation Learning by Modeling Sonographer Visual Attention},
booktitle = {Information Processing in Medical Imaging},
address = {Cham},
doi = {10.1007/978-3-030-20351-1_46},
editor = {Chung, Albert C. S. and Gee, James C. and Yushkevich, Paul A. and Bao, Siqi},
isbn = {978-3-030-20351-1},
language = {en},
pages = {592--604},
publisher = {Springer International Publishing},
series = {Lecture Notes in Computer Science},
year = {2019}
}
&lt;/code>&lt;/pre>
&lt;hr></description></item><item><title>Monitoring Sonographer Performance: The Perception Ultrasound by Learning Sonographer Experience (PULSE) study</title><link>https://www.drharshitasharma.com/publication/drukker-pulse-2019/</link><pubDate>Mon, 01 Apr 2019 00:00:00 +0000</pubDate><guid>https://www.drharshitasharma.com/publication/drukker-pulse-2019/</guid><description/></item><item><title>Efficient Ultrasound Image Analysis Models with Sonographer Gaze Assisted Distillation</title><link>https://www.drharshitasharma.com/publication/patra-efficient-2019/</link><pubDate>Tue, 01 Jan 2019 00:00:00 +0000</pubDate><guid>https://www.drharshitasharma.com/publication/patra-efficient-2019/</guid><description>&lt;hr>
&lt;h1 id="bibtex">BibTeX&lt;/h1>
&lt;pre>&lt;code>@inproceedings{patra_efficient_2019,
title = {Efficient Ultrasound Image Analysis Models with Sonographer Gaze Assisted Distillation},
author = {Patra, Arijit and Cai, Yifan and Chatelain, Pierre and Sharma, Harshita and Drukker, Lior and Papageorghiou, Aris T. and Noble, J. Alison},
booktitle = {Medical Image Computing and Computer Assisted Intervention – MICCAI 2019},
doi = {10.1007/978-3-030-32251-9_43},
editor = {Shen, Dinggang and Liu, Tianming and Peters, Terry M. and Staib, Lawrence H. and Essert, Caroline and Zhou, Sean and Yap, Pew-Thian and Khan, Ali},
isbn = {978-3-030-32251-9},
language = {en},
pages = {394--402},
publisher = {Springer International Publishing},
series = {Lecture Notes in Computer Science},
address = {Cham},
year = {2019}
}
&lt;/code>&lt;/pre>
&lt;hr></description></item><item><title>Multi-task SonoEyeNet: Detection of Fetal Standardized Planes Assisted by Generated Sonographer Attention Maps</title><link>https://www.drharshitasharma.com/publication/cai-multi-task-2018/</link><pubDate>Mon, 01 Jan 2018 00:00:00 +0000</pubDate><guid>https://www.drharshitasharma.com/publication/cai-multi-task-2018/</guid><description>&lt;hr>
&lt;h1 id="bibtex">BibTeX&lt;/h1>
&lt;pre>&lt;code>@inproceedings{cai_multi-task_2018,
title = {Multi-task SonoEyeNet: Detection of Fetal Standardized Planes Assisted by Generated Sonographer Attention Maps},
author = {Cai, Yifan and Sharma, Harshita and Chatelain, Pierre and Noble, J. Alison},
booktitle = {Medical Image Computing and Computer Assisted Intervention – MICCAI 2018},
doi = {10.1007/978-3-030-00928-1_98},
editor = {Frangi, Alejandro F. and Schnabel, Julia A. and Davatzikos, Christos and Alberola-López, Carlos and Fichtinger, Gabor},
isbn = {978-3-030-00928-1},
language = {en},
pages = {871--879},
publisher = {Springer International Publishing},
series = {Lecture Notes in Computer Science},
shorttitle = {Multi-task SonoEyeNet},
year = {2018},
address = {Cham}
}
&lt;/code>&lt;/pre>
&lt;hr></description></item><item><title>SonoEyeNet: Standardized fetal ultrasound plane detection informed by eye tracking</title><link>https://www.drharshitasharma.com/publication/cai-sonoeyenet-2018/</link><pubDate>Mon, 01 Jan 2018 00:00:00 +0000</pubDate><guid>https://www.drharshitasharma.com/publication/cai-sonoeyenet-2018/</guid><description>&lt;hr>
&lt;h1 id="bibtex">BibTeX&lt;/h1>
&lt;pre>&lt;code>@inproceedings{cai_sonoeyenet_2018,
title = {SonoEyeNet: Standardized fetal ultrasound plane detection informed by eye tracking},
author = {Cai, Y. and Sharma, H. and Chatelain, P. and Noble, J. A.},
booktitle = {2018 IEEE 15th International Symposium on Biomedical Imaging (ISBI 2018)},
doi = {10.1109/ISBI.2018.8363851},
note = {ISSN: 1945-8452},
pages = {1475--1478},
shorttitle = {SonoEyeNet},
year = {2018}
}
&lt;/code>&lt;/pre>
&lt;hr></description></item></channel></rss>