@inproceedings{4c3513171bbc42a0a40889138b7c21c7,
title = "HORUS: multimodal large language models framework for video retrieval at VBS 2025",
abstract = "In the dynamic field of video retrieval, precise and effective search methods are crucial for managing complex datasets. We present HORUS, a novel approach based on multimodal Large Language Models (mLLMs) that advances video retrieval capabilities through two key innovations: (1) advanced multi-modal feature aggregation, integrating text-to-image search with CLIP, free-text search from captions generated by Video-LLaMA2, and visual features from Video-LLaMA to capture temporal dynamics; and (2) GPT-based query expansion, combined with an advanced filter, addresses issues with low-quality open-ended text queries and refines item searches based on type and location within a scene. This work provides cutting-edge solutions for the VBS 2025 challenge and offers valuable insights into enhancing video search techniques.",
keywords = "Video browser showdown, Video retrieval, Multi-modal feature aggregation",
author = "Tai Nguyen and Anh, {Vo Ngoc Minh} and Pham, {Duc Dat} and Vinh, {Tran Quang} and Quynh, {Nhu Duong Thi} and Tien, {Le Anh} and Le, {Tan Duy} and Nguyen, {Binh T.}",
note = "Funding: Acknowledgments. The work was partially supported by Mr. Hoang Anh Le from OCANY VIET NAM VENTURE CO., LTD.",
year = "2025",
month = jan,
day = "1",
doi = "10.1007/978-981-96-2074-6_34",
language = "English",
isbn = "9789819620739",
series = "Lecture notes in computer science",
publisher = "Springer Nature Singapore",
pages = "286--293",
editor = "Ichiro Ide and Ioannis Kompatsiaris and Changsheng Xu and Keiji Yanai and Wei-Ta Chu and Naoko Nitta and Michael Riegler and Toshihiko Yamasaki",
booktitle = "MultiMedia modeling",
}