% Conference paper published in the "MultiMedia Modeling" proceedings
% (Lecture Notes in Computer Science series, Springer).
% The citation key is left unchanged so existing citations keep resolving.
% Names are stored as "Family, Given"; the family/given split of each name
% is carried over as-is from the previous version of this entry.
% NOTE(review): no LNCS volume number is recorded here - add a volume field
% once it has been confirmed against the publisher record.
% NOTE(review): day = 1 may be a placeholder rather than the real date - confirm.
@inproceedings{4c3513171bbc42a0a40889138b7c21c7,
  author    = {Nguyen, Tai and Anh, Vo Ngoc Minh and Pham, Duc Dat and Vinh, Tran Quang and Quynh, Nhu Duong Thi and Tien, Le Anh and Le, Tan Duy and Nguyen, Binh T.},
  title     = {{HORUS}: Multimodal Large Language Models Framework for Video Retrieval at {VBS} 2025},
  booktitle = {{MultiMedia} Modeling},
  editor    = {Ide, Ichiro and Kompatsiaris, Ioannis and Xu, Changsheng and Yanai, Keiji and Chu, Wei-Ta and Nitta, Naoko and Riegler, Michael and Yamasaki, Toshihiko},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Nature Singapore},
  year      = {2025},
  month     = jan,
  day       = {1},
  pages     = {286--293},
  doi       = {10.1007/978-981-96-2074-6_34},
  isbn      = {9789819620739},
  language  = {English},
  keywords  = {Video browser showdown, Video retrieval, Multi-modal feature aggregation},
  note      = {Funding: Acknowledgments. The work was partially supported by Mr. Hoang Anh Le from OCANY VIET NAM VENTURE CO., LTD.},
  abstract  = {In the dynamic field of video retrieval, precise and effective search methods are crucial for managing complex datasets. We present HORUS, a novel approach based on multimodal Large Language Models (mLLMs) that advances video retrieval capabilities through two key innovations: (1) advanced multi-modal feature aggregation, integrating text-to-image search with CLIP, free-text search from captions generated by Video-LLaMA2, and visual features from Video-LLaMA to capture temporal dynamics; and (2) GPT-based query expansion, combined with an advanced filter, addresses issues with low-quality open-ended text queries and refines item searches based on type and location within a scene. This work provides cutting-edge solutions for the VBS 2025 challenge and offers valuable insights into enhancing video search techniques.},
}