% T2ICount (CVPR 2025 conference paper). Entry cleaned up from a repository export:
%  - author names use the unambiguous "Last, First" form; escaped braces that
%    would have printed literally were removed,
%  - case-sensitive tokens in title/booktitle/series are brace-protected,
%  - OCR errors in the abstract were repaired (method name, "denoising", code URL).
% NOTE(review): "address" holds a country rather than the publisher's city --
% confirm the intended value against the publisher record before relying on it.
@inproceedings{5c5c21ceb7ae4642b4f75c33347e08e3,
  author    = {Qian, Yifei and Guo, Zhongliang and Deng, Bowen and Lei, Chun Tong and Zhao, Shuai and Lau, Chun Pong and Hong, Xiaopeng and Pound, Michael},
  title     = {{T2ICount}: Enhancing Cross-Modal Understanding for Zero-Shot Counting},
  booktitle = {2025 {IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  series    = {{IEEE/CVF} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  publisher = {IEEE Computer Society},
  address   = {United States},
  pages     = {25336--25345},
  year      = {2025},
  month     = aug,
  day       = {13},
  doi       = {10.1109/CVPR52734.2025.02359},
  isbn      = {9798331543655},
  language  = {English},
  note      = {Funding: This work was supported by the Biotechnology and Biological Sciences Research Council (grant number BB/Y513908/1). This work was also funded by the National Natural Science Foundation of China (62376070, 62076195).},
  abstract  = {Zero-shot object counting aims to count instances of arbitrary object categories specified by text descriptions. Existing methods typically rely on vision-language models like CLIP, but often exhibit limited sensitivity to text prompts. We present T2ICount, a diffusion-based framework that leverages rich prior knowledge and fine-grained visual understanding from pretrained diffusion models. While one-step denoising ensures efficiency, it leads to weakened text sensitivity. To address this challenge, we propose a Hierarchical Semantic Correction Module that progressively refines text-image feature alignment, and a Representational Regional Coherence Loss that provides reliable supervision signals by leveraging the cross-attention maps extracted from the denoising U-Net. Furthermore, we observe that current benchmarks mainly focus on majority objects in images, potentially masking models' text sensitivity. To address this, we contribute a challenging re-annotated subset of FSC147 for better evaluation of text-guided counting ability. Extensive experiments demonstrate that our method achieves superior performance across different benchmarks. Code is available at https://github.com/cha15yq/T2ICount.},
}