{"id":"https:\/\/openalex.org\/W4416260731","doi":"https:\/\/doi.org\/10.48550\/arxiv.2506.21924","title":"SPAZER: Spatial-Semantic Progressive Reasoning Agent for Zero-shot 3D Visual Grounding","display_name":"SPAZER: Spatial-Semantic Progressive Reasoning Agent for Zero-shot 3D Visual Grounding","publication_year":2025,"publication_date":"2025-06-27","ids":{"openalex":"https:\/\/openalex.org\/W4416260731","doi":"https:\/\/doi.org\/10.48550\/arxiv.2506.21924"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2506.21924","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2506.21924","pdf_url":"https:\/\/arxiv.org\/pdf\/2506.21924","source":{"id":"https:\/\/openalex.org\/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https:\/\/arxiv.org\/pdf\/2506.21924","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Jin, Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jin, Zhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5008273094","display_name":"Rong-Cheng Tu","orcid":"https:\/\/orcid.org\/0000-0002-9567-159X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tu, Rong-Cheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5101872743","display_name":"Jingyi Liao","orcid":"https:\/\/orcid.org\/0009-0003-3975-3675"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liao, Jingyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5034826017","display_name":"Wenhao Sun","orcid":"https:\/\/orcid.org\/0000-0002-8416-455X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Wenhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Luo, Xiao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Luo, Xiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5090854835","display_name":"Shunyu Liu","orcid":"https:\/\/orcid.org\/0000-0003-0584-9129"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Shunyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https:\/\/openalex.org\/A5108378610","display_name":"Dacheng Tao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tao, Dacheng","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":7,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https:\/\/openalex.org\/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9233999848365784,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},"topics":[{"id":"https:\/\/openalex.org\/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9233999848365784,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T10036","display_name":"Advanced Neural Network Applications","score":0.025499999523162842,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T10812","display_name":"Human Pose and Action Recognition","score":0.012799999676644802,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https:\/\/openalex.org\/keywords\/visual-reasoning","display_name":"Visual reasoning","score":0.7299000024795532},{"id":"https:\/\/openalex.org\/keywords\/limiting","display_name":"Limiting","score":0.5784000158309937},{"id":"https:\/\/openalex.org\/keywords\/bridging","display_name":"Bridging (networking)","score":0.5723999738693237},{"id":"https:\/\/openalex.org\/keywords\/rendering","display_name":"Rendering (computer graphics)","score":0.5095000267028809},{"id":"https:\/\/openalex.org\/keywords\/ground","display_name":"Ground","score":0.4399999976158142},{"id":"https:\/\/openalex.org\/keywords\/modalities","display_name":"Modalities","score":0.4341000020503998},{"id":"https:\/\/openalex.org\/keywords\/visualization","display_name":"Visualization","score":0.4106000065803528},{"id":"https:\/\/openalex.org\/keywords\/active-vision","display_name":"Active vision","score":0.40389999747276306},{"id":"https:\/\/openalex.org\/keywords\/exploit","display_name":"Exploit","score":0.39570000767707825}],"concepts":[{"id":"https:\/\/openalex.org\/C41008148","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q21198","display_name":"Computer science","level":0,"score":0.7534000277519226},{"id":"https:\/\/openalex.org\/C2777508537","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7936620","display_name":"Visual reasoning","level":2,"score":0.7299000024795532},{"id":"https:\/\/openalex.org\/C154945302","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6514999866485596},{"id":"https:\/\/openalex.org\/C188198153","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1613840","display_name":"Limiting","level":2,"score":0.5784000158309937},{"id":"https:\/\/openalex.org\/C174348530","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q188635","display_name":"Bridging (networking)","level":2,"score":0.5723999738693237},{"id":"https:\/\/openalex.org\/C205711294","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q176953","display_name":"Rendering (computer graphics)","level":2,"score":0.5095000267028809},{"id":"https:\/\/openalex.org\/C168993435","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q6501125","display_name":"Ground","level":2,"score":0.4399999976158142},{"id":"https:\/\/openalex.org\/C2779903281","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q6888026","display_name":"Modalities","level":2,"score":0.4341000020503998},{"id":"https:\/\/openalex.org\/C36464697","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q451553","display_name":"Visualization","level":2,"score":0.4106000065803528},{"id":"https:\/\/openalex.org\/C193611912","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q4677596","display_name":"Active vision","level":2,"score":0.40389999747276306},{"id":"https:\/\/openalex.org\/C165696696","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11287","display_name":"Exploit","level":2,"score":0.39570000767707825},{"id":"https:\/\/openalex.org\/C44291984","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1074173","display_name":"Question answering","level":2,"score":0.35569998621940613},{"id":"https:\/\/openalex.org\/C195324797","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q33742","display_name":"Natural language","level":2,"score":0.35339999198913574},{"id":"https:\/\/openalex.org\/C51632099","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q3985153","display_name":"Training set","level":2,"score":0.34040001034736633},{"id":"https:\/\/openalex.org\/C2779439875","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1078276","display_name":"Natural language understanding","level":3,"score":0.3400000035762787},{"id":"https:\/\/openalex.org\/C119857082","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q2539","display_name":"Machine learning","level":1,"score":0.3384999930858612},{"id":"https:\/\/openalex.org\/C155911833","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q3817354","display_name":"Spatial intelligence","level":2,"score":0.33340001106262207},{"id":"https:\/\/openalex.org\/C31972630","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q844240","display_name":"Computer vision","level":1,"score":0.31209999322891235},{"id":"https:\/\/openalex.org\/C204321447","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q30642","display_name":"Natural language processing","level":1,"score":0.2913999855518341},{"id":"https:\/\/openalex.org\/C50644808","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q192776","display_name":"Artificial neural network","level":2,"score":0.2808000147342682},{"id":"https:\/\/openalex.org\/C159032336","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q2488768","display_name":"Non-monotonic logic","level":2,"score":0.2759999930858612},{"id":"https:\/\/openalex.org\/C107457646","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.27320000529289246},{"id":"https:\/\/openalex.org\/C2984842247","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q197536","display_name":"Deep neural networks","level":3,"score":0.26930001378059387},{"id":"https:\/\/openalex.org\/C115086926","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q17004651","display_name":"Causal reasoning","level":3,"score":0.2646999955177307},{"id":"https:\/\/openalex.org\/C97364631","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q484284","display_name":"Deductive reasoning","level":2,"score":0.26089999079704285},{"id":"https:\/\/openalex.org\/C64754055","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7574053","display_name":"Spatial contextual awareness","level":2,"score":0.2535000145435333},{"id":"https:\/\/openalex.org\/C89288958","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7301504","display_name":"Reasoning system","level":2,"score":0.25119999051094055}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2506.21924","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2506.21924","pdf_url":"https:\/\/arxiv.org\/pdf\/2506.21924","source":{"id":"https:\/\/openalex.org\/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550\/arxiv.2506.21924","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.48550\/arxiv.2506.21924","pdf_url":null,"source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https:\/\/openalex.org\/licenses\/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2506.21924","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2506.21924","pdf_url":"https:\/\/arxiv.org\/pdf\/2506.21924","source":{"id":"https:\/\/openalex.org\/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"3D":[0,11,24,95],"Visual":[1],"Grounding":[2],"(3DVG)":[3],"aims":[4],"to":[5,51,109,130],"localize":[6],"target":[7],"objects":[8],"within":[9],"a":[10,74,82,94,111],"scene":[12,91],"based":[13],"on":[14,22,102,150,155],"natural":[15],"language":[16],"queries.":[17],"To":[18],"alleviate":[19],"the":[20,35,90,98,132],"reliance":[21],"costly":[23],"training":[25,149],"data,":[26],"recent":[27],"studies":[28],"have":[29],"explored":[30],"zero-shot":[31,146,167],"3DVG":[32],"by":[33],"leveraging":[34,118],"extensive":[36],"knowledge":[37],"and":[38,45,92,138,157,174],"powerful":[39],"reasoning":[40,84,140],"capabilities":[41],"of":[42,114,172],"pre-trained":[43],"LLMs":[44],"VLMs.":[46],"However,":[47],"existing":[48],"paradigms":[49],"tend":[50],"emphasize":[52],"either":[53],"spatial":[54,137],"(3D-based)":[55],"or":[56],"semantic":[57,139],"(2D-based)":[58],"understanding,":[59],"limiting":[60],"their":[61],"effectiveness":[62],"in":[63,81,176],"complex":[64],"real-world":[65],"applications.":[66],"In":[67],"this":[68],"work,":[69],"we":[70],"introduce":[71],"SPAZER":[72,143,162],"-":[73],"VLM-driven":[75],"agent":[76],"that":[77,161],"combines":[78],"both":[79],"modalities":[80],"progressive":[83],"framework.":[85],"It":[86],"first":[87],"holistically":[88],"analyzes":[89],"produces":[93],"rendering":[96],"from":[97],"optimal":[99],"viewpoint.":[100],"Based":[101],"this,":[103],"anchor-guided":[104],"candidate":[105],"screening":[106],"is":[107,127],"conducted":[108],"perform":[110],"coarse-level":[112],"localization":[113],"potential":[115],"objects.":[116],"Furthermore,":[117],"retrieved":[119],"relevant":[120],"2D":[121],"camera":[122],"images,":[123],"3D-2D":[124],"joint":[125],"decision-making":[126],"efficiently":[128],"performed":[129],"determine":[131],"best-matching":[133],"object.":[134],"By":[135],"bridging":[136],"neural":[141],"streams,":[142],"achieves":[144],"robust":[145],"grounding":[147],"without":[148],"3D-labeled":[151],"data.":[152],"Extensive":[153],"experiments":[154],"ScanRefer":[156],"Nr3D":[158],"benchmarks":[159],"demonstrate":[160],"significantly":[163],"outperforms":[164],"previous":[165],"state-of-the-art":[166],"methods,":[168],"achieving":[169],"notable":[170],"gains":[171],"9.0%":[173],"10.9%":[175],"accuracy.":[177]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}