{"id":"https:\/\/openalex.org\/W4416696371","doi":"https:\/\/doi.org\/10.48550\/arxiv.2509.01790","title":"Flaw or Artifact? Rethinking Prompt Sensitivity in Evaluating LLMs","display_name":"Flaw or Artifact? Rethinking Prompt Sensitivity in Evaluating LLMs","publication_year":2025,"publication_date":"2025-09-01","ids":{"openalex":"https:\/\/openalex.org\/W4416696371","doi":"https:\/\/doi.org\/10.48550\/arxiv.2509.01790"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2509.01790","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2509.01790","pdf_url":"https:\/\/arxiv.org\/pdf\/2509.01790","source":{"id":"https:\/\/openalex.org\/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https:\/\/arxiv.org\/pdf\/2509.01790","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https:\/\/openalex.org\/A5120309081","display_name":"Andong Hua","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Hua, Andong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Tang, Kenan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tang, Kenan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5064570991","display_name":"Chen Gu","orcid":"https:\/\/orcid.org\/0000-0003-0003-8992"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Chenhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5020160826","display_name":"Jiong-Yan Gu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gu, Jindong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5078402672","display_name":"Eric Wong","orcid":"https:\/\/orcid.org\/0000-0003-2437-7446"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wong, Eric","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https:\/\/openalex.org\/A5101664570","display_name":"Yao Qin","orcid":"https:\/\/orcid.org\/0000-0001-9303-6341"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin, Yao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https:\/\/openalex.org\/A5120309081"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https:\/\/openalex.org\/T10028","display_name":"Topic Modeling","score":0.73580002784729,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},"topics":[{"id":"https:\/\/openalex.org\/T10028","display_name":"Topic Modeling","score":0.73580002784729,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T13629","display_name":"Text Readability and Simplification","score":0.11699999868869781,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T10181","display_name":"Natural Language Processing Techniques","score":0.030500000342726707,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https:\/\/openalex.org\/keywords\/sensitivity","display_name":"Sensitivity (control systems)","score":0.6565999984741211},{"id":"https:\/\/openalex.org\/keywords\/artifact","display_name":"Artifact (error)","score":0.6546000242233276},{"id":"https:\/\/openalex.org\/keywords\/heuristic","display_name":"Heuristic","score":0.5048999786376953},{"id":"https:\/\/openalex.org\/keywords\/variance","display_name":"Variance (accounting)","score":0.49630001187324524},{"id":"https:\/\/openalex.org\/keywords\/ranking","display_name":"Ranking (information retrieval)","score":0.3260999917984009}],"concepts":[{"id":"https:\/\/openalex.org\/C21200559","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7451068","display_name":"Sensitivity (control systems)","level":2,"score":0.6565999984741211},{"id":"https:\/\/openalex.org\/C2779010991","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q2720909","display_name":"Artifact (error)","level":2,"score":0.6546000242233276},{"id":"https:\/\/openalex.org\/C173801870","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q201413","display_name":"Heuristic","level":2,"score":0.5048999786376953},{"id":"https:\/\/openalex.org\/C196083921","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7915758","display_name":"Variance (accounting)","level":2,"score":0.49630001187324524},{"id":"https:\/\/openalex.org\/C15744967","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q9418","display_name":"Psychology","level":0,"score":0.3587999939918518},{"id":"https:\/\/openalex.org\/C41008148","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q21198","display_name":"Computer science","level":0,"score":0.35440000891685486},{"id":"https:\/\/openalex.org\/C180747234","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q23373","display_name":"Cognitive psychology","level":1,"score":0.3515999913215637},{"id":"https:\/\/openalex.org\/C149782125","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q160039","display_name":"Econometrics","level":1,"score":0.3330000042915344},{"id":"https:\/\/openalex.org\/C189430467","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7293293","display_name":"Ranking (information retrieval)","level":2,"score":0.3260999917984009},{"id":"https:\/\/openalex.org\/C2164484","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.2770000100135803},{"id":"https:\/\/openalex.org\/C154945302","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11660","display_name":"Artificial intelligence","level":1,"score":0.27390000224113464},{"id":"https:\/\/openalex.org\/C112930515","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.26910001039505005},{"id":"https:\/\/openalex.org\/C111335779","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q3454686","display_name":"Reduction (mathematics)","level":2,"score":0.2549999952316284},{"id":"https:\/\/openalex.org\/C119857082","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q2539","display_name":"Machine learning","level":1,"score":0.2529999911785126},{"id":"https:\/\/openalex.org\/C2778334786","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.25290000438690186}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2509.01790","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2509.01790","pdf_url":"https:\/\/arxiv.org\/pdf\/2509.01790","source":{"id":"https:\/\/openalex.org\/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550\/arxiv.2509.01790","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.48550\/arxiv.2509.01790","pdf_url":null,"source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https:\/\/openalex.org\/licenses\/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2509.01790","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2509.01790","pdf_url":"https:\/\/arxiv.org\/pdf\/2509.01790","source":{"id":"https:\/\/openalex.org\/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Prompt":[0],"sensitivity,":[1],"referring":[2],"to":[3,18,163],"the":[4,47,101,184],"phenomenon":[5],"where":[6],"paraphrasing":[7],"(i.e.,":[8],"repeating":[9],"something":[10],"written":[11],"or":[12,59,129],"spoken":[13],"using":[14],"different":[15],"words)":[16],"leads":[17],"significant":[19],"changes":[20],"in":[21,141,149,183],"large":[22],"language":[23],"model":[24,150],"(LLM)":[25],"performance,":[26],"has":[27],"been":[28],"widely":[29,48],"accepted":[30],"as":[31,127],"a":[32,138,145,181],"core":[33],"limitation":[34],"of":[35,57,65,100,178],"LLMs.":[36],"In":[37],"this":[38,42,70],"work,":[39],"we":[40,72,132,136],"revisit":[41],"issue":[43],"and":[44,79,88,112,144,169],"ask:":[45],"Is":[46],"reported":[49],"high":[50],"prompt":[51,94,102,164,171],"sensitivity":[52,103,172],"truly":[53],"an":[54,63,176],"inherent":[55],"weakness":[56],"LLMs,":[58],"is":[60],"it":[61],"largely":[62],"artifact":[64,177],"evaluation":[66,107,179],"processes?":[67],"To":[68],"answer":[69,114],"question,":[71],"systematically":[73],"evaluate":[74],"7":[75],"LLMs":[76,159],"(e.g.,":[77],"GPT":[78],"Gemini":[80],"family)":[81],"across":[82,152],"6":[83],"benchmarks,":[84],"including":[85,109],"both":[86],"multiple-choice":[87],"open-ended":[89],"tasks":[90],"on":[91],"12":[92],"diverse":[93],"templates.":[95],"We":[96],"find":[97],"that":[98,157,170],"much":[99],"stems":[104],"from":[105],"heuristic":[106],"methods,":[108],"log-likelihood":[110],"scoring":[111],"rigid":[113],"matching,":[115],"which":[116],"often":[117],"overlook":[118],"semantically":[119],"correct":[120],"responses":[121],"expressed":[122],"through":[123],"alternative":[124],"phrasings,":[125],"such":[126],"synonyms":[128],"paraphrases.":[130],"When":[131],"adopt":[133],"LLM-as-a-Judge":[134],"evaluations,":[135],"observe":[137],"substantial":[139],"reduction":[140],"performance":[142],"variance":[143],"consistently":[146],"higher":[147],"correlation":[148],"rankings":[151],"prompts.":[153],"Our":[154],"findings":[155],"suggest":[156],"modern":[158],"are":[160],"more":[161,175],"robust":[162],"templates":[165],"than":[166,180],"previously":[167],"believed,":[168],"may":[173],"be":[174],"flaw":[182],"models.":[185]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2025-10-10T00:00:00"}