{"id":"https:\/\/openalex.org\/W3188573193","doi":"https:\/\/doi.org\/10.24963\/ijcai.2021\/659","title":"Beyond Accuracy: Behavioral Testing of NLP Models with Checklist (Extended Abstract)","display_name":"Beyond Accuracy: Behavioral Testing of NLP Models with Checklist (Extended Abstract)","publication_year":2021,"publication_date":"2021-08-01","ids":{"openalex":"https:\/\/openalex.org\/W3188573193","doi":"https:\/\/doi.org\/10.24963\/ijcai.2021\/659","mag":"3188573193"},"language":"en","primary_location":{"id":"doi:10.24963\/ijcai.2021\/659","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.24963\/ijcai.2021\/659","pdf_url":"https:\/\/www.ijcai.org\/proceedings\/2021\/0659.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https:\/\/www.ijcai.org\/proceedings\/2021\/0659.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https:\/\/openalex.org\/A5019726734","display_name":"Marco T\u00falio Ribeiro","orcid":"https:\/\/orcid.org\/0000-0002-3301-1297"},"institutions":[{"id":"https:\/\/openalex.org\/I4210164937","display_name":"Microsoft Research (United Kingdom)","ror":"https:\/\/ror.org\/05k87vq12","country_code":"GB","type":"company","lineage":["https:\/\/openalex.org\/I1290206253","https:\/\/openalex.org\/I4210164937"]}],"countries":["GB"],"is_corresponding":true,"raw_author_name":"Marco Tulio Ribeiro","raw_affiliation_strings":["Microsoft Research"],"affiliations":[{"raw_affiliation_string":"Microsoft Research","institution_ids":["https:\/\/openalex.org\/I4210164937"]}]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5004225142","display_name":"Tongshuang Wu","orcid":"https:\/\/orcid.org\/0000-0003-1630-0588"},"institutions":[{"id":"https:\/\/openalex.org\/I201448701","display_name":"University of Washington","ror":"https:\/\/ror.org\/00cvxb145","country_code":"US","type":"education","lineage":["https:\/\/openalex.org\/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tongshuang Wu","raw_affiliation_strings":["University of Washington"],"affiliations":[{"raw_affiliation_string":"University of Washington","institution_ids":["https:\/\/openalex.org\/I201448701"]}]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5090739892","display_name":"Carlos Guestrin","orcid":"https:\/\/orcid.org\/0000-0001-6348-5939"},"institutions":[{"id":"https:\/\/openalex.org\/I201448701","display_name":"University of Washington","ror":"https:\/\/ror.org\/00cvxb145","country_code":"US","type":"education","lineage":["https:\/\/openalex.org\/I201448701"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Carlos Guestrin","raw_affiliation_strings":["University of Washington"],"affiliations":[{"raw_affiliation_string":"University of Washington","institution_ids":["https:\/\/openalex.org\/I201448701"]}]},{"author_position":"last","author":{"id":"https:\/\/openalex.org\/A5005779128","display_name":"Sameer Singh","orcid":"https:\/\/orcid.org\/0000-0003-0621-6323"},"institutions":[{"id":"https:\/\/openalex.org\/I204250578","display_name":"University of California, Irvine","ror":"https:\/\/ror.org\/04gyf1771","country_code":"US","type":"education","lineage":["https:\/\/openalex.org\/I204250578"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sameer Singh","raw_affiliation_strings":["University of California, Irvine"],"affiliations":[{"raw_affiliation_string":"University of California, Irvine","institution_ids":["https:\/\/openalex.org\/I204250578"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":4,"corresponding_author_ids":["https:\/\/openalex.org\/A5019726734"],"corresponding_institution_ids":["https:\/\/openalex.org\/I4210164937"],"apc_list":null,"apc_paid":null,"fwci":7.436,"has_fulltext":false,"cited_by_count":31,"citation_normalized_percentile":{"value":0.97202485,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4824","last_page":"4828"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https:\/\/openalex.org\/T10260","display_name":"Software Engineering Research","score":0.9997000098228455,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1710","display_name":"Information Systems"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},"topics":[{"id":"https:\/\/openalex.org\/T10260","display_name":"Software Engineering Research","score":0.9997000098228455,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1710","display_name":"Information Systems"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T10028","display_name":"Topic Modeling","score":0.9980999827384949,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T10743","display_name":"Software Testing and Debugging Techniques","score":0.9901999831199646,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1712","display_name":"Software"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https:\/\/openalex.org\/keywords\/checklist","display_name":"Checklist","score":0.7905409336090088},{"id":"https:\/\/openalex.org\/keywords\/computer-science","display_name":"Computer science","score":0.7628058195114136},{"id":"https:\/\/openalex.org\/keywords\/generalization","display_name":"Generalization","score":0.6243038177490234},{"id":"https:\/\/openalex.org\/keywords\/artificial-intelligence","display_name":"Artificial intelligence","score":0.6066076755523682},{"id":"https:\/\/openalex.org\/keywords\/machine-learning","display_name":"Machine learning","score":0.5688576698303223},{"id":"https:\/\/openalex.org\/keywords\/task","display_name":"Task (project management)","score":0.5574945211410522},{"id":"https:\/\/openalex.org\/keywords\/natural-language-processing","display_name":"Natural language processing","score":0.5250368118286133},{"id":"https:\/\/openalex.org\/keywords\/test","display_name":"Test (biology)","score":0.49592527747154236},{"id":"https:\/\/openalex.org\/keywords\/software-engineering","display_name":"Software engineering","score":0.32416123151779175},{"id":"https:\/\/openalex.org\/keywords\/cognitive-psychology","display_name":"Cognitive psychology","score":0.10902956128120422},{"id":"https:\/\/openalex.org\/keywords\/psychology","display_name":"Psychology","score":0.10404396057128906},{"id":"https:\/\/openalex.org\/keywords\/engineering","display_name":"Engineering","score":0.08009815216064453}],"concepts":[{"id":"https:\/\/openalex.org\/C2779356329","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q922625","display_name":"Checklist","level":2,"score":0.7905409336090088},{"id":"https:\/\/openalex.org\/C41008148","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q21198","display_name":"Computer science","level":0,"score":0.7628058195114136},{"id":"https:\/\/openalex.org\/C177148314","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q170084","display_name":"Generalization","level":2,"score":0.6243038177490234},{"id":"https:\/\/openalex.org\/C154945302","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6066076755523682},{"id":"https:\/\/openalex.org\/C119857082","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q2539","display_name":"Machine learning","level":1,"score":0.5688576698303223},{"id":"https:\/\/openalex.org\/C2780451532","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q759676","display_name":"Task (project management)","level":2,"score":0.5574945211410522},{"id":"https:\/\/openalex.org\/C204321447","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q30642","display_name":"Natural language processing","level":1,"score":0.5250368118286133},{"id":"https:\/\/openalex.org\/C2777267654","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q3519023","display_name":"Test (biology)","level":2,"score":0.49592527747154236},{"id":"https:\/\/openalex.org\/C115903868","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q80993","display_name":"Software engineering","level":1,"score":0.32416123151779175},{"id":"https:\/\/openalex.org\/C180747234","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q23373","display_name":"Cognitive psychology","level":1,"score":0.10902956128120422},{"id":"https:\/\/openalex.org\/C15744967","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q9418","display_name":"Psychology","level":0,"score":0.10404396057128906},{"id":"https:\/\/openalex.org\/C127413603","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11023","display_name":"Engineering","level":0,"score":0.08009815216064453},{"id":"https:\/\/openalex.org\/C86803240","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q420","display_name":"Biology","level":0,"score":0},{"id":"https:\/\/openalex.org\/C134306372","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7754","display_name":"Mathematical analysis","level":1,"score":0},{"id":"https:\/\/openalex.org\/C201995342","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q682496","display_name":"Systems engineering","level":1,"score":0},{"id":"https:\/\/openalex.org\/C151730666","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7205","display_name":"Paleontology","level":1,"score":0},{"id":"https:\/\/openalex.org\/C33923547","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q395","display_name":"Mathematics","level":0,"score":0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.24963\/ijcai.2021\/659","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.24963\/ijcai.2021\/659","pdf_url":"https:\/\/www.ijcai.org\/proceedings\/2021\/0659.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.24963\/ijcai.2021\/659","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.24963\/ijcai.2021\/659","pdf_url":"https:\/\/www.ijcai.org\/proceedings\/2021\/0659.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https:\/\/content.openalex.org\/works\/W3188573193.pdf","grobid_xml":"https:\/\/content.openalex.org\/works\/W3188573193.grobid-xml"},"referenced_works_count":18,"referenced_works":["https:\/\/openalex.org\/W2076253536","https:\/\/openalex.org\/W2108816886","https:\/\/openalex.org\/W2282821441","https:\/\/openalex.org\/W2324595780","https:\/\/openalex.org\/W2516809705","https:\/\/openalex.org\/W2767899794","https:\/\/openalex.org\/W2799007037","https:\/\/openalex.org\/W2923014074","https:\/\/openalex.org\/W2949858875","https:\/\/openalex.org\/W2953039212","https:\/\/openalex.org\/W2963126845","https:\/\/openalex.org\/W2963310665","https:\/\/openalex.org\/W2963323070","https:\/\/openalex.org\/W2963748441","https:\/\/openalex.org\/W2965373594","https:\/\/openalex.org\/W2980282514","https:\/\/openalex.org\/W2989344603","https:\/\/openalex.org\/W3035507081"],"related_works":["https:\/\/openalex.org\/W4234875088","https:\/\/openalex.org\/W2013796470","https:\/\/openalex.org\/W2420514274","https:\/\/openalex.org\/W2001467811","https:\/\/openalex.org\/W4362687946","https:\/\/openalex.org\/W1971633942","https:\/\/openalex.org\/W2902805979","https:\/\/openalex.org\/W2969972828","https:\/\/openalex.org\/W2987114614","https:\/\/openalex.org\/W2808458916"],"abstract_inverted_index":{"Although":[0],"measuring":[1],"held-out":[2],"accuracy":[3],"has":[4],"been":[5],"the":[6,15,89],"primary":[7],"approach":[8],"to":[9,76],"evaluate":[10],"generalization,":[11],"it":[12],"often":[13],"overestimates":[14],"performance":[16],"of":[17,38,58,83,91],"NLP":[18,52,134],"models,":[19],"while":[20],"alternative":[21],"approaches":[22],"for":[23,50,95,114],"evaluating":[24],"models":[25],"either":[26],"focus":[27],"on":[28,32],"individual":[29],"tasks":[30],"or":[31],"specific":[33],"behaviors.":[34],"Inspired":[35],"by":[36],"principles":[37],"behavioral":[39],"testing":[40,51],"in":[41,101,125],"software":[42,74],"engineering,":[43],"we":[44],"introduce":[45],"CheckList,":[46],"a":[47,56,73,78,108,111,115],"task-agnostic":[48],"methodology":[49],"models.":[53,106],"CheckList":[54,92,137],"includes":[55],"matrix":[57],"general":[59],"linguistic":[60],"capabilities":[61],"and":[62,80,104,122,143],"test":[63,68,84],"types":[64],"that":[65],"facilitate":[66],"comprehensive":[67],"ideation,":[69],"as":[70,72,140,148,151],"well":[71],"tool":[75],"generate":[77],"large":[79],"diverse":[81],"number":[82],"cases":[85],"quickly.":[86],"We":[87],"illustrate":[88],"utility":[90],"with":[93,136],"tests":[94],"three":[96,146],"tasks,":[97],"identifying":[98],"critical":[99],"failures":[100],"both":[102],"commercial":[103,116],"state-of-art":[105],"In":[107,130],"user":[109,132],"study,":[110,133],"team":[112],"responsible":[113],"sentiment":[117],"analysis":[118],"model":[119],"found":[120,144],"new":[121],"actionable":[123],"bugs":[124,150],"an":[126],"extensively":[127],"tested":[128],"model.":[129],"another":[131],"practitioners":[135],"created":[138],"twice":[139],"many":[141,149],"tests,":[142],"almost":[145],"times":[147],"users":[152],"without":[153],"it.":[154]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":2},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":12},{"year":2022,"cited_by_count":9},{"year":2021,"cited_by_count":1},{"year":2020,"cited_by_count":1}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}