{"id":"https:\/\/openalex.org\/W4402671029","doi":"https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.792","title":"Arabic Diacritics in the Wild: Exploiting Opportunities for Improved Diacritization","display_name":"Arabic Diacritics in the Wild: Exploiting Opportunities for Improved Diacritization","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https:\/\/openalex.org\/W4402671029","doi":"https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.792"},"language":"en","primary_location":{"id":"doi:10.18653\/v1\/2024.acl-long.792","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.792","pdf_url":"https:\/\/aclanthology.org\/2024.acl-long.792.pdf","source":null,"license":"cc-by","license_id":"https:\/\/openalex.org\/licenses\/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https:\/\/aclanthology.org\/2024.acl-long.792.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https:\/\/openalex.org\/A5099097296","display_name":"Salman Elgamal","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Salman Elgamal","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5080300851","display_name":"Ossama Obeid","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ossama Obeid","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5092596426","display_name":"MHD Tameem Kabbani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mhd Kabbani","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5042049616","display_name":"Go Inoue","orcid":"https:\/\/orcid.org\/0000-0002-7933-0832"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Go Inoue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https:\/\/openalex.org\/A5084517393","display_name":"Nizar Habash","orcid":"https:\/\/orcid.org\/0000-0002-1831-3457"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nizar Habash","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https:\/\/openalex.org\/A5099097296"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.8627,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":{"value":0.90713064,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":95},"biblio":{"volume":null,"issue":null,"first_page":"14815","last_page":"14829"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https:\/\/openalex.org\/T13912","display_name":"Language, Linguistics, Cultural Analysis","score":0.2354000061750412,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1203","display_name":"Language and Linguistics"},"field":{"id":"https:\/\/openalex.org\/fields\/12","display_name":"Arts and Humanities"},"domain":{"id":"https:\/\/openalex.org\/domains\/2","display_name":"Social Sciences"}},"topics":[{"id":"https:\/\/openalex.org\/T13912","display_name":"Language, Linguistics, Cultural Analysis","score":0.2354000061750412,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1203","display_name":"Language and Linguistics"},"field":{"id":"https:\/\/openalex.org\/fields\/12","display_name":"Arts and Humanities"},"domain":{"id":"https:\/\/openalex.org\/domains\/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https:\/\/openalex.org\/keywords\/computer-science","display_name":"Computer science","score":0.6281064748764038},{"id":"https:\/\/openalex.org\/keywords\/arabic","display_name":"Arabic","score":0.612136721611023},{"id":"https:\/\/openalex.org\/keywords\/artificial-intelligence","display_name":"Artificial intelligence","score":0.3272262215614319},{"id":"https:\/\/openalex.org\/keywords\/linguistics","display_name":"Linguistics","score":0.14251559972763062}],"concepts":[{"id":"https:\/\/openalex.org\/C41008148","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q21198","display_name":"Computer science","level":0,"score":0.6281064748764038},{"id":"https:\/\/openalex.org\/C96455323","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q13955","display_name":"Arabic","level":2,"score":0.612136721611023},{"id":"https:\/\/openalex.org\/C154945302","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3272262215614319},{"id":"https:\/\/openalex.org\/C41895202","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q8162","display_name":"Linguistics","level":1,"score":0.14251559972763062},{"id":"https:\/\/openalex.org\/C138885662","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q5891","display_name":"Philosophy","level":0,"score":0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653\/v1\/2024.acl-long.792","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.792","pdf_url":"https:\/\/aclanthology.org\/2024.acl-long.792.pdf","source":null,"license":"cc-by","license_id":"https:\/\/openalex.org\/licenses\/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653\/v1\/2024.acl-long.792","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.18653\/v1\/2024.acl-long.792","pdf_url":"https:\/\/aclanthology.org\/2024.acl-long.792.pdf","source":null,"license":"cc-by","license_id":"https:\/\/openalex.org\/licenses\/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","raw_type":"proceedings-article"},"sustainable_development_goals":[{"id":"https:\/\/metadata.un.org\/sdg\/2","display_name":"Zero hunger","score":0.5}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https:\/\/content.openalex.org\/works\/W4402671029.pdf","grobid_xml":"https:\/\/content.openalex.org\/works\/W4402671029.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https:\/\/openalex.org\/W4391375266","https:\/\/openalex.org\/W2748952813","https:\/\/openalex.org\/W2390279801","https:\/\/openalex.org\/W2358668433","https:\/\/openalex.org\/W4396701345","https:\/\/openalex.org\/W2376932109","https:\/\/openalex.org\/W2001405890","https:\/\/openalex.org\/W4396696052","https:\/\/openalex.org\/W4249048193","https:\/\/openalex.org\/W2382290278"],"abstract_inverted_index":{"The":[0],"widespread":[1],"absence":[2],"of":[3,22],"diacritical":[4],"marks":[5],"in":[6,30,70,79,87],"Arabic":[7,14,80],"text":[8],"poses":[9],"a":[10,55,92],"significant":[11],"challenge":[12],"for":[13],"natural":[15],"language":[16],"processing":[17],"(NLP).This":[18],"paper":[19],"explores":[20],"instances":[21],"naturally":[23],"occurring":[24],"diacritics,":[25,85],"referred":[26],"to":[27,33,65,75,82],"as":[28,107],"\"diacritics":[29],"the":[31,76],"wild,\"":[32],"unveil":[34],"patterns":[35],"and":[36,51,97,105],"latent":[37],"information":[38],"across":[39],"six":[40],"diverse":[41],"genres:":[42],"news":[43],"articles,":[44],"novels,":[45],"children's":[46],"books,":[47],"poetry,":[48],"political":[49],"documents,":[50],"ChatGPT":[52],"outputs.We":[53],"present":[54],"new":[56],"annotated":[57],"dataset":[58],"that":[59],"maps":[60],"realworld":[61],"partially":[62],"diacritized":[63],"words":[64],"their":[66],"maximal":[67],"full":[68],"diacritization":[69,100],"context.Additionally,":[71],"we":[72],"propose":[73],"extensions":[74],"analyze-anddisambiguate":[77],"approach":[78],"NLP":[81],"leverage":[83],"these":[84],"resulting":[86],"notable":[88],"improvements.Our":[89],"contributions":[90],"encompass":[91],"thorough":[93],"analysis,":[94],"valuable":[95],"datasets,":[96],"an":[98],"extended":[99],"algorithm.We":[101],"release":[102],"our":[103],"code":[104],"datasets":[106],"open":[108],"source.":[109]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-11T06:11:40.159057","created_date":"2025-10-10T00:00:00"}