{"id":"https:\/\/openalex.org\/W4414886143","doi":"https:\/\/doi.org\/10.48550\/arxiv.2507.17735","title":"Accent Normalization Using Self-Supervised Discrete Tokens with Non-Parallel Data","display_name":"Accent Normalization Using Self-Supervised Discrete Tokens with Non-Parallel Data","publication_year":2025,"publication_date":"2025-07-23","ids":{"openalex":"https:\/\/openalex.org\/W4414886143","doi":"https:\/\/doi.org\/10.48550\/arxiv.2507.17735"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2507.17735","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2507.17735","pdf_url":"https:\/\/arxiv.org\/pdf\/2507.17735","source":{"id":"https:\/\/openalex.org\/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https:\/\/arxiv.org\/pdf\/2507.17735","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https:\/\/openalex.org\/A5065778847","display_name":"Qibing Bai","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Bai, Qibing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5108413182","display_name":"Sho Inoue","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Inoue, Sho","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5100328312","display_name":"Shuai Wang","orcid":"https:\/\/orcid.org\/0000-0002-7897-2024"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shuai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5101421603","display_name":"Zhongjie Jiang","orcid":"https:\/\/orcid.org\/0009-0009-7256-6592"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jiang, Zhongjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5084128157","display_name":"Yannan Wang","orcid":"https:\/\/orcid.org\/0000-0001-7248-4954"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yannan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https:\/\/openalex.org\/A5032690182","display_name":"Haizhou Li","orcid":"https:\/\/orcid.org\/0000-0001-9158-9401"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Haizhou","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https:\/\/openalex.org\/A5065778847"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https:\/\/openalex.org\/T10320","display_name":"Neural Networks and Applications","score":0.5164999961853027,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},"topics":[{"id":"https:\/\/openalex.org\/T10320","display_name":"Neural Networks and Applications","score":0.5164999961853027,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https:\/\/openalex.org\/keywords\/normalization","display_name":"Normalization (sociology)","score":0.8144000172615051},{"id":"https:\/\/openalex.org\/keywords\/timbre","display_name":"Timbre","score":0.5958999991416931},{"id":"https:\/\/openalex.org\/keywords\/stress","display_name":"Stress (linguistics)","score":0.5235999822616577},{"id":"https:\/\/openalex.org\/keywords\/pipeline","display_name":"Pipeline (software)","score":0.4307999908924103},{"id":"https:\/\/openalex.org\/keywords\/training-set","display_name":"Training set","score":0.424699991941452},{"id":"https:\/\/openalex.org\/keywords\/duration","display_name":"Duration (music)","score":0.3440999984741211},{"id":"https:\/\/openalex.org\/keywords\/speech-processing","display_name":"Speech processing","score":0.3433000147342682}],"concepts":[{"id":"https:\/\/openalex.org\/C136886441","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.8144000172615051},{"id":"https:\/\/openalex.org\/C28490314","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q189436","display_name":"Speech recognition","level":1,"score":0.7236999869346619},{"id":"https:\/\/openalex.org\/C41008148","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q21198","display_name":"Computer science","level":0,"score":0.7192000150680542},{"id":"https:\/\/openalex.org\/C2776539107","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q176501","display_name":"Timbre","level":3,"score":0.5958999991416931},{"id":"https:\/\/openalex.org\/C2776756274","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q181767","display_name":"Stress (linguistics)","level":2,"score":0.5235999822616577},{"id":"https:\/\/openalex.org\/C154945302","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11660","display_name":"Artificial intelligence","level":1,"score":0.46630001068115234},{"id":"https:\/\/openalex.org\/C43521106","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.4307999908924103},{"id":"https:\/\/openalex.org\/C51632099","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q3985153","display_name":"Training set","level":2,"score":0.424699991941452},{"id":"https:\/\/openalex.org\/C204321447","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q30642","display_name":"Natural language processing","level":1,"score":0.40450000762939453},{"id":"https:\/\/openalex.org\/C112758219","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q16038819","display_name":"Duration (music)","level":2,"score":0.3440999984741211},{"id":"https:\/\/openalex.org\/C61328038","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q3358061","display_name":"Speech processing","level":2,"score":0.3433000147342682},{"id":"https:\/\/openalex.org\/C12725497","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q810247","display_name":"Baseline (sea)","level":2,"score":0.3021000027656555},{"id":"https:\/\/openalex.org\/C153180895","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.29120001196861267},{"id":"https:\/\/openalex.org\/C14999030","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q16346","display_name":"Speech synthesis","level":2,"score":0.2856000065803528},{"id":"https:\/\/openalex.org\/C99498987","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q2210247","display_name":"Noise (video)","level":3,"score":0.2752000093460083},{"id":"https:\/\/openalex.org\/C23224414","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q176769","display_name":"Hidden Markov model","level":2,"score":0.2736000120639801},{"id":"https:\/\/openalex.org\/C48145219","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1335365","display_name":"Security token","level":2,"score":0.27219998836517334},{"id":"https:\/\/openalex.org\/C542774811","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q10880526","display_name":"Prosody","level":2,"score":0.2513999938964844}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2507.17735","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2507.17735","pdf_url":"https:\/\/arxiv.org\/pdf\/2507.17735","source":{"id":"https:\/\/openalex.org\/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},{"id":"doi:10.48550\/arxiv.2507.17735","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.48550\/arxiv.2507.17735","pdf_url":null,"source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https:\/\/openalex.org\/licenses\/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2507.17735","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2507.17735","pdf_url":"https:\/\/arxiv.org\/pdf\/2507.17735","source":{"id":"https:\/\/openalex.org\/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Accent":[0],"normalization":[1],"converts":[2,32],"foreign-accented":[3],"speech":[4,7],"into":[5],"native-like":[6],"while":[8],"preserving":[9],"speaker":[10],"identity.":[11],"We":[12,77],"propose":[13],"a":[14,35,51],"novel":[15],"pipeline":[16],"using":[17,42],"self-supervised":[18],"discrete":[19],"tokens":[20,28],"and":[21,38,58],"non-parallel":[22],"training":[23],"data.":[24],"The":[25],"system":[26],"extracts":[27],"from":[29],"source":[30],"speech,":[31],"them":[33],"through":[34],"dedicated":[36],"model,":[37],"synthesizes":[39],"the":[40,71],"output":[41],"flow":[43],"matching.":[44],"Our":[45],"method":[46],"demonstrates":[47],"superior":[48],"performance":[49],"over":[50],"frame-to-frame":[52],"baseline":[53],"in":[54],"naturalness,":[55],"accentedness":[56],"reduction,":[57],"timbre":[59],"preservation":[60,82],"across":[61],"multiple":[62],"English":[63],"accents.":[64],"Through":[65],"token-level":[66],"phonetic":[67],"analysis,":[68],"we":[69],"validate":[70],"effectiveness":[72],"of":[73],"our":[74],"token-based":[75],"approach.":[76],"also":[78],"develop":[79],"two":[80],"duration":[81],"methods,":[83],"suitable":[84],"for":[85],"applications":[86],"such":[87],"as":[88],"dubbing.":[89]},"counts_by_year":[],"updated_date":"2026-03-07T16:01:11.037858","created_date":"2025-10-10T00:00:00"}