{"id":"https:\/\/openalex.org\/W6948186523","doi":"https:\/\/doi.org\/10.48550\/arxiv.2508.09600","title":"OSUM-EChat: Enhancing End-to-End Empathetic Spoken Chatbot via Understanding-Driven Spoken Dialogue","display_name":"OSUM-EChat: Enhancing End-to-End Empathetic Spoken Chatbot via Understanding-Driven Spoken Dialogue","publication_year":2025,"publication_date":"2025-08-13","ids":{"openalex":"https:\/\/openalex.org\/W6948186523","doi":"https:\/\/doi.org\/10.48550\/arxiv.2508.09600"},"language":"en","primary_location":{"id":"doi:10.48550\/arxiv.2508.09600","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.48550\/arxiv.2508.09600","pdf_url":null,"source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https:\/\/doi.org\/10.48550\/arxiv.2508.09600","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Geng, Xuelong","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Geng, Xuelong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Shao, Qijie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shao, Qijie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xue, Hongfei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xue, Hongfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Shuiyuan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Shuiyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xie, Hanke","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Hanke","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Guo, Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Zhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhao, Yi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Yi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Guojian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Guojian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Tian, Wenjie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Wenjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Wang, Chengyou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Chengyou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhao, Zhixian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Zhixian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Xia, Kangxiang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xia, Kangxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zhang, Ziyu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Ziyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Lin, Zhennan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Zhennan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Zuo, Tianlun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zuo, Tianlun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Shao, Mingchen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shao, Mingchen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Cao, Yuang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yuang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Ma, Guobin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ma, Guobin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Li, Longhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Longhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Dai, Yuhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Yuhang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Gao, Dehui","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Dehui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Guo, Dake","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Dake","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Xie, Lei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Lei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":23,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":{"id":"https:\/\/openalex.org\/T10028","display_name":"Topic Modeling","score":0.32820001244544983,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},"topics":[{"id":"https:\/\/openalex.org\/T10028","display_name":"Topic Modeling","score":0.32820001244544983,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T11714","display_name":"Multimodal Machine Learning Applications","score":0.15199999511241913,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T12128","display_name":"AI in Service Interactions","score":0.1509000062942505,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https:\/\/openalex.org\/keywords\/paralanguage","display_name":"Paralanguage","score":0.9714000225067139},{"id":"https:\/\/openalex.org\/keywords\/spoken-language","display_name":"Spoken language","score":0.5568000078201294},{"id":"https:\/\/openalex.org\/keywords\/chatbot","display_name":"Chatbot","score":0.5340999960899353},{"id":"https:\/\/openalex.org\/keywords\/natural","display_name":"Natural (archaeology)","score":0.4903999865055084},{"id":"https:\/\/openalex.org\/keywords\/empathy","display_name":"Empathy","score":0.48330000042915344},{"id":"https:\/\/openalex.org\/keywords\/mechanism","display_name":"Mechanism (biology)","score":0.3716000020503998},{"id":"https:\/\/openalex.org\/keywords\/class","display_name":"Class (philosophy)","score":0.3610000014305115},{"id":"https:\/\/openalex.org\/keywords\/imitation","display_name":"Imitation","score":0.35850000381469727}],"concepts":[{"id":"https:\/\/openalex.org\/C133378560","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1753225","display_name":"Paralanguage","level":2,"score":0.9714000225067139},{"id":"https:\/\/openalex.org\/C41008148","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q21198","display_name":"Computer science","level":0,"score":0.6388000249862671},{"id":"https:\/\/openalex.org\/C2776230583","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1322198","display_name":"Spoken language","level":2,"score":0.5568000078201294},{"id":"https:\/\/openalex.org\/C2779041454","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q870780","display_name":"Chatbot","level":2,"score":0.5340999960899353},{"id":"https:\/\/openalex.org\/C2776608160","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.4903999865055084},{"id":"https:\/\/openalex.org\/C2779885105","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q182263","display_name":"Empathy","level":2,"score":0.48330000042915344},{"id":"https:\/\/openalex.org\/C204321447","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q30642","display_name":"Natural language processing","level":1,"score":0.41290000081062317},{"id":"https:\/\/openalex.org\/C89611455","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q6804646","display_name":"Mechanism (biology)","level":2,"score":0.3716000020503998},{"id":"https:\/\/openalex.org\/C2777212361","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q5127848","display_name":"Class (philosophy)","level":2,"score":0.3610000014305115},{"id":"https:\/\/openalex.org\/C126388530","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1131737","display_name":"Imitation","level":2,"score":0.35850000381469727},{"id":"https:\/\/openalex.org\/C2777601897","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q3409113","display_name":"Presentation (obstetrics)","level":2,"score":0.3434000015258789},{"id":"https:\/\/openalex.org\/C15744967","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q9418","display_name":"Psychology","level":0,"score":0.3431999981403351},{"id":"https:\/\/openalex.org\/C195324797","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q33742","display_name":"Natural language","level":2,"score":0.33500000834465027},{"id":"https:\/\/openalex.org\/C2779439875","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1078276","display_name":"Natural language understanding","level":3,"score":0.3305000066757202},{"id":"https:\/\/openalex.org\/C26517878","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q228039","display_name":"Key (lock)","level":2,"score":0.3255000114440918},{"id":"https:\/\/openalex.org\/C41895202","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q8162","display_name":"Linguistics","level":1,"score":0.3188000023365021},{"id":"https:\/\/openalex.org\/C154945302","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11660","display_name":"Artificial intelligence","level":1,"score":0.2976999878883362},{"id":"https:\/\/openalex.org\/C2777200299","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q52943","display_name":"Conversation","level":2,"score":0.295199990272522},{"id":"https:\/\/openalex.org\/C2983448237","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1078276","display_name":"Language understanding","level":2,"score":0.2928999960422516},{"id":"https:\/\/openalex.org\/C46312422","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11024","display_name":"Communication","level":1,"score":0.2752000093460083},{"id":"https:\/\/openalex.org\/C96777728","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q640757","display_name":"Kinesics","level":3,"score":0.2628999948501587},{"id":"https:\/\/openalex.org\/C2779304628","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.26089999079704285}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550\/arxiv.2508.09600","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.48550\/arxiv.2508.09600","pdf_url":null,"source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550\/arxiv.2508.09600","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.48550\/arxiv.2508.09600","pdf_url":null,"source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https:\/\/metadata.un.org\/sdg\/5","display_name":"Gender equality","score":0.49147698283195496}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Empathy":[0],"is":[1],"crucial":[2],"in":[3,29,91],"enabling":[4,140],"natural":[5],"interactions":[6],"within":[7],"spoken":[8,82,103,118,196],"dialogue":[9,52,83,104,119,138,154,187,197],"systems,":[10],"allowing":[11],"machines":[12],"to":[13,18,86,117,143],"recognize":[14],"and":[15,25,38,63,69,121,174],"respond":[16],"appropriately":[17],"paralinguistic":[19,57,130],"cues":[20,58],"such":[21],"as":[22],"age,":[23],"gender,":[24],"emotion.":[26],"Recent":[27],"advancements":[28],"end-to-end":[30,81,195],"speech":[31,36,114],"language":[32],"models,":[33],"which":[34],"unify":[35],"understanding":[37,115,131],"generation,":[39,139],"provide":[40],"promising":[41],"solutions.":[42],"However,":[43],"several":[44],"challenges":[45],"persist,":[46],"including":[47],"an":[48,79],"over-reliance":[49],"on":[50,152],"large-scale":[51,153],"datasets,":[53],"insufficient":[54],"extraction":[55],"of":[56,66,111,135,170,186],"vital":[59],"for":[60,181],"conveying":[61],"empathy,":[62],"the":[64,109,141,164,175,183],"lack":[65],"empathy-specific":[67],"datasets":[68,155],"evaluation":[70],"frameworks.":[71],"To":[72],"address":[73],"these":[74],"issues,":[75],"we":[76,162],"introduce":[77,163],"OSUM-EChat,":[78],"open-source,":[80],"system":[84,142],"designed":[85],"enhance":[87],"empathetic":[88,146,159,171,184,200],"interactions,":[89],"particularly":[90],"resource-limited":[92],"settings.":[93],"OSUM-EChat":[94,193],"introduces":[95],"two":[96],"key":[97],"innovations:":[98],"(1)":[99],"a":[100,112,123,133,167,178],"three-stage":[101],"understanding-driven":[102],"training":[105],"strategy":[106],"that":[107,128,192],"extends":[108],"capabilities":[110,185],"large":[113],"model":[116],"tasks,":[120],"(2)":[122],"linguistic-paralinguistic":[124],"dual":[125],"thinking":[126],"mechanism":[127],"integrates":[129],"through":[132],"chain":[134],"thought":[136],"with":[137],"produce":[144],"more":[145],"responses.":[147],"This":[148],"approach":[149],"reduces":[150],"reliance":[151],"while":[156],"maintaining":[157],"high-quality":[158],"interactions.":[160],"Additionally,":[161],"EChat-200K":[165],"dataset,":[166],"rich":[168],"corpus":[169],"speech-to-speech":[172],"dialogues,":[173],"EChat-eval":[176],"benchmark,":[177],"comprehensive":[179],"framework":[180],"evaluating":[182],"systems.":[188],"Experimental":[189],"results":[190],"demonstrate":[191],"outperforms":[194],"models":[198],"regarding":[199],"responsiveness,":[201],"validating":[202],"its":[203],"effectiveness.":[204]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}