{"id":"https:\/\/openalex.org\/W3144086690","doi":"https:\/\/doi.org\/10.1109\/slt48900.2021.9383555","title":"End-to-End Speaker Diarization Conditioned on Speech Activity and Overlap Detection","display_name":"End-to-End Speaker Diarization Conditioned on Speech Activity and Overlap Detection","publication_year":2021,"publication_date":"2021-01-19","ids":{"openalex":"https:\/\/openalex.org\/W3144086690","doi":"https:\/\/doi.org\/10.1109\/slt48900.2021.9383555","mag":"3144086690"},"language":"en","primary_location":{"id":"doi:10.1109\/slt48900.2021.9383555","is_oa":false,"landing_page_url":"https:\/\/doi.org\/10.1109\/slt48900.2021.9383555","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https:\/\/openalex.org\/A5087545401","display_name":"Yuki Takashima","orcid":"https:\/\/orcid.org\/0000-0001-8489-9487"},"institutions":[{"id":"https:\/\/openalex.org\/I65143321","display_name":"Hitachi (Japan)","ror":"https:\/\/ror.org\/02exqgm79","country_code":"JP","type":"company","lineage":["https:\/\/openalex.org\/I65143321"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Yuki Takashima","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group, Japan"],"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group, Japan","institution_ids":["https:\/\/openalex.org\/I65143321"]}]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5044818016","display_name":"Yusuke Fujita","orcid":"https:\/\/orcid.org\/0000-0002-6523-8146"},"institutions":[{"id":"https:\/\/openalex.org\/I65143321","display_name":"Hitachi (Japan)","ror":"https:\/\/ror.org\/02exqgm79","country_code":"JP","type":"company","lineage":["https:\/\/openalex.org\/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Yusuke Fujita","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group, Japan"],"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group, Japan","institution_ids":["https:\/\/openalex.org\/I65143321"]}]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5001291873","display_name":"Shinji Watanabe","orcid":"https:\/\/orcid.org\/0000-0002-5970-8631"},"institutions":[{"id":"https:\/\/openalex.org\/I145311948","display_name":"Johns Hopkins University","ror":"https:\/\/ror.org\/00za53h95","country_code":"US","type":"education","lineage":["https:\/\/openalex.org\/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shinji Watanabe","raw_affiliation_strings":["Center for Language and Speech Processing, Johns Hopkins University, USA"],"affiliations":[{"raw_affiliation_string":"Center for Language and Speech Processing, Johns Hopkins University, USA","institution_ids":["https:\/\/openalex.org\/I145311948"]}]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5026324656","display_name":"Shota Horiguchi","orcid":"https:\/\/orcid.org\/0000-0002-3166-4956"},"institutions":[{"id":"https:\/\/openalex.org\/I65143321","display_name":"Hitachi (Japan)","ror":"https:\/\/ror.org\/02exqgm79","country_code":"JP","type":"company","lineage":["https:\/\/openalex.org\/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Shota Horiguchi","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group, Japan"],"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group, Japan","institution_ids":["https:\/\/openalex.org\/I65143321"]}]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5059858850","display_name":"Leibny Paola Garcia","orcid":"https:\/\/orcid.org\/0000-0002-7449-5726"},"institutions":[{"id":"https:\/\/openalex.org\/I145311948","display_name":"Johns Hopkins University","ror":"https:\/\/ror.org\/00za53h95","country_code":"US","type":"education","lineage":["https:\/\/openalex.org\/I145311948"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Paola Garcia","raw_affiliation_strings":["Center for Language and Speech Processing, Johns Hopkins University, USA"],"affiliations":[{"raw_affiliation_string":"Center for Language and Speech Processing, Johns Hopkins University, USA","institution_ids":["https:\/\/openalex.org\/I145311948"]}]},{"author_position":"last","author":{"id":"https:\/\/openalex.org\/A5076987349","display_name":"Kenji Nagamatsu","orcid":null},"institutions":[{"id":"https:\/\/openalex.org\/I65143321","display_name":"Hitachi (Japan)","ror":"https:\/\/ror.org\/02exqgm79","country_code":"JP","type":"company","lineage":["https:\/\/openalex.org\/I65143321"]}],"countries":["JP"],"is_corresponding":false,"raw_author_name":"Kenji Nagamatsu","raw_affiliation_strings":["Hitachi, Ltd. Research & Development Group, Japan"],"affiliations":[{"raw_affiliation_string":"Hitachi, Ltd. Research & Development Group, Japan","institution_ids":["https:\/\/openalex.org\/I65143321"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https:\/\/openalex.org\/A5087545401"],"corresponding_institution_ids":["https:\/\/openalex.org\/I65143321"],"apc_list":null,"apc_paid":null,"fwci":2.6904,"has_fulltext":false,"cited_by_count":22,"citation_normalized_percentile":{"value":0.91142055,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":96,"max":98},"biblio":{"volume":null,"issue":null,"first_page":"849","last_page":"856"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https:\/\/openalex.org\/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},"topics":[{"id":"https:\/\/openalex.org\/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998999834060669,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1702","display_name":"Artificial Intelligence"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T10860","display_name":"Speech and Audio Processing","score":0.9987000226974487,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1711","display_name":"Signal Processing"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T11309","display_name":"Music and Audio Processing","score":0.9961000084877014,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1711","display_name":"Signal Processing"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https:\/\/openalex.org\/keywords\/speaker-diarisation","display_name":"Speaker diarisation","score":0.9639265537261963},{"id":"https:\/\/openalex.org\/keywords\/computer-science","display_name":"Computer science","score":0.7853421568870544},{"id":"https:\/\/openalex.org\/keywords\/speech-recognition","display_name":"Speech recognition","score":0.6764222383499146},{"id":"https:\/\/openalex.org\/keywords\/leverage","display_name":"Leverage (statistics)","score":0.5659840703010559},{"id":"https:\/\/openalex.org\/keywords\/cluster-analysis","display_name":"Cluster analysis","score":0.5400242805480957},{"id":"https:\/\/openalex.org\/keywords\/speaker-recognition","display_name":"Speaker recognition","score":0.4882681369781494},{"id":"https:\/\/openalex.org\/keywords\/dependency","display_name":"Dependency (UML)","score":0.4804482161998749},{"id":"https:\/\/openalex.org\/keywords\/artificial-intelligence","display_name":"Artificial intelligence","score":0.4766407907009125},{"id":"https:\/\/openalex.org\/keywords\/probabilistic-logic","display_name":"Probabilistic logic","score":0.44461730122566223},{"id":"https:\/\/openalex.org\/keywords\/task","display_name":"Task (project management)","score":0.43200021982192993},{"id":"https:\/\/openalex.org\/keywords\/end-to-end-principle","display_name":"End-to-end principle","score":0.4164818823337555},{"id":"https:\/\/openalex.org\/keywords\/engineering","display_name":"Engineering","score":0.0661870539188385}],"concepts":[{"id":"https:\/\/openalex.org\/C149838564","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q7574248","display_name":"Speaker diarisation","level":3,"score":0.9639265537261963},{"id":"https:\/\/openalex.org\/C41008148","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q21198","display_name":"Computer science","level":0,"score":0.7853421568870544},{"id":"https:\/\/openalex.org\/C28490314","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q189436","display_name":"Speech recognition","level":1,"score":0.6764222383499146},{"id":"https:\/\/openalex.org\/C153083717","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5659840703010559},{"id":"https:\/\/openalex.org\/C73555534","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q622825","display_name":"Cluster analysis","level":2,"score":0.5400242805480957},{"id":"https:\/\/openalex.org\/C133892786","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q1145189","display_name":"Speaker recognition","level":2,"score":0.4882681369781494},{"id":"https:\/\/openalex.org\/C19768560","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q320727","display_name":"Dependency (UML)","level":2,"score":0.4804482161998749},{"id":"https:\/\/openalex.org\/C154945302","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4766407907009125},{"id":"https:\/\/openalex.org\/C49937458","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q2599292","display_name":"Probabilistic logic","level":2,"score":0.44461730122566223},{"id":"https:\/\/openalex.org\/C2780451532","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q759676","display_name":"Task (project management)","level":2,"score":0.43200021982192993},{"id":"https:\/\/openalex.org\/C74296488","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q2527392","display_name":"End-to-end principle","level":2,"score":0.4164818823337555},{"id":"https:\/\/openalex.org\/C127413603","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11023","display_name":"Engineering","level":0,"score":0.0661870539188385},{"id":"https:\/\/openalex.org\/C201995342","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q682496","display_name":"Systems engineering","level":1,"score":0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109\/slt48900.2021.9383555","is_oa":false,"landing_page_url":"https:\/\/doi.org\/10.1109\/slt48900.2021.9383555","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2021 IEEE Spoken Language Technology Workshop (SLT)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.5,"id":"https:\/\/metadata.un.org\/sdg\/4","display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":52,"referenced_works":["https:\/\/openalex.org\/W123007118","https:\/\/openalex.org\/W1965819578","https:\/\/openalex.org\/W2016589492","https:\/\/openalex.org\/W2081074144","https:\/\/openalex.org\/W2099679614","https:\/\/openalex.org\/W2137630452","https:\/\/openalex.org\/W2159591770","https:\/\/openalex.org\/W2165232124","https:\/\/openalex.org\/W2460742184","https:\/\/openalex.org\/W2586505867","https:\/\/openalex.org\/W2746574320","https:\/\/openalex.org\/W2753709519","https:\/\/openalex.org\/W2884584040","https:\/\/openalex.org\/W2884797218","https:\/\/openalex.org\/W2889418727","https:\/\/openalex.org\/W2890964092","https:\/\/openalex.org\/W2896538040","https:\/\/openalex.org\/W2913340405","https:\/\/openalex.org\/W2962788625","https:\/\/openalex.org\/W2963168538","https:\/\/openalex.org\/W2963403868","https:\/\/openalex.org\/W2963470929","https:\/\/openalex.org\/W2963745998","https:\/\/openalex.org\/W2963872552","https:\/\/openalex.org\/W2972449503","https:\/\/openalex.org\/W2972492143","https:\/\/openalex.org\/W2972765864","https:\/\/openalex.org\/W2972949456","https:\/\/openalex.org\/W2981608174","https:\/\/openalex.org\/W3004597053","https:\/\/openalex.org\/W3008357631","https:\/\/openalex.org\/W3010788072","https:\/\/openalex.org\/W3016400019","https:\/\/openalex.org\/W3020336359","https:\/\/openalex.org\/W3024085360","https:\/\/openalex.org\/W3025260599","https:\/\/openalex.org\/W3033627755","https:\/\/openalex.org\/W3034010994","https:\/\/openalex.org\/W3095212884","https:\/\/openalex.org\/W3105169251","https:\/\/openalex.org\/W3128434617","https:\/\/openalex.org\/W3154262773","https:\/\/openalex.org\/W3208240190","https:\/\/openalex.org\/W4385245566","https:\/\/openalex.org\/W6739901393","https:\/\/openalex.org\/W6751523768","https:\/\/openalex.org\/W6766542914","https:\/\/openalex.org\/W6769530546","https:\/\/openalex.org\/W6779069803","https:\/\/openalex.org\/W6779253965","https:\/\/openalex.org\/W6779752190","https:\/\/openalex.org\/W6802591457"],"related_works":["https:\/\/openalex.org\/W2206035908","https:\/\/openalex.org\/W2149220986","https:\/\/openalex.org\/W1493012537","https:\/\/openalex.org\/W4247736853","https:\/\/openalex.org\/W2162158162","https:\/\/openalex.org\/W1999004162","https:\/\/openalex.org\/W2125642021","https:\/\/openalex.org\/W1521049138","https:\/\/openalex.org\/W2023466863","https:\/\/openalex.org\/W2696990509"],"abstract_inverted_index":{"In":[0,35],"this":[1,36],"paper,":[2,37],"we":[3,47],"present":[4],"a":[5,49,59,100],"conditional":[6],"multitask":[7,51],"learning":[8,52],"method":[9,97],"for":[10],"end-to-end":[11],"neural":[12],"speaker":[13,56,70,83,105],"diarization":[14,57,71,115],"(EEND).":[15],"The":[16],"EEND":[17,45,110],"system":[18],"has":[19],"shown":[20],"promising":[21],"performance":[22,42],"compared":[23],"with":[24],"traditional":[25],"clustering-based":[26],"methods,":[27],"especially":[28],"in":[29,112],"the":[30,41,44,65,87],"case":[31],"of":[32,43,82,114],"overlapping":[33],"speech.":[34],"to":[38,102],"further":[39],"improve":[40],"system,":[46],"propose":[48],"novel":[50],"framework":[53],"that":[54,79,94],"solves":[55],"and":[58,76,107],"desired":[60],"subtask":[61,101],"while":[62],"explicitly":[63],"considering":[64],"task":[66],"dependency.":[67],"We":[68],"optimize":[69],"conditioned":[72],"on":[73,86],"speech":[74],"activity":[75],"overlap":[77],"detection":[78],"are":[80],"subtasks":[81],"diarization,":[84,106],"based":[85],"probabilistic":[88],"chain":[89],"rule.":[90],"Experimental":[91],"results":[92],"show":[93],"our":[95],"proposed":[96],"can":[98],"leverage":[99],"effectively":[103],"model":[104],"outperforms":[108],"conventional":[109],"systems":[111],"terms":[113],"error":[116],"rate.":[117]},"counts_by_year":[{"year":2025,"cited_by_count":3},{"year":2024,"cited_by_count":4},{"year":2023,"cited_by_count":5},{"year":2022,"cited_by_count":3},{"year":2021,"cited_by_count":7}],"updated_date":"2025-11-06T03:46:38.306776","created_date":"2025-10-10T00:00:00"}