{"id":"https:\/\/openalex.org\/W4396600347","doi":"https:\/\/doi.org\/10.48550\/arxiv.2404.19180","title":"MACO: Exploring GEMM Acceleration on a Loosely-Coupled Multi-core Processor","display_name":"MACO: Exploring GEMM Acceleration on a Loosely-Coupled Multi-core Processor","publication_year":2024,"publication_date":"2024-04-30","ids":{"openalex":"https:\/\/openalex.org\/W4396600347","doi":"https:\/\/doi.org\/10.48550\/arxiv.2404.19180"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2404.19180","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2404.19180","pdf_url":"https:\/\/arxiv.org\/pdf\/2404.19180","source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https:\/\/arxiv.org\/pdf\/2404.19180","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https:\/\/openalex.org\/A5059001985","display_name":"Bingcai Sui","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sui, Bingcai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5037428152","display_name":"Junzhong Shen","orcid":"https:\/\/orcid.org\/0000-0001-6233-6800"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shen, Junzhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5110631223","display_name":"Caixia Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Caixia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5084884735","display_name":"Junhui Wang","orcid":"https:\/\/orcid.org\/0000-0002-6225-8045"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Junhui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https:\/\/openalex.org\/A5101573079","display_name":"Zhong Zheng","orcid":"https:\/\/orcid.org\/0000-0002-2029-6107"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zheng, Zhong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https:\/\/openalex.org\/A5101723213","display_name":"Wei Guo","orcid":"https:\/\/orcid.org\/0000-0002-9769-6045"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Wei","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":6,"corresponding_author_ids":["https:\/\/openalex.org\/A5059001985"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https:\/\/openalex.org\/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9991999864578247,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1708","display_name":"Hardware and Architecture"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},"topics":[{"id":"https:\/\/openalex.org\/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9991999864578247,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1708","display_name":"Hardware and Architecture"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T10904","display_name":"Embedded Systems Design Techniques","score":0.979200005531311,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1708","display_name":"Hardware and Architecture"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}},{"id":"https:\/\/openalex.org\/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.9749000072479248,"subfield":{"id":"https:\/\/openalex.org\/subfields\/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https:\/\/openalex.org\/fields\/17","display_name":"Computer Science"},"domain":{"id":"https:\/\/openalex.org\/domains\/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https:\/\/openalex.org\/keywords\/acceleration","display_name":"Acceleration","score":0.6925166845321655},{"id":"https:\/\/openalex.org\/keywords\/core","display_name":"Core (optical fiber)","score":0.5999047756195068},{"id":"https:\/\/openalex.org\/keywords\/computer-science","display_name":"Computer science","score":0.5971307754516602},{"id":"https:\/\/openalex.org\/keywords\/many-core","display_name":"Many core","score":0.4584919512271881},{"id":"https:\/\/openalex.org\/keywords\/parallel-computing","display_name":"Parallel computing","score":0.4326492249965668},{"id":"https:\/\/openalex.org\/keywords\/multi-core-processor","display_name":"Multi-core processor","score":0.43060046434402466},{"id":"https:\/\/openalex.org\/keywords\/physics","display_name":"Physics","score":0.1853621006011963},{"id":"https:\/\/openalex.org\/keywords\/telecommunications","display_name":"Telecommunications","score":0.09744170308113098},{"id":"https:\/\/openalex.org\/keywords\/classical-mechanics","display_name":"Classical mechanics","score":0.07680609822273254}],"concepts":[{"id":"https:\/\/openalex.org\/C117896860","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11376","display_name":"Acceleration","level":2,"score":0.6925166845321655},{"id":"https:\/\/openalex.org\/C2164484","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q5170150","display_name":"Core (optical fiber)","level":2,"score":0.5999047756195068},{"id":"https:\/\/openalex.org\/C41008148","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q21198","display_name":"Computer science","level":0,"score":0.5971307754516602},{"id":"https:\/\/openalex.org\/C3020431745","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q25325220","display_name":"Many core","level":2,"score":0.4584919512271881},{"id":"https:\/\/openalex.org\/C173608175","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q232661","display_name":"Parallel computing","level":1,"score":0.4326492249965668},{"id":"https:\/\/openalex.org\/C78766204","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q555032","display_name":"Multi-core processor","level":2,"score":0.43060046434402466},{"id":"https:\/\/openalex.org\/C121332964","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q413","display_name":"Physics","level":0,"score":0.1853621006011963},{"id":"https:\/\/openalex.org\/C76155785","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q418","display_name":"Telecommunications","level":1,"score":0.09744170308113098},{"id":"https:\/\/openalex.org\/C74650414","wikidata":"https:\/\/www.wikidata.org\/wiki\/Q11397","display_name":"Classical mechanics","level":1,"score":0.07680609822273254}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2404.19180","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2404.19180","pdf_url":"https:\/\/arxiv.org\/pdf\/2404.19180","source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550\/arxiv.2404.19180","is_oa":true,"landing_page_url":"https:\/\/doi.org\/10.48550\/arxiv.2404.19180","pdf_url":null,"source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2404.19180","is_oa":true,"landing_page_url":"http:\/\/arxiv.org\/abs\/2404.19180","pdf_url":"https:\/\/arxiv.org\/pdf\/2404.19180","source":{"id":"https:\/\/openalex.org\/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https:\/\/openalex.org\/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https:\/\/openalex.org\/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https:\/\/openalex.org\/G3737870217","display_name":null,"funder_award_id":"Grant No. U23A20301","funder_id":"https:\/\/openalex.org\/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https:\/\/openalex.org\/G4699261488","display_name":null,"funder_award_id":"U23A20301","funder_id":"https:\/\/openalex.org\/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https:\/\/openalex.org\/G6058138561","display_name":null,"funder_award_id":", No.","funder_id":"https:\/\/openalex.org\/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https:\/\/openalex.org\/G6510479079","display_name":null,"funder_award_id":"U23A2030","funder_id":"https:\/\/openalex.org\/F4320321001","funder_display_name":"National Natural Science Foundation of China"},{"id":"https:\/\/openalex.org\/G6676365692","display_name":null,"funder_award_id":"U23A203","funder_id":"https:\/\/openalex.org\/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https:\/\/openalex.org\/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https:\/\/ror.org\/01h0zpd94"},{"id":"https:\/\/openalex.org\/F4320324150","display_name":"National University of Defense Technology","ror":"https:\/\/ror.org\/05d2yfz11"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https:\/\/content.openalex.org\/works\/W4396600347.pdf","grobid_xml":"https:\/\/content.openalex.org\/works\/W4396600347.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https:\/\/openalex.org\/W1993191611","https:\/\/openalex.org\/W2023938924","https:\/\/openalex.org\/W2918840249","https:\/\/openalex.org\/W1991859582","https:\/\/openalex.org\/W2110053126","https:\/\/openalex.org\/W2079303253","https:\/\/openalex.org\/W2104702637","https:\/\/openalex.org\/W4248099758","https:\/\/openalex.org\/W2394430006","https:\/\/openalex.org\/W4255057712"],"abstract_inverted_index":{"General-purpose":[0],"processor":[1],"vendors":[2],"have":[3],"integrated":[4],"customized":[5],"accelerator":[6],"in":[7],"their":[8],"products":[9],"due":[10],"to":[11,26,36,38,92,135,145],"the":[12,29,39,62,68,77,95],"widespread":[13],"use":[14],"of":[15,32,66,98,117],"General":[16],"Matrix-Matrix":[17],"Multiplication":[18],"(GEMM)":[19],"kernels.":[20],"However,":[21],"it":[22],"remains":[23],"a":[24,50,71],"challenge":[25],"further":[27,93],"improve":[28],"flexibilityand":[30],"scalability":[31],"these":[33],"GEMM-enhanced":[34],"processors":[35],"cater":[37],"emerging":[40],"large-scale":[41],"GEMM":[42,101],"workloads.":[43,102,148],"In":[44],"this":[45],"paper":[46,69,78],"we":[47],"propose":[48],"MACO,":[49,67],"novel":[51],"loosely-coupled":[52],"multi-core":[53],"general-purpose":[54],"architecture":[55],"optimized":[56],"for":[57,100],"GEMM-related":[58],"applications.":[59],"To":[60],"enhance":[61,94],"programmability":[63],"and":[64,86,88],"flexibility":[65],"introduces":[70],"tile-based":[72],"instruction":[73],"set":[74],"architecture.":[75],"Additionally,":[76],"presents":[79],"techniques":[80],"such":[81],"as":[82],"hardware-assisted":[83],"data":[84],"prefetching":[85],"locking,":[87],"predictive":[89],"address":[90],"translation":[91],"computational":[96,115,140],"efficiency":[97,116],"MACO":[99,108,131],"The":[103],"experimental":[104],"results":[105],"demonstrate":[106],"that":[107,130],"exhibits":[109],"good":[110],"scalability,":[111],"achieving":[112],"an":[113],"average":[114],"90%":[118],"across":[119],"multiple":[120],"cores.":[121],"Furthermore,":[122],"evaluations":[123],"on":[124],"state-of-the-art":[125],"deep":[126,146],"neural":[127],"networks":[128],"show":[129],"can":[132],"achieve":[133],"up":[134],"1.1":[136],"TFLOPS":[137],"with":[138],"88%":[139],"efficiency,":[141],"indicating":[142],"its":[143],"adaptivity":[144],"learning":[147]},"counts_by_year":[],"updated_date":"2026-04-21T08:09:41.155169","created_date":"2024-05-03T00:00:00"}