Skip to content

Commit a80fc76

Browse files
committed
provide further documentation as to the layout of AMX
1 parent 81bacbb commit a80fc76

File tree

1 file changed

+26
-1
lines changed

1 file changed

+26
-1
lines changed

src/ExtractTileOperations.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,31 @@
55
#include "IROperator.h"
66
#include "Util.h"
77

8+
/** \file Support extraction of AMX instructions. */
9+
10+
/**
11+
* https://asciiflow.com/#/share/eJyVUkFugzAQ%2FMrKxwoRhdAkza23SmlySHvogQsBp7FkbGSbAoryiz6nr%2BlLugZDk6ghKvJhbXZmd2b3QEScUbIQBece4XFNFVmQQ0SqiCwegtCLSI1RMBtjZGhl8BIRAHh%2BeoFVbBSr4Pq36ZOiSOBpX5cDCEikSGhuipjzun0pmdnD4%2BqtwX9%2Ffg2cLmUcTML76WyO4VAtWJ%2Ff7kIkWMEJ6gbBae2%2F3q53OHBuFBz3TS1HodPqfvUO3%2F4wO7gQag07IXqVkCuZU4VzyApuWI5BAJkdZ0K1B2ZP2%2BwJ%2FEs%2BjhKY0EYViWFSaMAaO6kypBY1hLCtDRIvMTvsekmlsc2kiGgKMw2cxqkGIyEGjn%2FlzonoIMjPUibeQX5Q1bHGisbav%2FBh2kHW2ESzdlaZkqUltaFd9UZ25TnIrIOg%2Bb7vQykLnv661GysRSaSF1k78HkHcaSbntSReLAtTL%2FscOlaI9rxYaRzzgwUOTrZeOCokLzN0TDqRYvUqtFwB6Fvqco9S5r%2BBCiqsWmNLHabzny2Y7E4PyJHcvwBx0t%2BJw%3D%3D
12+
*
13+
* LHS Matrix RHS Matrix
14+
*
15+
* K conceptually with AMX
16+
* ┌────────┐
17+
* │12345678│ N N*4
18+
*M │ │ ┌──┐ ┌────────┐
19+
* └────────┘ │1 │ K/4│1234 │
20+
* │2 │ │5678 │
21+
* To properly multiply 2 matrices, the │3 │ └────────┘
22+
* AMX instructions perform many 4 byte K│4 │
23+
* dot products; this leads to a lot of │5 │
24+
* striding over 4 byte areas. │6 │
25+
* Normally the row of the LHS matrix, │7 │
26+
* 123... would multiply with the column │8 │
27+
* of the RHS matrix 123..., but with AMX └──┘
28+
* this column is split up into K / 4 rows of 4 bytes each (so the RHS becomes a K/4 x N*4 matrix),
28+
* which then results in K/4 dot products per row.
30+
*
31+
*/
32+
833
namespace Halide {
934
namespace Internal {
1035

@@ -376,7 +401,7 @@ Matmul convert_to_matmul(const Store *op, const string &new_name, AMXOpType op_t
376401

377402
if (lhs_load && !rhs_broadcast) {
378403
// now working on a larger k dimension
379-
// with a K dimension of 4 (or 2) with bf16 all the elements in the right-hand matrix are
404+
// with a K dimension of 4 (or 2 with bf16), all the elements in the right-hand matrix are
380405
// laid out in a way that multiplying with a column can be done in a single dot product.
381406
// Therefore the indexing can be reused with a broadcast;
382407
// with higher K dimensions this can no longer be done and the broadcast won't exist.

0 commit comments

Comments
 (0)