|
5 | 5 | #include "IROperator.h" |
6 | 6 | #include "Util.h" |
7 | 7 |
|
| 8 | +/** \file Support extraction of AMX instructions. */ |
| 9 | + |
| 10 | +/** |
| 11 | + * https://asciiflow.com/#/share/eJyVUkFugzAQ%2FMrKxwoRhdAkza23SmlySHvogQsBp7FkbGSbAoryiz6nr%2BlLugZDk6ghKvJhbXZmd2b3QEScUbIQBece4XFNFVmQQ0SqiCwegtCLSI1RMBtjZGhl8BIRAHh%2BeoFVbBSr4Pq36ZOiSOBpX5cDCEikSGhuipjzun0pmdnD4%2BqtwX9%2Ffg2cLmUcTML76WyO4VAtWJ%2Ff7kIkWMEJ6gbBae2%2F3q53OHBuFBz3TS1HodPqfvUO3%2F4wO7gQag07IXqVkCuZU4VzyApuWI5BAJkdZ0K1B2ZP2%2BwJ%2FEs%2BjhKY0EYViWFSaMAaO6kypBY1hLCtDRIvMTvsekmlsc2kiGgKMw2cxqkGIyEGjn%2FlzonoIMjPUibeQX5Q1bHGisbav%2FBh2kHW2ESzdlaZkqUltaFd9UZ25TnIrIOg%2Bb7vQykLnv661GysRSaSF1k78HkHcaSbntSReLAtTL%2FscOlaI9rxYaRzzgwUOTrZeOCokLzN0TDqRYvUqtFwB6Fvqco9S5r%2BBCiqsWmNLHabzny2Y7E4PyJHcvwBx0t%2BJw%3D%3D) |
| 12 | + * |
| 13 | + * LHS Matrix RHS Matrix |
| 14 | + * |
| 15 | + * K conceptually with AMX |
| 16 | + * ┌────────┐ |
| 17 | + * │12345678│ N N*4 |
| 18 | + *M │ │ ┌──┐ ┌────────┐ |
| 19 | + * └────────┘ │1 │ K/4│1234 │ |
| 20 | + * │2 │ │5678 │ |
| 21 | + * To properly multiply 2 matrices, the │3 │ └────────┘ |
| 22 | + * AMX instructions perform many 4 byte K│4 │ |
| 23 | + * dot products, this leads to a lot of │5 │ |
| 24 | + * striding over 4 byte areas. │6 │ |
| 25 | + * Normally the row of the LHS matrix, │7 │ |
| 26 | + * 123... would multiply with the column │8 │ |
| 27 | + * of the RHS matrix 123..., but with AMX └──┘ |
 | 28 | + * this column is split up into a matrix of columns / 4 bytes and rows * 4, |
 | 29 | + * which then results in K/4 dot products per row. |
| 30 | + * |
| 31 | + */ |
| 32 | + |
8 | 33 | namespace Halide { |
9 | 34 | namespace Internal { |
10 | 35 |
|
@@ -376,7 +401,7 @@ Matmul convert_to_matmul(const Store *op, const string &new_name, AMXOpType op_t |
376 | 401 |
|
377 | 402 | if (lhs_load && !rhs_broadcast) { |
378 | 403 | // now working on a larger k dimension |
379 | | - // with a K dimension of 4 (or 2) with bf16 all the elements in the right-hand matrix are |
| 404 | + // with a K dimension of 4 (or 2) with bf16 all the elements in the right-hand matrix are |
380 | 405 | // laid out in a way that multiplying with a column can be done in a single dot product. |
381 | 406 | // Therefore the indexing can be reused with a broadcast; |
382 | 407 | // with higher K dimensions this can no longer be done and the broadcast won't exist. |
|
0 commit comments