Drive Alpaca to Elbrus (Part 2. Optimization)
What’s new
In the last article, I wrote about the launch of Alpaca on Elbrus. At the time of writing that article, there were no optimizations for Elbrus. However, now, thanks to the efforts of @troosh, we can test Elbrus with optimizations. ATTENTION! The llama.cpp project is updated very often, and a lot changes. For now, this is the most current version of llama.cpp for Elbrus.
And immediately tests
In the last article, I already described what I did. So here I will immediately start with the tests.
The test was run with the following command. This command will request a random joke and measure the request execution time.
for a in {1..8};do printf "%s;" $a;./main -t $a -m ./models/ggml-alpaca-7b-q4.bin -s 42 -p "Random joke:" -n 32 2>&1 |grep "llama_print_timings: eval time" | cut -d "(" -f 2 | grep -o -e "[0-9\.]*" ;done
Ryzen 7 5800H | Elbrus-16S | Elbrus-8SV |
707.81 | 903.02 | 1094.07 |
370.47 | 472.6 | 571.45 |
258.1 | 330.39 | 398.84 |
199.1 | 256.79 | 310.96 |
163.97 | 213.01 | 260.76 |
140.87 | 184.04 | 226.59 |
127.37 | 163.37 | 207.54 |
126.05 | 148.54 | 193.7 |
Benchmark
Benchmark tests were done with benchmark-q4_0-matmult, built with the command: make benchmark
1 thread | FLOPS_per_u_Second | |
Ryzen 7 5800H | 3200MHz | 40205.95 |
Elbrus-16S | 2000MHz | 22183.21 |
Elbrus-8SV | 1550MHz | 17452.88 |
And a multithreaded test
8 threads | FLOPS_per_u_Second | |
Ryzen 7 5800H | 3200MHz | 255353.06 |
Elbrus-16S | 2000MHz | 161953.14 |
Elbrus-8SV | 1550MHz | 50182.49 |
What was done
The ggml.c code was optimized for Elbrus and specifically for the Q4_0 model. Some clarification from @troosh. In this article, the tests were carried out on the Q4_0 model
I tried to optimize the work in the Q4_0 format for e2k processors with the 5th and higher version of the instruction system (for those with 128-bit registers), I post it here: https://github.com/E2Kports/llama.cpp
It is in this format that the benchmark matrix multiplication will be checked. But the model used in the article is converted to Q4_1, you should not expect acceleration on it. You need to take models in Q4_0, or wait until I finalize this format as well.
In general, the llama.cpp project is changing very quickly – I had to adapt a couple of times to new edits …
Well, a small code example
#if defined(__e2k__) && __iset__ >= 5
/*
 * Dot product of one Q4_0 quant block with 32 signed 8-bit "y" values,
 * for Elbrus (e2k) processors with instruction set version 5+ (128-bit
 * quad registers, __v2di).
 *
 *   bx        - 32 unsigned 4-bit quants packed as 16 bytes (two nibbles
 *               per byte); each value is stored with a +8 offset, so the
 *               real range is [-8 .. +7].
 *   by0, by1  - the 32 corresponding int8 "y" values, 16 bytes each.
 *
 * Returns a vector of 4 floats: partial horizontal sums of x[i]*y[i],
 * converted from 4 int32 accumulators (the caller presumably finishes
 * the reduction and applies the block scale — not visible here).
 */
static inline __v2di __attribute__((__always_inline__))
e2k_dot_4_0_8_0_quants(__v2di bx, __v2di by0, __v2di by1)
{
// Per-byte constants, replicated across the whole 128-bit register.
const __v2di lowMask = __builtin_e2k_qppackdl(0x0f0f0f0f0f0f0f0fLL,
0x0f0f0f0f0f0f0f0fLL);
// NOTE(review): "bais" is presumably a typo for "bias" — the +8 offset
// baked into Q4_0 storage that is subtracted out below.
const __v2di bais = __builtin_e2k_qppackdl(0x0808080808080808LL,
0x0808080808080808LL);
// 8 x int16 value 1, used to horizontally pair-sum int16 lanes into int32.
const __v2di ones = __builtin_e2k_qppackdl(0x0001000100010001LL,
0x0001000100010001LL);
// Unpack nibbles into individual bytes
__v2di bx0 = __builtin_e2k_qpand( bx, lowMask ); // {HLhl} -> {oLol}
__v2di bx1 = __builtin_e2k_qpsrlh( bx, 4 ); // {HLhl} -> {oHLh}
bx1 = __builtin_e2k_qpand( bx1, lowMask ); // -> {oHoh}
// The output vectors contains 32 bytes, each one in [ 0 .. 15 ] interval
// Reorder bytes in "y" block to order in bx0,bx1
// (byte-permute so that y elements line up with the low-nibble and
// high-nibble lanes produced above)
__v2di lo = __builtin_e2k_qppermb(by1, by0,
__builtin_e2k_qppackdl(0x1e1c1a1816141210LL,
0x0e0c0a0806040200LL));
__v2di hi = __builtin_e2k_qppermb(by1, by0,
__builtin_e2k_qppackdl(0x1f1d1b1917151311LL,
0x0f0d0b0907050301LL));
#if __iset__ >= 7
// v7 has a fused signed-byte dot-product instruction that accumulates
// straight into int32 lanes, so the whole multiply-add collapses to
// two instructions.
// Move each one in [ -8 .. +7 ] interval:
bx0 = __builtin_e2k_qpsubb(bx0, bais);
bx1 = __builtin_e2k_qpsubb(bx1, bais);
__v2di xy_int32 = __builtin_e2k_qpidotsbwss(bx0, lo, __builtin_e2k_qppackdl(0, 0));
xy_int32 = __builtin_e2k_qpidotsbwss(bx1, hi, xy_int32);
#else
// Pre-v7 path: qpmaddubsh multiplies unsigned * signed bytes, so the
// sign of x is transferred onto y first (abs(x) * sign(x)*y == x*y) —
// the same trick used with SSSE3 pmaddubsw in the x86 ggml code.
// Get absolute values of "x" vectors:
// (qppermb with this table acts as a per-byte abs — TODO(review):
// confirm against the e2k intrinsics reference)
__v2di ax0 = __builtin_e2k_qppermb(bx0 /* not used */,
__builtin_e2k_qppackdl(0x0706050403020100LL,
0x0102030405060708LL), bx0);
__v2di ax1 = __builtin_e2k_qppermb(bx1 /* not used */,
__builtin_e2k_qppackdl(0x0706050403020100LL,
0x0102030405060708LL), bx1);
// Move each one in [ -8 .. +7 ] interval:
bx0 = __builtin_e2k_qpsubb(bx0, bais);
bx1 = __builtin_e2k_qpsubb(bx1, bais);
// Sign the values of the y vectors
__v2di sy0 = __builtin_e2k_qpsignb(lo, bx0);
__v2di sy1 = __builtin_e2k_qpsignb(hi, bx1);
// Perform multiplication and create 16-bit values
__v2di dot0 = __builtin_e2k_qpmaddubsh(sy0, ax0);
__v2di dot1 = __builtin_e2k_qpmaddubsh(sy1, ax1);
// Reduce to 8 int16_t (overflow not possible: 8 bit * 4 bit => 12 bit)
__v2di dot = __builtin_e2k_qpaddh(dot0, dot1);
// Reduce to 4 int32_t by integer horizontal sums
__v2di xy_int32 = __builtin_e2k_qpmaddh(ones, dot);
#endif
// Convert vector of 4 int32_t to 4 floats
return __builtin_e2k_qpistofs(xy_int32);
}
Notably, this code includes a dedicated fast path for Elbrus v7 (instruction set version 7), which has a fused byte dot-product instruction.
Conclusion
At the moment, these are the best possible results, in the future we can make optimizations for the Q4_1 model.
Thanks to optimizations for the VLIW architecture, pretty good results can be achieved. Considering that the Ryzen 7 5800H is manufactured on a 7nm process and runs at 3200 MHz (with boost up to 4400 MHz), while the Elbrus-16S is manufactured on a 16nm process and runs at 2000 MHz (and the 8SV at only 1550 MHz), the results are quite good.