From: Daniel Borkmann Date: Sat, 24 Feb 2018 00:07:59 +0000 (+0100) Subject: bpf, x64: save several bytes by using mov over movabsq when possible X-Git-Url: http://git.lede-project.org./?a=commitdiff_plain;h=6fe8b9c1f41d;p=openwrt%2Fstaging%2Fblogic.git bpf, x64: save several bytes by using mov over movabsq when possible While analyzing some of the more complex BPF programs from Cilium, I found that LLVM generally prefers to emit LD_IMM64 instead of MOV32 BPF instructions for loading unsigned 32-bit immediates into a register. Given we cannot change the current/stable LLVM versions that are already out there, lets optimize this case such that the JIT prefers to emit 'mov %eax, imm32' over 'movabsq %rax, imm64' whenever suitable in order to reduce the image size by 4-5 bytes per such load in the typical case, reducing image size on some of the bigger programs by up to 4%. emit_mov_imm32() and emit_mov_imm64() have been added as helpers. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4bc36bd1b97a..f3e5cd8c1e68 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -60,7 +60,12 @@ static bool is_imm8(int value) static bool is_simm32(s64 value) { - return value == (s64) (s32) value; + return value == (s64)(s32)value; +} + +static bool is_uimm32(u64 value) +{ + return value == (u64)(u32)value; } /* mov dst, src */ @@ -355,6 +360,68 @@ static void emit_load_skb_data_hlen(u8 **pprog) *pprog = prog; } +static void emit_mov_imm32(u8 **pprog, bool sign_propagate, + u32 dst_reg, const u32 imm32) +{ + u8 *prog = *pprog; + u8 b1, b2, b3; + int cnt = 0; + + /* optimization: if imm32 is positive, use 'mov %eax, imm32' + * (which zero-extends imm32) to save 2 bytes. + */ + if (sign_propagate && (s32)imm32 < 0) { + /* 'mov %rax, imm32' sign extends imm32 */ + b1 = add_1mod(0x48, dst_reg); + b2 = 0xC7; + b3 = 0xC0; + EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); + goto done; + } + + /* optimization: if imm32 is zero, use 'xor %eax, %eax' + * to save 3 bytes. + */ + if (imm32 == 0) { + if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + b2 = 0x31; /* xor */ + b3 = 0xC0; + EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); + goto done; + } + + /* mov %eax, imm32 */ + if (is_ereg(dst_reg)) + EMIT1(add_1mod(0x40, dst_reg)); + EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); +done: + *pprog = prog; +} + +static void emit_mov_imm64(u8 **pprog, u32 dst_reg, + const u32 imm32_hi, const u32 imm32_lo) +{ + u8 *prog = *pprog; + int cnt = 0; + + if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { + /* For emitting plain u32, where sign bit must not be + * propagated LLVM tends to load imm64 over mov32 + * directly, so save couple of bytes by just doing + * 'mov %eax, imm32' instead. + */ + emit_mov_imm32(&prog, false, dst_reg, imm32_lo); + } else { + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(imm32_lo, 4); + EMIT(imm32_hi, 4); + } + + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -377,7 +444,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, const s32 imm32 = insn->imm; u32 dst_reg = insn->dst_reg; u32 src_reg = insn->src_reg; - u8 b1 = 0, b2 = 0, b3 = 0; + u8 b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; bool reload_skb_data; @@ -485,58 +552,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; case BPF_ALU64 | BPF_MOV | BPF_K: - /* optimization: if imm32 is positive, - * use 'mov eax, imm32' (which zero-extends imm32) - * to save 2 bytes - */ - if (imm32 < 0) { - /* 'mov rax, imm32' sign extends imm32 */ - b1 = add_1mod(0x48, dst_reg); - b2 = 0xC7; - b3 = 0xC0; - EMIT3_off32(b1, b2, add_1reg(b3, dst_reg), imm32); - break; - } - case BPF_ALU | BPF_MOV | BPF_K: - /* optimization: if imm32 is zero, use 'xor ,' - * to save 3 bytes. - */ - if (imm32 == 0) { - if (is_ereg(dst_reg)) - EMIT1(add_2mod(0x40, dst_reg, dst_reg)); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT2(b2, add_2reg(b3, dst_reg, dst_reg)); - break; - } - - /* mov %eax, imm32 */ - if (is_ereg(dst_reg)) - EMIT1(add_1mod(0x40, dst_reg)); - EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); + emit_mov_imm32(&prog, BPF_CLASS(insn->code) == BPF_ALU64, + dst_reg, imm32); break; case BPF_LD | BPF_IMM | BPF_DW: - /* optimization: if imm64 is zero, use 'xor ,' - * to save 7 bytes. - */ - if (insn[0].imm == 0 && insn[1].imm == 0) { - b1 = add_2mod(0x48, dst_reg, dst_reg); - b2 = 0x31; /* xor */ - b3 = 0xC0; - EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg)); - - insn++; - i++; - break; - } - - /* movabsq %rax, imm64 */ - EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); - EMIT(insn[0].imm, 4); - EMIT(insn[1].imm, 4); - + emit_mov_imm64(&prog, dst_reg, insn[1].imm, insn[0].imm); insn++; i++; break; @@ -604,7 +626,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT_mov(BPF_REG_0, src_reg); else /* mov rax, imm32 */ - EMIT3_off32(0x48, 0xC7, 0xC0, imm32); + emit_mov_imm32(&prog, true, + BPF_REG_0, imm32); if (BPF_CLASS(insn->code) == BPF_ALU64) EMIT1(add_1mod(0x48, AUX_REG));