From: Felix Fietkau Date: Mon, 4 Dec 2017 21:44:33 +0000 (+0100) Subject: kernel: unroll MIPS r4k cache blast function X-Git-Url: http://git.lede-project.org./?a=commitdiff_plain;h=4e8f1e9f4ca088542fd2b861ea2f1a9dca0d845f;p=openwrt%2Fstaging%2Fluka.git kernel: unroll MIPS r4k cache blast function Optimize the compiler output for larger cache blast cases that are common for DMA-based networking. On ar71xx, I measured a routing throughput increase of ~8% Signed-off-by: Ben Menchaca Signed-off-by: Rosen Penev Signed-off-by: Felix Fietkau --- diff --git a/target/linux/brcm47xx/patches-4.9/159-cpu_fixes.patch b/target/linux/brcm47xx/patches-4.9/159-cpu_fixes.patch index 36d39fa892..3102923a6b 100644 --- a/target/linux/brcm47xx/patches-4.9/159-cpu_fixes.patch +++ b/target/linux/brcm47xx/patches-4.9/159-cpu_fixes.patch @@ -204,7 +204,7 @@ #define __BUILD_BLAST_USER_CACHE(pfx, desc, indexop, hitop, lsize) \ static inline void blast_##pfx##cache##lsize##_user_page(unsigned long page) \ -@@ -660,17 +744,19 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde +@@ -660,53 +744,23 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde __BUILD_BLAST_USER_CACHE(i, icache, Index_Invalidate_I, Hit_Invalidate_I, 64) /* build blast_xxx_range, protected_blast_xxx_range */ @@ -214,18 +214,59 @@ unsigned long end) \ { \ unsigned long lsize = cpu_##desc##_line_size(); \ +- unsigned long lsize_2 = lsize * 2; \ +- unsigned long lsize_3 = lsize * 3; \ +- unsigned long lsize_4 = lsize * 4; \ +- unsigned long lsize_5 = lsize * 5; \ +- unsigned long lsize_6 = lsize * 6; \ +- unsigned long lsize_7 = lsize * 7; \ +- unsigned long lsize_8 = lsize * 8; \ unsigned long addr = start & ~(lsize - 1); \ - unsigned long aend = (end - 1) & ~(lsize - 1); \ +- unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \ +- int lines = (aend - addr) / lsize; \ ++ unsigned long aend = (end - 1) & ~(lsize - 1); \ + war \ \ __##pfx##flush_prologue \ \ - while (1) { \ +- while (lines >= 8) { \ +- prot##cache_op(hitop, addr); \ +- prot##cache_op(hitop, addr + lsize); \ +- prot##cache_op(hitop, addr + lsize_2); \ +- prot##cache_op(hitop, addr + lsize_3); \ +- prot##cache_op(hitop, addr + lsize_4); \ +- prot##cache_op(hitop, addr + lsize_5); \ +- prot##cache_op(hitop, addr + lsize_6); \ +- prot##cache_op(hitop, addr + lsize_7); \ +- addr += lsize_8; \ +- lines -= 8; \ +- } \ +- \ +- if (lines & 0x4) { \ +- prot##cache_op(hitop, addr); \ +- prot##cache_op(hitop, addr + lsize); \ +- prot##cache_op(hitop, addr + lsize_2); \ +- prot##cache_op(hitop, addr + lsize_3); \ +- addr += lsize_4; \ +- } \ +- \ +- if (lines & 0x2) { \ +- prot##cache_op(hitop, addr); \ +- prot##cache_op(hitop, addr + lsize); \ +- addr += lsize_2; \ +- } \ +- \ +- if (lines & 0x1) { \ ++ while (1) { \ + war2 \ prot##cache_op(hitop, addr); \ - if (addr == aend) \ - break; \ -@@ -682,8 +768,8 @@ static inline void prot##extra##blast_## ++ if (addr == aend) \ ++ break; \ ++ addr += lsize; \ + } \ + \ + __##pfx##flush_epilogue \ +@@ -714,8 +768,8 @@ static inline void prot##extra##blast_## #ifndef CONFIG_EVA @@ -236,7 +277,7 @@ #else -@@ -720,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache +@@ -752,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache __BUILD_PROT_BLAST_CACHE_RANGE(i, icache, Hit_Invalidate_I) #endif diff --git a/target/linux/generic/hack-4.14/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch b/target/linux/generic/hack-4.14/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch new file mode 100644 index 0000000000..860a7e03c5 --- /dev/null +++ b/target/linux/generic/hack-4.14/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch @@ -0,0 +1,66 @@ +From: Ben Menchaca +Date: Fri, 7 Jun 2013 18:35:22 -0500 +Subject: MIPS: r4k_cache: use more efficient cache blast + +Optimize the compiler output for larger cache blast cases that are +common for DMA-based networking. + +Signed-off-by: Ben Menchaca +Signed-off-by: Felix Fietkau +--- +--- a/arch/mips/include/asm/r4kcache.h ++++ b/arch/mips/include/asm/r4kcache.h +@@ -682,16 +682,48 @@ static inline void prot##extra##blast_## + unsigned long end) \ + { \ + unsigned long lsize = cpu_##desc##_line_size(); \ ++ unsigned long lsize_2 = lsize * 2; \ ++ unsigned long lsize_3 = lsize * 3; \ ++ unsigned long lsize_4 = lsize * 4; \ ++ unsigned long lsize_5 = lsize * 5; \ ++ unsigned long lsize_6 = lsize * 6; \ ++ unsigned long lsize_7 = lsize * 7; \ ++ unsigned long lsize_8 = lsize * 8; \ + unsigned long addr = start & ~(lsize - 1); \ +- unsigned long aend = (end - 1) & ~(lsize - 1); \ ++ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \ ++ int lines = (aend - addr) / lsize; \ + \ + __##pfx##flush_prologue \ + \ +- while (1) { \ ++ while (lines >= 8) { \ ++ prot##cache_op(hitop, addr); \ ++ prot##cache_op(hitop, addr + lsize); \ ++ prot##cache_op(hitop, addr + lsize_2); \ ++ prot##cache_op(hitop, addr + lsize_3); \ ++ prot##cache_op(hitop, addr + lsize_4); \ ++ prot##cache_op(hitop, addr + lsize_5); \ ++ prot##cache_op(hitop, addr + lsize_6); \ ++ prot##cache_op(hitop, addr + lsize_7); \ ++ addr += lsize_8; \ ++ lines -= 8; \ ++ } \ ++ \ ++ if (lines & 0x4) { \ ++ prot##cache_op(hitop, addr); \ ++ prot##cache_op(hitop, addr + lsize); \ ++ prot##cache_op(hitop, addr + lsize_2); \ ++ prot##cache_op(hitop, addr + lsize_3); \ ++ addr += lsize_4; \ ++ } \ ++ \ ++ if (lines & 0x2) { \ ++ prot##cache_op(hitop, addr); \ ++ prot##cache_op(hitop, addr + lsize); \ ++ addr += lsize_2; \ ++ } \ ++ \ ++ if (lines & 0x1) { \ + prot##cache_op(hitop, addr); \ +- if (addr == aend) \ +- break; \ +- addr += lsize; \ + } \ + \ + __##pfx##flush_epilogue \ diff --git a/target/linux/generic/hack-4.9/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch b/target/linux/generic/hack-4.9/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch new file mode 100644 index 0000000000..ce7901af40 --- /dev/null +++ b/target/linux/generic/hack-4.9/300-MIPS-r4k_cache-use-more-efficient-cache-blast.patch @@ -0,0 +1,66 @@ +From: Ben Menchaca +Date: Fri, 7 Jun 2013 18:35:22 -0500 +Subject: MIPS: r4k_cache: use more efficient cache blast + +Optimize the compiler output for larger cache blast cases that are +common for DMA-based networking. + +Signed-off-by: Ben Menchaca +Signed-off-by: Felix Fietkau +--- +--- a/arch/mips/include/asm/r4kcache.h ++++ b/arch/mips/include/asm/r4kcache.h +@@ -665,16 +665,48 @@ static inline void prot##extra##blast_##pfx##cache##_range(unsigned long start, + unsigned long end) \ + { \ + unsigned long lsize = cpu_##desc##_line_size(); \ ++ unsigned long lsize_2 = lsize * 2; \ ++ unsigned long lsize_3 = lsize * 3; \ ++ unsigned long lsize_4 = lsize * 4; \ ++ unsigned long lsize_5 = lsize * 5; \ ++ unsigned long lsize_6 = lsize * 6; \ ++ unsigned long lsize_7 = lsize * 7; \ ++ unsigned long lsize_8 = lsize * 8; \ + unsigned long addr = start & ~(lsize - 1); \ +- unsigned long aend = (end - 1) & ~(lsize - 1); \ ++ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \ ++ int lines = (aend - addr) / lsize; \ + \ + __##pfx##flush_prologue \ + \ +- while (1) { \ ++ while (lines >= 8) { \ ++ prot##cache_op(hitop, addr); \ ++ prot##cache_op(hitop, addr + lsize); \ ++ prot##cache_op(hitop, addr + lsize_2); \ ++ prot##cache_op(hitop, addr + lsize_3); \ ++ prot##cache_op(hitop, addr + lsize_4); \ ++ prot##cache_op(hitop, addr + lsize_5); \ ++ prot##cache_op(hitop, addr + lsize_6); \ ++ prot##cache_op(hitop, addr + lsize_7); \ ++ addr += lsize_8; \ ++ lines -= 8; \ ++ } \ ++ \ ++ if (lines & 0x4) { \ ++ prot##cache_op(hitop, addr); \ ++ prot##cache_op(hitop, addr + lsize); \ ++ prot##cache_op(hitop, addr + lsize_2); \ ++ prot##cache_op(hitop, addr + lsize_3); \ ++ addr += lsize_4; \ ++ } \ ++ \ ++ if (lines & 0x2) { \ ++ prot##cache_op(hitop, addr); \ ++ prot##cache_op(hitop, addr + lsize); \ ++ addr += lsize_2; \ ++ } \ ++ \ ++ if (lines & 0x1) { \ + prot##cache_op(hitop, addr); \ +- if (addr == aend) \ +- break; \ +- addr += lsize; \ + } \ + \ + __##pfx##flush_epilogue \