From d4a80984fcf1066e244a8909354df03ba1e3f786 Mon Sep 17 00:00:00 2001 From: Duane Sand Date: Fri, 12 Oct 2012 14:25:19 -0700 Subject: [PATCH] [MIPS] Benchmark test for MIPS memset16/memset32 Author: Chris Dearman AuthorDate: 2011-07-26 19:24:40 Change-Id: I0a7b3360aaf45de8ee47744f7031b84f15f37611 --- libcutils/Android.mk | 2 + libcutils/tests/Android.mk | 1 + libcutils/tests/memset_mips/Android.mk | 23 ++ .../tests/memset_mips/android_memset_dumb.S | 36 +++ .../tests/memset_mips/android_memset_test.S | 152 +++++++++++ libcutils/tests/memset_mips/memset_cmips.S | 227 +++++++++++++++++ libcutils/tests/memset_mips/memset_omips.S | 90 +++++++ libcutils/tests/memset_mips/test_memset.c | 235 ++++++++++++++++++ 8 files changed, 766 insertions(+) create mode 100644 libcutils/tests/Android.mk create mode 100644 libcutils/tests/memset_mips/Android.mk create mode 100644 libcutils/tests/memset_mips/android_memset_dumb.S create mode 100644 libcutils/tests/memset_mips/android_memset_test.S create mode 100644 libcutils/tests/memset_mips/memset_cmips.S create mode 100644 libcutils/tests/memset_mips/memset_omips.S create mode 100644 libcutils/tests/memset_mips/test_memset.c diff --git a/libcutils/Android.mk b/libcutils/Android.mk index d9bd8d83f..fbec13169 100644 --- a/libcutils/Android.mk +++ b/libcutils/Android.mk @@ -158,3 +158,5 @@ LOCAL_SRC_FILES := str_parms.c hashmap.c memory.c LOCAL_SHARED_LIBRARIES := liblog LOCAL_MODULE_TAGS := optional include $(BUILD_EXECUTABLE) + +include $(call all-makefiles-under,$(LOCAL_PATH)) diff --git a/libcutils/tests/Android.mk b/libcutils/tests/Android.mk new file mode 100644 index 000000000..65711611c --- /dev/null +++ b/libcutils/tests/Android.mk @@ -0,0 +1 @@ +include $(all-subdir-makefiles) diff --git a/libcutils/tests/memset_mips/Android.mk b/libcutils/tests/memset_mips/Android.mk new file mode 100644 index 000000000..c22fca94b --- /dev/null +++ b/libcutils/tests/memset_mips/Android.mk @@ -0,0 +1,23 @@ +# Copyright 2012 The Android Open Source Project + +ifeq ($(TARGET_ARCH),mips) + +LOCAL_PATH:= $(call my-dir) +include $(CLEAR_VARS) + +LOCAL_SRC_FILES:= \ + test_memset.c \ + android_memset_dumb.S \ + android_memset_test.S \ + memset_cmips.S \ + memset_omips.S + +LOCAL_MODULE:= test_memset + +LOCAL_FORCE_STATIC_EXECUTABLE := true +LOCAL_STATIC_LIBRARIES := libcutils libc +LOCAL_MODULE_TAGS := tests + +include $(BUILD_EXECUTABLE) + +endif diff --git a/libcutils/tests/memset_mips/android_memset_dumb.S b/libcutils/tests/memset_mips/android_memset_dumb.S new file mode 100644 index 000000000..c8a1a37ae --- /dev/null +++ b/libcutils/tests/memset_mips/android_memset_dumb.S @@ -0,0 +1,36 @@ + .global android_memset16_dumb + .type android_memset16_dumb, @function +android_memset16_dumb: + .ent android_memset16_dumb + + .set noreorder + beqz $a2,9f + srl $a2,1 + +1: sh $a1,($a0) + subu $a2,1 + bnez $a2,1b + addu $a0,2 + .set reorder + +9: j $ra + .end android_memset16_dumb + .size android_memset16_dumb,.-android_memset16_dumb + + .global android_memset32_dumb + .type android_memset32_dumb, @function +android_memset32_dumb: + .ent android_memset32_dumb + .set noreorder + beqz $a2,9f + srl $a2,2 + +1: sw $a1,($a0) + subu $a2,1 + bnez $a2,1b + addu $a0,4 + .set reorder + +9: j $ra + .end android_memset32_dumb + .size android_memset32_dumb,.-android_memset32_dumb diff --git a/libcutils/tests/memset_mips/android_memset_test.S b/libcutils/tests/memset_mips/android_memset_test.S new file mode 100644 index 000000000..e9188438a --- /dev/null +++ 
b/libcutils/tests/memset_mips/android_memset_test.S @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2006 The android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef NDEBUG +#define DBG # +#else +#define DBG +#endif + + .text + .align + + /* + * Optimized memset16 for MIPS + * + * void android_memset16_test(uint16_t* dst, uint16_t value, size_t size); + * + */ + + .global android_memset16_test + .type android_memset16_test, @function +android_memset16_test: + .ent android_memset16_test + .set noreorder + + /* Check parameters */ +DBG andi $t0,$a0,1 /* $a0 must be halfword aligned */ +DBG tne $t0 +DBG lui $t1,0xffff /* $a1 must be 16bits */ +DBG and $t1,$a1 +DBG tne $t1 +DBG andi $t2,$a2,1 /* $a2 must be even */ +DBG tne $t2 + +#if (__mips==32) && (__mips_isa_rev>=2) + ins $a2,$0,0,1 +#else + li $t0,~1 + and $a2,$t0 +#endif + + move $t8,$ra + blez $a2,9f /* Anything to do? */ + andi $t0,$a0,2 /* Check dst alignment */ + /* Expand value to 32 bits and check destination alignment */ +#if (__mips==32) && (__mips_isa_rev>=2) + beqz $t0,.Laligned32 /* dst is 32 bit aligned */ + ins $a1,$a1,16,16 +#else + sll $t2,$a1,16 + beqz $t0,.Laligned32 /* dst is 32 bit aligned */ + or $a1,$t2 +#endif + sh $a1,($a0) /* do one halfword to get aligned */ + subu $a2,2 + addu $a0,2 + +.Laligned32: + and $t1,$a2,63 /* is there enough left to do a full 64 byte loop? 
*/ + beq $a2,$t1,1f + subu $t2,$a2,$t1 /* $t2 is the number of bytes to do in loop64 */ + addu $t3,$a0,$t2 /* $t3 is the end marker for loop64 */ + subu $a2,$t2 +.Lloop64: + addu $a0,64 + sw $a1,-64($a0) + sw $a1,-60($a0) + sw $a1,-56($a0) + sw $a1,-52($a0) + sw $a1,-48($a0) + sw $a1,-44($a0) + sw $a1,-40($a0) + sw $a1,-36($a0) + sw $a1,-32($a0) + sw $a1,-28($a0) + sw $a1,-24($a0) + sw $a1,-20($a0) + sw $a1,-16($a0) + sw $a1,-12($a0) + sw $a1,-8($a0) + bne $a0,$t3,.Lloop64 + sw $a1,-4($a0) + + /* Do the last 0..62 bytes */ +1: li $t0,64+12 + andi $t1,$a2,0x3c /* $t1 how many bytes to store using sw */ + bal 1f + subu $t0,$t1 /* 64+12-$t0 is offset to jump from 1f */ +1: addu $ra,$t0 + j $ra + subu $a2,$t1 +2: sw $a1,60($a0) + sw $a1,56($a0) + sw $a1,52($a0) + sw $a1,48($a0) + sw $a1,44($a0) + sw $a1,40($a0) + sw $a1,36($a0) + sw $a1,32($a0) + sw $a1,28($a0) + sw $a1,24($a0) + sw $a1,20($a0) + sw $a1,16($a0) + sw $a1,12($a0) + sw $a1,8($a0) + sw $a1,4($a0) + sw $a1,0($a0) + + beqz $a2,9f + addu $a0,$t1 + sh $a1,($a0) + +9: j $t8 + nop + .end android_memset16_test + .size android_memset16_test,.-android_memset16_test + + /* + * Optimized memset32 for MIPS + * + * void android_memset32_test(uint32_t* dst, uint32_t value, size_t size); + * + */ + .global android_memset32_test + .type android_memset32_test, @function +android_memset32_test: + .ent android_memset32_test + .set noreorder + + /* Check parameters */ +DBG andi $t0,$a0,3 /* $a0 must be word aligned */ +DBG tne $t0 +DBG andi $t2,$a2,3 /* $a2 must be a multiple of 4 bytes */ +DBG tne $t2 + + b .Laligned32 + move $t8,$ra + .end android_memset32_test + .size android_memset32_test,.-android_memset32_test diff --git a/libcutils/tests/memset_mips/memset_cmips.S b/libcutils/tests/memset_mips/memset_cmips.S new file mode 100644 index 000000000..f8f3a9169 --- /dev/null +++ b/libcutils/tests/memset_mips/memset_cmips.S @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2009 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/************************************************************************ + * + * memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops + * Version: "043009" + * + ************************************************************************/ + + +/************************************************************************ + * Include files + ************************************************************************/ + +#include "machine/asm.h" + +/* + * This routine could be optimized for MIPS64. The current code only + * uses MIPS32 instructions. + */ + +#if defined(__MIPSEB__) +# define SWHI swl /* high part is left in big-endian */ +#endif + +#if defined(__MIPSEL__) +# define SWHI swr /* high part is right in little-endian */ +#endif + +#if !(defined(XGPROF) || defined(XPROF)) +#undef SETUP_GP +#define SETUP_GP +#endif + +LEAF(memset_cmips,0) + + .set noreorder + .set noat + + addu t0,a0,a2 # t0 is the "past the end" address + slti AT,a2,4 # is a2 less than 4? + bne AT,zero,.Llast4 # if yes, go to last4 + move v0,a0 # memset returns the dst pointer + + beq a1,zero,.Lset0 + subu v1,zero,a0 + + # smear byte into 32 bit word +#if (__mips==32) && (__mips_isa_rev>=2) + ins a1, a1, 8, 8 # Replicate fill byte into half-word. + ins a1, a1, 16, 16 # Replicate fill byte into word. +#else + and a1,0xff + sll AT,a1,8 + or a1,AT + sll AT,a1,16 + or a1,AT +#endif + +.Lset0: andi v1,v1,0x3 # word-unaligned address? + beq v1,zero,.Laligned # v1 is the unalignment count + subu a2,a2,v1 + SWHI a1,0(a0) + addu a0,a0,v1 + +# Here we have the "word-aligned" a0 (until the "last4") +.Laligned: + andi t8,a2,0x3f # any 64-byte chunks? + # t8 is the byte count past 64-byte chunks + beq a2,t8,.Lchk8w # when a2==t8, no 64-byte chunks + # There will be at most 1 32-byte chunk then + subu a3,a2,t8 # subtract from a2 the reminder + # Here a3 counts bytes in 16w chunks + addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks + +# Find out, if there are any 64-byte chunks after which will be still at least +# 96 bytes left. The value "96" is calculated as needed buffer for +# "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after +# incrementing "a0" by 64. +# For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk. +# + sltiu v1,a2,160 + bgtz v1,.Lloop16w_nopref30 # skip "pref 30,0(a0)" + subu t7,a2,96 # subtract "pref 30 unsafe" region + # below we have at least 1 64-byte chunk which is "pref 30 safe" + andi t6,t7,0x3f # t6 is past "64-byte safe chunks" reminder + subu t5,t7,t6 # subtract from t7 the reminder + # Here t5 counts bytes in 16w "safe" chunks + addu t4,a0,t5 # Now t4 is the dst after 64-byte "safe" chunks + +# Don't use "pref 30,0(a0)" for a0 in a "middle" of a cache line +# pref 30,0(a0) +# Here we are in the region, where it is safe to use "pref 30,64(a0)" +.Lloop16w: + addiu a0,a0,64 + pref 30,-32(a0) # continue setting up the dest, addr 64-32 + sw a1,-64(a0) + sw a1,-60(a0) + sw a1,-56(a0) + sw a1,-52(a0) + sw a1,-48(a0) + sw a1,-44(a0) + sw a1,-40(a0) + sw a1,-36(a0) + nop + nop # the extra nop instructions help to balance + nop # cycles needed for "store" + "fill" + "evict" + nop # For 64byte store there are needed 8 fill + nop # and 8 evict cycles, i.e. at least 32 instr. 
+ nop + nop + pref 30,0(a0) # continue setting up the dest, addr 64-0 + sw a1,-32(a0) + sw a1,-28(a0) + sw a1,-24(a0) + sw a1,-20(a0) + sw a1,-16(a0) + sw a1,-12(a0) + sw a1,-8(a0) + sw a1,-4(a0) + nop + nop + nop + nop # NOTE: adding 14 nop-s instead of 12 nop-s + nop # gives better results for "fast" memory + nop + bne a0,t4,.Lloop16w + nop + + beq a0,a3,.Lchk8w # maybe no more 64-byte chunks? + nop # this "delayed slot" is useless ... + +.Lloop16w_nopref30: # there could be up to 3 "64-byte nopref30" chunks + addiu a0,a0,64 + sw a1,-64(a0) + sw a1,-60(a0) + sw a1,-56(a0) + sw a1,-52(a0) + sw a1,-48(a0) + sw a1,-44(a0) + sw a1,-40(a0) + sw a1,-36(a0) + sw a1,-32(a0) + sw a1,-28(a0) + sw a1,-24(a0) + sw a1,-20(a0) + sw a1,-16(a0) + sw a1,-12(a0) + sw a1,-8(a0) + bne a0,a3,.Lloop16w_nopref30 + sw a1,-4(a0) + +.Lchk8w: # t8 here is the byte count past 64-byte chunks + + andi t7,t8,0x1f # is there a 32-byte chunk? + # the t7 is the reminder count past 32-bytes + beq t8,t7,.Lchk1w # when t8==t7, no 32-byte chunk + move a2,t7 + + sw a1,0(a0) + sw a1,4(a0) + sw a1,8(a0) + sw a1,12(a0) + sw a1,16(a0) + sw a1,20(a0) + sw a1,24(a0) + sw a1,28(a0) + addiu a0,a0,32 + +.Lchk1w: + andi t8,a2,0x3 # now t8 is the reminder past 1w chunks + beq a2,t8,.Llast4 + subu a3,a2,t8 # a3 is the count of bytes in 1w chunks + addu a3,a0,a3 # now a3 is the dst address past the 1w chunks + +# copying in words (4-byte chunks) +.LwordCopy_loop: + addiu a0,a0,4 + bne a0,a3,.LwordCopy_loop + sw a1,-4(a0) + +.Llast4:beq a0,t0,.Llast4e +.Llast4l:addiu a0,a0,1 + bne a0,t0,.Llast4l + sb a1,-1(a0) + +.Llast4e: + j ra + nop + + .set at + .set reorder + +END(memset_cmips) + + +/************************************************************************ + * Implementation : Static functions + ************************************************************************/ + diff --git a/libcutils/tests/memset_mips/memset_omips.S b/libcutils/tests/memset_mips/memset_omips.S new file mode 100644 index 000000000..4c470010a --- /dev/null +++ b/libcutils/tests/memset_mips/memset_omips.S @@ -0,0 +1,90 @@ +/* Copyright (C) 2002, 2003 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Hartvig Ekner , 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* void *memset_omips(void *s, int c, size_t n). */ + +#include "machine/asm.h" + +#ifdef __mips64 +#error mips32 code being compiled for mips64! +#endif + +#if defined(__MIPSEB__) +#error big-endian is not supported in Broadcom MIPS Android platform +# define SWHI swl /* high part is left in big-endian */ +#else +# define SWHI swr /* high part is right in little-endian */ +#endif + +LEAF (memset_omips,0) + .set noreorder + + slti t1, a2, 8 # Less than 8? 
+        bne     t1, zero, .Llast8
+        move    v0, a0                  # Setup exit value before too late
+
+        beq     a1, zero, .Lueven       # If zero pattern, no need to extend
+        andi    a1, 0xff                # Avoid problems with bogus arguments
+        sll     t0, a1, 8
+        or      a1, t0
+        sll     t0, a1, 16
+        or      a1, t0                  # a1 is now pattern in full word
+
+.Lueven:
+        subu    t0, zero, a0            # Unaligned address?
+        andi    t0, 0x3
+        beq     t0, zero, .Lchkw
+        subu    a2, t0
+        SWHI    a1, 0(a0)               # Yes, handle first unaligned part
+        addu    a0, t0                  # Now both a0 and a2 are updated
+
+.Lchkw:
+        andi    t0, a2, 0x7             # Enough left for one loop iteration?
+        beq     t0, a2, .Lchkl
+        subu    a3, a2, t0
+        addu    a3, a0                  # a3 is last loop address +1
+        move    a2, t0                  # a2 is now # of bytes left after loop
+.Lloopw:
+        addiu   a0, 8                   # Handle 2 words per iteration
+        sw      a1, -8(a0)
+        bne     a0, a3, .Lloopw
+        sw      a1, -4(a0)
+
+.Lchkl:
+        andi    t0, a2, 0x4             # Check if there is at least a full
+        beq     t0, zero, .Llast8       #  word remaining after the loop
+        subu    a2, t0
+        sw      a1, 0(a0)               # Yes...
+        addiu   a0, 4
+
+.Llast8:
+        blez    a2, .Lexit              # Handle last 8 bytes (if cnt>0)
+        addu    a3, a2, a0              # a3 is last address +1
+.Llst8l:
+        addiu   a0, 1
+        bne     a0, a3, .Llst8l
+        sb      a1, -1(a0)
+.Lexit:
+        j       ra                      # Bye, bye
+        nop
+
+        .set    reorder
+END (memset_omips)
+
+
diff --git a/libcutils/tests/memset_mips/test_memset.c b/libcutils/tests/memset_mips/test_memset.c
new file mode 100644
index 000000000..9705c6563
--- /dev/null
+++ b/libcutils/tests/memset_mips/test_memset.c
@@ -0,0 +1,235 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cutils/memory.h>
+
+/*
+ * All systems must implement or emulate the rdhwr instruction to read
+ * the userlocal register. Systems that emulate also return the count register
+ * when accessing register $2 so this should work on most systems
+ */
+#define USE_RDHWR
+
+#ifdef USE_RDHWR
+#define UNITS "cycles"
+#define SCALE 2         /* Most CPUs */
+static inline uint32_t
+get_count(void)
+{
+    uint32_t res;
+    asm volatile (".set push; .set mips32r2; rdhwr %[res],$2; .set pop" : [res] "=r" (res) : : "memory");
+    return res;
+}
+#else
+#define UNITS "ns"
+#define SCALE 1
+static inline uint32_t
+get_count(void)
+{
+    struct timespec now;
+    uint32_t res;
+    clock_gettime(CLOCK_REALTIME, &now);
+    res = (uint32_t)(now.tv_sec * 1000000000LL + now.tv_nsec);
+    // printf ("now=%d.%09d res=%d\n", (int)now.tv_sec, (int)now.tv_nsec, res);
+    return res;
+}
+#endif
+
+uint32_t overhead;
+void
+measure_overhead(void)
+{
+    int i;
+    uint32_t start, stop, delta;
+    for (i = 0; i < 32; i++) {
+        start = get_count();
+        stop = get_count();
+        delta = stop - start;
+        if (overhead == 0 || delta < overhead)
+            overhead = delta;
+    }
+    printf("overhead is %d"UNITS"\n", overhead);
+}
+
+uint32_t
+timeone(void (*fn)(), void *d, uint32_t val, uint32_t bytes)
+{
+    uint32_t start, stop, delta;
+    start = get_count();
+    (*fn)(d, val, bytes);
+    stop = get_count();
+    delta = stop - start - overhead;
+    // printf ("start=0x%08x stop=0x%08x delta=0x%08x\n", start, stop, delta);
+    return delta * SCALE;
+}
+
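For illustration only, not part of this patch: a minimal sketch of how measure_overhead() and timeone() above can be used for a one-off measurement. The EXAMPLE_ONLY guard, the buffer name and the helper name are hypothetical.

#ifdef EXAMPLE_ONLY                     /* hypothetical guard, never defined by the build */
static char example_buf[4096];          /* illustrative buffer, not the test arena */

static void example_measurement(void)
{
    uint32_t units;

    measure_overhead();                 /* calibrate the cost of two back-to-back counter reads */
    units = timeone((void (*)())memset, example_buf, 0, sizeof(example_buf));
    printf("memset of %u bytes: %u " UNITS " (~%g bytes/" UNITS ")\n",
           (unsigned)sizeof(example_buf), units,
           (double)sizeof(example_buf) / (double)units);
}
#endif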
+/* define VERIFY to check that memset only touches the bytes it's supposed to */
+/*#define VERIFY*/
+
+/*
+ * Using a big arena means that memset will most likely miss in the cache
+ * NB Enabling verification effectively warms up the cache...
+ */
+#define ARENASIZE 0x1000000
+#ifdef VERIFY
+char arena[ARENASIZE+8];        /* Allow space for guard words */
+#else
+char arena[ARENASIZE];
+#endif
+
+void
+testone(char *tag, void (*fn)(), int trials, int minbytes, int maxbytes, int size, int threshold)
+{
+    int offset;
+    void *d;
+    void *p;
+    uint32_t v, notv = 0;
+    uint32_t n;
+    int i, units;
+    int totalunits = 0, totalbytes = 0, samples = 0;
+
+    /* Reset RNG to ensure each test uses same random values */
+    srand(0);   /* FIXME should be able to use some other seed than 0 */
+
+    for (i = 0; i < trials; i++) {
+        n = minbytes + (rand() % (maxbytes-minbytes));  /* How many bytes to do */
+        offset = ((rand() % (ARENASIZE-n)));            /* Where to start */
+
+#ifdef VERIFY
+        offset += 4;    /* Allow space for guard word at beginning */
+#endif
+        v = rand();
+
+        /* Adjust alignment and sizes based on transfer size */
+        switch (size) {
+        case 1:
+            v &= 0xff;
+            notv = ~v & 0xff;
+            break;
+        case 2:
+            v &= 0xffff;
+            notv = ~v & 0xffff;
+            offset &= ~1;
+            n &= ~1;
+            break;
+        case 4:
+            notv = ~v;
+            offset &= ~3;
+            n &= ~3;
+            break;
+        }
+
+        d = &arena[offset];
+
+#ifdef VERIFY
+        /* Initialise the area and guard words */
+        for (p = &arena[offset-4]; p < (void *)&arena[offset+n+4]; p = (void *)((uint32_t)p + size)) {
+            if (size == 1)
+                *(uint8_t *)p = notv;
+            else if (size == 2)
+                *(uint16_t *)p = notv;
+            else if (size == 4)
+                *(uint32_t *)p = notv;
+        }
+#endif
+        units = timeone(fn, d, v, n);
+#ifdef VERIFY
+        /* Check the area and guard words */
+        for (p = &arena[offset-4]; p < (void *)&arena[offset+n+4]; p = (void *)((uint32_t)p + size)) {
+            uint32_t got = 0;
+            if (size == 1)
+                got = *(uint8_t *)p;
+            else if (size == 2)
+                got = *(uint16_t *)p;
+            else if (size == 4)
+                got = *(uint32_t *)p;
+            if (p < (void *)&arena[offset]) {
+                if (got != notv)
+                    printf ("%s: verify failure: preguard:%p d=%p v=%08x got=%08x n=%d\n", tag, p, d, v, got, n);
+            }
+            else if (p < (void *)&arena[offset+n]) {
+                if (got != v)
+                    printf ("%s: verify failure: arena:%p d=%p v=%08x got=%08x n=%d\n", tag, p, d, v, got, n);
+            }
+            else {
+                if (got != notv)
+                    printf ("%s: verify failure: postguard:%p d=%p v=%08x got=%08x n=%d\n", tag, p, d, v, got, n);
+            }
+        }
+#endif
+
+        /* If the cycle count looks reasonable include it in the statistics */
+        if (units < threshold) {
+            totalbytes += n;
+            totalunits += units;
+            samples++;
+        }
+    }
+
+    printf("%s: samples=%d avglen=%d avg" UNITS "=%d bp"UNITS"=%g\n",
+           tag, samples, totalbytes/samples, totalunits/samples, (double)totalbytes/(double)totalunits);
+}
+
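For reference, this is the arena layout that the VERIFY passes in testone() above initialise and then check, described here as a comment only:

/*
 * VERIFY layout around one trial (offsets in bytes):
 *
 *   arena[offset-4 .. offset-1]     pre-guard, set to notv, must still be notv
 *   arena[offset   .. offset+n-1]   region passed to the routine under test,
 *                                   must end up holding v
 *   arena[offset+n .. offset+n+3]   post-guard, set to notv, must still be notv
 *
 * Both the initialise and check loops step through this range in units of
 * `size' (1, 2 or 4 bytes).
 */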
+extern void android_memset32_dumb(uint32_t* dst, uint32_t value, size_t size);
+extern void android_memset16_dumb(uint32_t* dst, uint16_t value, size_t size);
+extern void android_memset32_test(uint32_t* dst, uint32_t value, size_t size);
+extern void android_memset16_test(uint32_t* dst, uint16_t value, size_t size);
+extern void memset_cmips(void* dst, int value, size_t size);
+extern void memset_omips(void* dst, int value, size_t size);
+
+int
+main(int argc, char **argv)
+{
+    int i;
+    struct {
+        char *type;
+        int trials;
+        int minbytes, maxbytes;
+    } *pp, params[] = {
+        {"small",  10000,   0,   64},
+        {"medium", 10000,  64,  512},
+        {"large",  10000, 512, 1280},
+        {"varied", 10000,   0, 1280},
+    };
+#define NPARAMS (sizeof(params)/sizeof(params[0]))
+    struct {
+        char *name;
+        void (*fn)();
+        int size;
+    } *fp, functions[] = {
+        {"dmemset16", (void (*)())android_memset16_dumb, 2},
+        {"tmemset16", (void (*)())android_memset16_test, 2},
+        {"lmemset16", (void (*)())android_memset16, 2},
+
+        {"dmemset32", (void (*)())android_memset32_dumb, 4},
+        {"tmemset32", (void (*)())android_memset32_test, 4},
+        {"lmemset32", (void (*)())android_memset32, 4},
+
+        {"cmemset", (void (*)())memset_cmips, 1},
+        {"omemset", (void (*)())memset_omips, 1},
+        {"lmemset", (void (*)())memset, 1},
+    };
+#define NFUNCTIONS (sizeof(functions)/sizeof(functions[0]))
+    char tag[40];
+    int threshold;
+
+    measure_overhead();
+
+    /* Warm up the page cache */
+    memset(arena, 0xff, ARENASIZE);     /* use 0xff now to avoid COW later */
+
+    for (fp = functions; fp < &functions[NFUNCTIONS]; fp++) {
+        (fp->fn)(arena, 0xffffffff, ARENASIZE); /* one call to get the code into Icache */
+        for (pp = params; pp < &params[NPARAMS]; pp++) {
+            sprintf(tag, "%10s: %7s %4d-%4d", fp->name, pp->type, pp->minbytes, pp->maxbytes);
+
+            /* Set the cycle threshold */
+            threshold = pp->maxbytes * 4 * 10;  /* reasonable for cycles and ns */
+            testone(tag, fp->fn, pp->trials, pp->minbytes, pp->maxbytes, fp->size, threshold);
+        }
+        printf ("\n");
+    }
+
+    return 0;
+}
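For illustration only, not part of this patch: the android_memset16/android_memset32 contract that the `_dumb' reference assembly at the top of the patch implements, and that the benchmarked routines are compared against, takes the length in bytes rather than in elements (the asm converts with `srl $a2,1' and `srl $a2,2') and assumes a halfword-/word-aligned destination. A minimal C sketch of that behaviour follows; the `_ref' function names are hypothetical.

#include <stdint.h>
#include <stddef.h>

/* Illustrative C equivalents of the `dumb' reference routines;
 * len is a byte count, dst is assumed suitably aligned. */
void android_memset16_ref(uint16_t *dst, uint16_t value, size_t len)
{
    size_t n = len >> 1;        /* bytes -> halfwords, mirrors `srl $a2,1' */

    while (n--)
        *dst++ = value;
}

void android_memset32_ref(uint32_t *dst, uint32_t value, size_t len)
{
    size_t n = len >> 2;        /* bytes -> words, mirrors `srl $a2,2' */

    while (n--)
        *dst++ = value;
}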