Arm平台下各种memcpy优化对比<三>
温馨提示:这篇文章已超过397天没有更新,请注意相关的内容是否还可用!
因 memcpy 导致 TDA4VM 上的 H.264 解码占用 CPU 较高而弃用,于是从网上找了各种 memcpy 的优化代码,放在一起做了个运行速度对比,请查收:
#include <stdio.h>
#include <stdlib.h> /* rand, srand */
#include <string.h>
#include <assert.h>
#include <sys/time.h>
#include <time.h> /* time() */
#include <stdint.h> /* uint32_t, uint64_t, uintptr_t */
/*
 * Copy num bytes from src to dst, moving one 32-bit word at a time where
 * possible instead of byte by byte.
 *
 * The regions must not overlap (same contract as memcpy). Both pointers
 * must be non-NULL; this is enforced with assert().
 *
 * Word-sized accesses are used only when src and dst share the same offset
 * within a word, so that both can be brought to a word boundary together.
 * Otherwise the whole range is copied byte by byte.
 *
 * NOTE(review): the word loop reads and writes through uint32_t pointers,
 * which formally depends on the effective type of the caller's buffers
 * (strict aliasing) -- confirm build flags if used outside this benchmark.
 *
 * Returns dst.
 */
void *memcpy_word_length(void *dst, const void *src, size_t num)
{
    assert((dst != NULL) && (src != NULL));

    unsigned char *out = dst;
    const unsigned char *in = src;
    const size_t word = sizeof(uint32_t);

    if ((((uintptr_t)out ^ (uintptr_t)in) & (word - 1)) == 0) {
        /* Byte-copy the head until both pointers sit on a word boundary. */
        while (num > 0 && ((uintptr_t)out & (word - 1)) != 0) {
            *out++ = *in++;
            num--;
        }

        uint32_t *wout = (uint32_t *)(void *)out;
        const uint32_t *win = (const uint32_t *)(const void *)in;
        size_t words = num / word; /* size_t: no truncation on huge copies */

        num %= word;
        while (words--) {
            *wout++ = *win++;
        }

        /* Resume byte-wise exactly where the word loop stopped. */
        out = (unsigned char *)wout;
        in = (const unsigned char *)win;
    }

    /* Tail bytes, or the entire range when the alignments differ. */
    while (num--) {
        *out++ = *in++;
    }
    return dst;
}
/*
 * Copy len bytes from src to dst, handling overlapping regions (memmove
 * semantics) and moving 8 bytes at a time where alignment allows.
 *
 * The copy direction follows the relative position of the two buffers:
 * front to back when dst lies below src, back to front when dst lies above
 * src, so no source byte is overwritten before it has been read.
 *
 * 8-byte accesses are used only when src and dst share the same offset
 * within an 8-byte word; otherwise the range is copied byte by byte.
 * Data is moved as uint64_t rather than double so that every bit pattern
 * is treated as plain bytes.
 *
 * Returns dst. If either pointer is NULL, nothing is copied.
 */
void *memmove_address_overlap(void *dst, const void *src, size_t len)
{
    if (!src || !dst) return dst;

    unsigned char *d = dst;
    const unsigned char *s = src;
    const size_t chunk = sizeof(uint64_t);
    const uintptr_t mask = (uintptr_t)chunk - 1;
    const int same_phase = ((((uintptr_t)d ^ (uintptr_t)s) & mask) == 0);

    if (d == s || len == 0) return dst;

    if ((uintptr_t)d < (uintptr_t)s) {
        /* Copy front to back. */
        if (same_phase) {
            /* Head bytes up to the first 8-byte boundary. */
            while (len > 0 && ((uintptr_t)d & mask) != 0) {
                *d++ = *s++;
                len--;
            }
            while (len >= chunk) {
                *(uint64_t *)(void *)d = *(const uint64_t *)(const void *)s;
                d += chunk;
                s += chunk;
                len -= chunk;
            }
        }
        while (len--) {
            *d++ = *s++;
        }
    } else {
        /* Copy back to front, starting one past the last byte. */
        d += len;
        s += len;
        if (same_phase) {
            /* Trailing bytes down to the last 8-byte boundary. */
            while (len > 0 && ((uintptr_t)d & mask) != 0) {
                *--d = *--s;
                len--;
            }
            while (len >= chunk) {
                d -= chunk;
                s -= chunk;
                *(uint64_t *)(void *)d = *(const uint64_t *)(const void *)s;
                len -= chunk;
            }
        }
        while (len--) {
            *--d = *--s;
        }
    }
    return dst;
}
static void get_rand_bytes(unsigned char *data, int len)
{
int i;
srand((unsigned)time(NULL)); //种下随机种子
for (i = 0; i
#if 0
copy 10485760 bytes, memcpy waste time 7324us
copy 10485760 bytes, memcpy_word_length waste time 12940us
copy 10485760 bytes, memmove_address_overlap waste time 14450us
copy 1048576 bytes, memcpy waste time 704us
copy 1048576 bytes, memcpy_word_length waste time 1313us
copy 1048576 bytes, memmove_address_overlap waste time 1220us
copy 102400 bytes, memcpy waste time 81us
copy 102400 bytes, memcpy_word_length waste time 96us
copy 102400 bytes, memmove_address_overlap waste time 65us
copy 10240 bytes, memcpy waste time 2us
copy 10240 bytes, memcpy_word_length waste time 14us
copy 10240 bytes, memmove_address_overlap waste time 6us
copy 1024 bytes, memcpy waste time 0us
copy 1024 bytes, memcpy_word_length waste time 1us
copy 1024 bytes, memmove_address_overlap waste time 1us
#endif
