diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index ff295760cf..560d57594b 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -1656,8 +1656,12 @@ typedef struct AVCodecContext { #define FF_MM_MMX2 0x0002 ///< SSE integer functions or AMD MMX ext #define FF_MM_SSE 0x0008 ///< SSE functions #define FF_MM_SSE2 0x0010 ///< PIV SSE2 functions +#define FF_MM_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster + ///< than regular MMX/SSE (e.g. Core1) #define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt #define FF_MM_SSE3 0x0040 ///< Prescott SSE3 functions +#define FF_MM_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster + ///< than regular MMX/SSE (e.g. Core1) #define FF_MM_SSSE3 0x0080 ///< Conroe SSSE3 functions #define FF_MM_SSE4 0x0100 ///< Penryn SSE4.1 functions #define FF_MM_SSE42 0x0200 ///< Nehalem SSE4.2 functions diff --git a/libavcodec/x86/cpuid.c b/libavcodec/x86/cpuid.c index 1ed4d2e7e3..f9afd6e729 100644 --- a/libavcodec/x86/cpuid.c +++ b/libavcodec/x86/cpuid.c @@ -42,6 +42,8 @@ int mm_support(void) int rval = 0; int eax, ebx, ecx, edx; int max_std_level, max_ext_level, std_caps=0, ext_caps=0; + int family=0, model=0; + union { int i[3]; char c[12]; } vendor; #if ARCH_X86_32 x86_reg a, c; @@ -70,10 +72,12 @@ int mm_support(void) return 0; /* CPUID not supported */ #endif - cpuid(0, max_std_level, ebx, ecx, edx); + cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]); if(max_std_level >= 1){ cpuid(1, eax, ebx, ecx, std_caps); + family = ((eax>>8)&0xf) + ((eax>>20)&0xff); + model = ((eax>>4)&0xf) + ((eax>>12)&0xf0); if (std_caps & (1<<23)) rval |= FF_MM_MMX; if (std_caps & (1<<25)) @@ -108,13 +112,24 @@ int mm_support(void) rval |= FF_MM_MMX2; } + if (!strncmp(vendor.c, "GenuineIntel", 12) && + family == 6 && (model == 9 || model == 13 || model == 14)) { + /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") + * theoretically support sse2, but it's usually slower than mmx, + * so let's just pretend they don't. */ + if (rval & FF_MM_SSE2) rval ^= FF_MM_SSE2SLOW|FF_MM_SSE2; + if (rval & FF_MM_SSE3) rval ^= FF_MM_SSE3SLOW|FF_MM_SSE3; + } + #if 0 - av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n", + av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s%s%s\n", (rval&FF_MM_MMX) ? "MMX ":"", (rval&FF_MM_MMX2) ? "MMX2 ":"", (rval&FF_MM_SSE) ? "SSE ":"", (rval&FF_MM_SSE2) ? "SSE2 ":"", + (rval&FF_MM_SSE2SLOW) ? "SSE2(slow) ":"", (rval&FF_MM_SSE3) ? "SSE3 ":"", + (rval&FF_MM_SSE3SLOW) ? "SSE3(slow) ":"", (rval&FF_MM_SSSE3) ? "SSSE3 ":"", (rval&FF_MM_SSE4) ? "SSE4.1 ":"", (rval&FF_MM_SSE42) ? "SSE4.2 ":"", diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index f491111ca5..d3e412a3dc 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -1409,9 +1409,10 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) c->sum_abs_dctelem= sum_abs_dctelem_sse2; c->hadamard8_diff[0]= hadamard8_diff16_sse2; c->hadamard8_diff[1]= hadamard8_diff_sse2; -#if CONFIG_LPC + } + + if (CONFIG_LPC && mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) { c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2; -#endif } #if HAVE_SSSE3 diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c index c7b02d1541..f8de2d272f 100644 --- a/libavcodec/x86/vp8dsp-init.c +++ b/libavcodec/x86/vp8dsp-init.c @@ -328,7 +328,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } - if (mm_flags & FF_MM_SSE2) { + if (mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) { VP8_LUMA_MC_FUNC(0, 16, sse2); VP8_MC_FUNC(1, 8, sse2); VP8_BILINEAR_MC_FUNC(0, 16, sse2); @@ -338,8 +338,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; - c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; + } + + if (mm_flags & FF_MM_SSE2) { + c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; }