From 1254022ea717fba6f189d6a66841e0ba204ed80a Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 22 Feb 2012 14:51:37 -0800
Subject: [PATCH 01/40] swscale: fix filtersize clipping.

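If srcW <= 2, av_clip(x, 1, srcW - 2) can still yield a filterSize < 1,
because the upper bound (srcW - 2) is then below the lower bound (1).
Apply the upper bound first and the lower bound last instead.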
---
 libswscale/utils.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index 2fe9c5b47f..6ae8af64e9 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -263,7 +263,8 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
         if (xInc <= 1<<16)      filterSize= 1 + sizeFactor; // upscale
         else                    filterSize= 1 + (sizeFactor*srcW + dstW - 1)/ dstW;
 
-        filterSize = av_clip(filterSize, 1, srcW - 2);
+        filterSize = FFMIN(filterSize, srcW - 2);
+        filterSize = FFMAX(filterSize, 1);
 
         FF_ALLOC_OR_GOTO(NULL, filter, dstW*sizeof(*filter)*filterSize, fail);
 

From 3798205a77ce275613098ecb48645e6029811f14 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 12 Feb 2012 15:06:58 -0500
Subject: [PATCH 02/40] mov: set channel layout for AC-3 streams based on the
 'dac3' atom info

fixes Bug 225
---
 libavcodec/Makefile     | 2 +-
 libavcodec/ac3_parser.c | 2 +-
 libavcodec/ac3dec.c     | 2 +-
 libavcodec/ac3tab.c     | 2 +-
 libavcodec/ac3tab.h     | 2 +-
 libavformat/mov.c       | 5 +++++
 6 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 93ff7d4a67..a98ff5b52a 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -556,7 +556,7 @@ OBJS-$(CONFIG_MATROSKA_MUXER)          += xiph.o mpeg4audio.o \
                                           flacdec.o flacdata.o flac.o \
                                           mpegaudiodata.o
 OBJS-$(CONFIG_MP3_MUXER)               += mpegaudiodata.o mpegaudiodecheader.o
-OBJS-$(CONFIG_MOV_DEMUXER)             += mpeg4audio.o mpegaudiodata.o
+OBJS-$(CONFIG_MOV_DEMUXER)             += mpeg4audio.o mpegaudiodata.o ac3tab.o
 OBJS-$(CONFIG_MOV_MUXER)               += mpeg4audio.o mpegaudiodata.o
 OBJS-$(CONFIG_MPEGTS_MUXER)            += mpegvideo.o mpeg4audio.o
 OBJS-$(CONFIG_MPEGTS_DEMUXER)          += mpeg4audio.o mpegaudiodata.o
diff --git a/libavcodec/ac3_parser.c b/libavcodec/ac3_parser.c
index 067b4f9879..d9ba1fd70b 100644
--- a/libavcodec/ac3_parser.c
+++ b/libavcodec/ac3_parser.c
@@ -134,7 +134,7 @@ int avpriv_ac3_parse_header(GetBitContext *gbc, AC3HeaderInfo *hdr)
                         (hdr->num_blocks * 256.0));
         hdr->channels = ff_ac3_channels_tab[hdr->channel_mode] + hdr->lfe_on;
     }
-    hdr->channel_layout = ff_ac3_channel_layout_tab[hdr->channel_mode];
+    hdr->channel_layout = avpriv_ac3_channel_layout_tab[hdr->channel_mode];
     if (hdr->lfe_on)
         hdr->channel_layout |= AV_CH_LOW_FREQUENCY;
 
diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index c0d7488d84..a8bc48ab6e 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -1378,7 +1378,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data,
                 avctx->request_channels < s->channels) {
             s->out_channels = avctx->request_channels;
             s->output_mode  = avctx->request_channels == 1 ? AC3_CHMODE_MONO : AC3_CHMODE_STEREO;
-            s->channel_layout = ff_ac3_channel_layout_tab[s->output_mode];
+            s->channel_layout = avpriv_ac3_channel_layout_tab[s->output_mode];
         }
         avctx->channels       = s->out_channels;
         avctx->channel_layout = s->channel_layout;
diff --git a/libavcodec/ac3tab.c b/libavcodec/ac3tab.c
index 7df3d828fb..951a1014ce 100644
--- a/libavcodec/ac3tab.c
+++ b/libavcodec/ac3tab.c
@@ -84,7 +84,7 @@ const uint8_t ff_ac3_channels_tab[8] = {
 /**
  * Map audio coding mode (acmod) to channel layout mask.
  */
-const uint16_t ff_ac3_channel_layout_tab[8] = {
+const uint16_t avpriv_ac3_channel_layout_tab[8] = {
     AV_CH_LAYOUT_STEREO,
     AV_CH_LAYOUT_MONO,
     AV_CH_LAYOUT_STEREO,
diff --git a/libavcodec/ac3tab.h b/libavcodec/ac3tab.h
index e5cd368bb7..8ed50520e6 100644
--- a/libavcodec/ac3tab.h
+++ b/libavcodec/ac3tab.h
@@ -33,7 +33,7 @@
 
 extern const uint16_t ff_ac3_frame_size_tab[38][3];
 extern const uint8_t  ff_ac3_channels_tab[8];
-extern const uint16_t ff_ac3_channel_layout_tab[8];
+extern const uint16_t avpriv_ac3_channel_layout_tab[8];
 extern const uint8_t  ff_ac3_enc_channel_map[8][2][6];
 extern const uint8_t  ff_ac3_dec_channel_map[8][2][6];
 extern const uint16_t ff_ac3_sample_rate_tab[3];
diff --git a/libavformat/mov.c b/libavformat/mov.c
index c2f13b6316..b11e0bfef7 100644
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -25,11 +25,13 @@
 //#define DEBUG
 //#define MOV_EXPORT_ALL_METADATA
 
+#include "libavutil/audioconvert.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/intfloat.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/avstring.h"
 #include "libavutil/dict.h"
+#include "libavcodec/ac3tab.h"
 #include "avformat.h"
 #include "internal.h"
 #include "avio_internal.h"
@@ -552,6 +554,9 @@ static int mov_read_dac3(MOVContext *c, AVIOContext *pb, MOVAtom atom)
     acmod = (ac3info >> 11) & 0x7;
     lfeon = (ac3info >> 10) & 0x1;
     st->codec->channels = ((int[]){2,1,2,3,3,4,4,5})[acmod] + lfeon;
+    st->codec->channel_layout = avpriv_ac3_channel_layout_tab[acmod];
+    if (lfeon)
+        st->codec->channel_layout |= AV_CH_LOW_FREQUENCY;
     st->codec->audio_service_type = bsmod;
     if (st->codec->channels > 1 && bsmod == 0x7)
         st->codec->audio_service_type = AV_AUDIO_SERVICE_TYPE_KARAOKE;

From 62d5f9e5cab45df0505862e5ede2bae55d1881aa Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Thu, 9 Feb 2012 22:44:10 -0500
Subject: [PATCH 03/40] flacdec: set channel_layout based on channel count

Channel layouts are specified in the FLAC format description at
http://flac.sourceforge.net/format.html

fixes Bug 209
---
 libavcodec/flacdec.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/libavcodec/flacdec.c b/libavcodec/flacdec.c
index 7454d8b7f7..ad3827b6cc 100644
--- a/libavcodec/flacdec.c
+++ b/libavcodec/flacdec.c
@@ -33,6 +33,7 @@
 
 #include <limits.h>
 
+#include "libavutil/audioconvert.h"
 #include "libavutil/crc.h"
 #include "avcodec.h"
 #include "internal.h"
@@ -62,6 +63,15 @@ typedef struct FLACContext {
     int32_t *decoded[FLAC_MAX_CHANNELS];    ///< decoded samples
 } FLACContext;
 
+static const int64_t flac_channel_layouts[6] = {
+    AV_CH_LAYOUT_MONO,
+    AV_CH_LAYOUT_STEREO,
+    AV_CH_LAYOUT_SURROUND,
+    AV_CH_LAYOUT_QUAD,
+    AV_CH_LAYOUT_5POINT0,
+    AV_CH_LAYOUT_5POINT1
+};
+
 static void allocate_buffers(FLACContext *s);
 
 int avpriv_flac_is_extradata_valid(AVCodecContext *avctx,
@@ -120,6 +130,9 @@ static av_cold int flac_decode_init(AVCodecContext *avctx)
     avcodec_get_frame_defaults(&s->frame);
     avctx->coded_frame = &s->frame;
 
+    if (avctx->channels <= FF_ARRAY_ELEMS(flac_channel_layouts))
+        avctx->channel_layout = flac_channel_layouts[avctx->channels - 1];
+
     return 0;
 }
 

From 21b46747ad0c72e9c60740b75ec20a7a052126d5 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Sun, 12 Feb 2012 09:32:40 +0100
Subject: [PATCH 04/40] qtrleenc: switch to encode2().

---
 libavcodec/qtrleenc.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/libavcodec/qtrleenc.c b/libavcodec/qtrleenc.c
index 3ffce2bc7f..6936722df7 100644
--- a/libavcodec/qtrleenc.c
+++ b/libavcodec/qtrleenc.c
@@ -25,6 +25,7 @@
 #include "libavutil/imgutils.h"
 #include "avcodec.h"
 #include "bytestream.h"
+#include "internal.h"
 
 /** Maximum RLE code for bulk copy */
 #define MAX_RLE_BULK   127
@@ -96,7 +97,7 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
         return -1;
     }
 
-    s->max_buf_size = s->avctx->width*s->avctx->height*s->pixel_size /* image base material */
+    s->max_buf_size = s->avctx->width*s->avctx->height*s->pixel_size*2 /* image base material */
                       + 15                                           /* header + footer */
                       + s->avctx->height*2                           /* skip code+rle end */
                       + s->avctx->width/MAX_RLE_BULK + 1             /* rle codes */;
@@ -107,7 +108,7 @@ static av_cold int qtrle_encode_init(AVCodecContext *avctx)
 /**
  * Compute the best RLE sequence for a line
  */
-static void qtrle_encode_line(QtrleEncContext *s, AVFrame *p, int line, uint8_t **buf)
+static void qtrle_encode_line(QtrleEncContext *s, const AVFrame *p, int line, uint8_t **buf)
 {
     int width=s->avctx->width;
     int i;
@@ -237,7 +238,7 @@ static void qtrle_encode_line(QtrleEncContext *s, AVFrame *p, int line, uint8_t
 }
 
 /** Encode frame including header */
-static int encode_frame(QtrleEncContext *s, AVFrame *p, uint8_t *buf)
+static int encode_frame(QtrleEncContext *s, const AVFrame *p, uint8_t *buf)
 {
     int i;
     int start_line = 0;
@@ -278,19 +279,19 @@ static int encode_frame(QtrleEncContext *s, AVFrame *p, uint8_t *buf)
     return buf - orig_buf;
 }
 
-static int qtrle_encode_frame(AVCodecContext *avctx, uint8_t *buf, int buf_size, void *data)
+static int qtrle_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                              const AVFrame *pict, int *got_packet)
 {
     QtrleEncContext * const s = avctx->priv_data;
-    AVFrame *pict = data;
     AVFrame * const p = &s->frame;
-    int chunksize;
+    int ret;
 
     *p = *pict;
 
-    if (buf_size < s->max_buf_size) {
+    if ((ret = ff_alloc_packet(pkt, s->max_buf_size)) < 0) {
         /* Upper bound check for compressed data */
-        av_log(avctx, AV_LOG_ERROR, "buf_size %d <  %d\n", buf_size, s->max_buf_size);
-        return -1;
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", s->max_buf_size);
+        return ret;
     }
 
     if (avctx->gop_size == 0 || (s->avctx->frame_number % avctx->gop_size) == 0) {
@@ -303,11 +304,16 @@ static int qtrle_encode_frame(AVCodecContext *avctx, uint8_t *buf, int buf_size,
         p->key_frame = 0;
     }
 
-    chunksize = encode_frame(s, pict, buf);
+    pkt->size = encode_frame(s, pict, pkt->data);
 
     /* save the current frame */
     av_picture_copy(&s->previous_frame, (AVPicture *)p, avctx->pix_fmt, avctx->width, avctx->height);
-    return chunksize;
+
+    if (p->key_frame)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
 }
 
 static av_cold int qtrle_encode_end(AVCodecContext *avctx)
@@ -327,7 +333,7 @@ AVCodec ff_qtrle_encoder = {
     .id             = CODEC_ID_QTRLE,
     .priv_data_size = sizeof(QtrleEncContext),
     .init           = qtrle_encode_init,
-    .encode         = qtrle_encode_frame,
+    .encode2        = qtrle_encode_frame,
     .close          = qtrle_encode_end,
     .pix_fmts = (const enum PixelFormat[]){PIX_FMT_RGB24, PIX_FMT_RGB555BE, PIX_FMT_ARGB, PIX_FMT_NONE},
     .long_name = NULL_IF_CONFIG_SMALL("QuickTime Animation (RLE) video"),

From 148fc9950622bb28ebb2d4a4ada7f316fc090f5e Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Mon, 20 Feb 2012 13:21:58 +0100
Subject: [PATCH 05/40] lclenc: switch to encode2().

---
 libavcodec/lclenc.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/libavcodec/lclenc.c b/libavcodec/lclenc.c
index 4c902d5f17..eacd3228f5 100644
--- a/libavcodec/lclenc.c
+++ b/libavcodec/lclenc.c
@@ -68,12 +68,20 @@ typedef struct LclEncContext {
  * Encode a frame
  *
  */
-static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
     LclEncContext *c = avctx->priv_data;
-    AVFrame *pict = data;
     AVFrame * const p = &c->pic;
-    int i;
+    int i, ret;
     int zret; // Zlib return code
+    int max_size = deflateBound(&c->zstream, avctx->width * avctx->height * 3);
+
+    if (!pkt->data &&
+        (ret = av_new_packet(pkt, max_size)) < 0) {
+            av_log(avctx, AV_LOG_ERROR, "Error allocating packet of size %d.\n", max_size);
+            return ret;
+    }
 
     *p = *pict;
     p->pict_type= AV_PICTURE_TYPE_I;
@@ -89,8 +97,8 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
         av_log(avctx, AV_LOG_ERROR, "Deflate reset error: %d\n", zret);
         return -1;
     }
-    c->zstream.next_out = buf;
-    c->zstream.avail_out = buf_size;
+    c->zstream.next_out  = pkt->data;
+    c->zstream.avail_out = pkt->size;
 
     for(i = avctx->height - 1; i >= 0; i--) {
         c->zstream.next_in = p->data[0]+p->linesize[0]*i;
@@ -107,7 +115,11 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
         return -1;
     }
 
-    return c->zstream.total_out;
+    pkt->size   = c->zstream.total_out;
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
 }
 
 /*
@@ -176,7 +188,7 @@ AVCodec ff_zlib_encoder = {
     .id             = CODEC_ID_ZLIB,
     .priv_data_size = sizeof(LclEncContext),
     .init           = encode_init,
-    .encode         = encode_frame,
+    .encode2        = encode_frame,
     .close          = encode_end,
     .pix_fmts = (const enum PixelFormat[]) { PIX_FMT_BGR24, PIX_FMT_NONE },
     .long_name = NULL_IF_CONFIG_SMALL("LCL (LossLess Codec Library) ZLIB"),

From 6d9c27dc85844adb37504fb43c78c12631ab7416 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Wed, 22 Feb 2012 07:38:13 +0100
Subject: [PATCH 06/40] jpeglsenc: switch to encode2().

---
 libavcodec/jpeglsenc.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/libavcodec/jpeglsenc.c b/libavcodec/jpeglsenc.c
index 2b6e54ddd4..f264c79ae7 100644
--- a/libavcodec/jpeglsenc.c
+++ b/libavcodec/jpeglsenc.c
@@ -28,6 +28,7 @@
 #include "avcodec.h"
 #include "get_bits.h"
 #include "golomb.h"
+#include "internal.h"
 #include "mathops.h"
 #include "dsputil.h"
 #include "mjpeg.h"
@@ -227,23 +228,19 @@ static void ls_store_lse(JLSState *state, PutBitContext *pb){
     put_bits(pb, 16, state->reset);
 }
 
-static int encode_picture_ls(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
+static int encode_picture_ls(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *pict, int *got_packet)
+{
     JpeglsContext * const s = avctx->priv_data;
-    AVFrame *pict = data;
     AVFrame * const p= (AVFrame*)&s->picture;
     const int near = avctx->prediction_method;
     PutBitContext pb, pb2;
     GetBitContext gb;
     uint8_t *buf2, *zero, *cur, *last;
     JLSState *state;
-    int i, size;
+    int i, size, ret;
     int comps;
 
-    buf2 = av_malloc(buf_size);
-
-    init_put_bits(&pb, buf, buf_size);
-    init_put_bits(&pb2, buf2, buf_size);
-
     *p = *pict;
     p->pict_type= AV_PICTURE_TYPE_I;
     p->key_frame= 1;
@@ -253,6 +250,17 @@ static int encode_picture_ls(AVCodecContext *avctx, unsigned char *buf, int buf_
     else
         comps = 3;
 
+    if ((ret = ff_alloc_packet(pkt, avctx->width*avctx->height*comps*4 +
+                                    FF_MIN_BUFFER_SIZE)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+        return ret;
+    }
+
+    buf2 = av_malloc(pkt->size);
+
+    init_put_bits(&pb, pkt->data, pkt->size);
+    init_put_bits(&pb2, buf2, pkt->size);
+
     /* write our own JPEG header, can't use mjpeg_picture_header */
     put_marker(&pb, SOI);
     put_marker(&pb, SOF48);
@@ -366,7 +374,10 @@ static int encode_picture_ls(AVCodecContext *avctx, unsigned char *buf, int buf_
 
     emms_c();
 
-    return put_bits_count(&pb) >> 3;
+    pkt->size   = put_bits_count(&pb) >> 3;
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+    return 0;
 }
 
 static av_cold int encode_init_ls(AVCodecContext *ctx) {
@@ -388,7 +399,7 @@ AVCodec ff_jpegls_encoder = { //FIXME avoid MPV_* lossless JPEG should not need
     .id             = CODEC_ID_JPEGLS,
     .priv_data_size = sizeof(JpeglsContext),
     .init           = encode_init_ls,
-    .encode         = encode_picture_ls,
+    .encode2        = encode_picture_ls,
     .pix_fmts       = (const enum PixelFormat[]){PIX_FMT_BGR24, PIX_FMT_RGB24, PIX_FMT_GRAY8, PIX_FMT_GRAY16, PIX_FMT_NONE},
     .long_name      = NULL_IF_CONFIG_SMALL("JPEG-LS"),
 };

From f7fa73ac917534cc5f77469cac2b80462e475417 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Sun, 12 Feb 2012 09:32:40 +0100
Subject: [PATCH 07/40] libtheoraenc: switch to encode2().

---
 libavcodec/libtheoraenc.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/libavcodec/libtheoraenc.c b/libavcodec/libtheoraenc.c
index ecceceaa12..957994a44f 100644
--- a/libavcodec/libtheoraenc.c
+++ b/libavcodec/libtheoraenc.c
@@ -35,6 +35,7 @@
 #include "libavutil/log.h"
 #include "libavutil/base64.h"
 #include "avcodec.h"
+#include "internal.h"
 
 /* libtheora includes */
 #include <theora/theoraenc.h>
@@ -260,14 +261,13 @@ static av_cold int encode_init(AVCodecContext* avc_context)
     return 0;
 }
 
-static int encode_frame(AVCodecContext* avc_context, uint8_t *outbuf,
-                        int buf_size, void *data)
+static int encode_frame(AVCodecContext* avc_context, AVPacket *pkt,
+                        const AVFrame *frame, int *got_packet)
 {
     th_ycbcr_buffer t_yuv_buffer;
     TheoraContext *h = avc_context->priv_data;
-    AVFrame *frame = data;
     ogg_packet o_packet;
-    int result, i;
+    int result, i, ret;
 
     // EOS, finish and get 1st pass stats if applicable
     if (!frame) {
@@ -328,18 +328,21 @@ static int encode_frame(AVCodecContext* avc_context, uint8_t *outbuf,
     }
 
     /* Copy ogg_packet content out to buffer */
-    if (buf_size < o_packet.bytes) {
-        av_log(avc_context, AV_LOG_ERROR, "encoded frame too large\n");
-        return -1;
+    if ((ret = ff_alloc_packet(pkt, o_packet.bytes)) < 0) {
+        av_log(avc_context, AV_LOG_ERROR, "Error getting output packet of size %ld.\n", o_packet.bytes);
+        return ret;
     }
-    memcpy(outbuf, o_packet.packet, o_packet.bytes);
+    memcpy(pkt->data, o_packet.packet, o_packet.bytes);
 
     // HACK: assumes no encoder delay, this is true until libtheora becomes
     // multithreaded (which will be disabled unless explictly requested)
-    avc_context->coded_frame->pts = frame->pts;
+    pkt->pts = pkt->dts = frame->pts;
     avc_context->coded_frame->key_frame = !(o_packet.granulepos & h->keyframe_mask);
+    if (avc_context->coded_frame->key_frame)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
 
-    return o_packet.bytes;
+    return 0;
 }
 
 static av_cold int encode_close(AVCodecContext* avc_context)
@@ -364,7 +367,7 @@ AVCodec ff_libtheora_encoder = {
     .priv_data_size = sizeof(TheoraContext),
     .init = encode_init,
     .close = encode_close,
-    .encode = encode_frame,
+    .encode2 = encode_frame,
     .capabilities = CODEC_CAP_DELAY, // needed to get the statsfile summary
     .pix_fmts= (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_YUV422P, PIX_FMT_YUV444P, PIX_FMT_NONE},
     .long_name = NULL_IF_CONFIG_SMALL("libtheora Theora"),

From 278d88689ba89c78f6c2667cf98025835567d78d Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Sun, 12 Feb 2012 09:32:40 +0100
Subject: [PATCH 08/40] ffv1enc: switch to encode2().

---
 libavcodec/ffv1.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/libavcodec/ffv1.c b/libavcodec/ffv1.c
index 366de02ac8..d2324fa41b 100644
--- a/libavcodec/ffv1.c
+++ b/libavcodec/ffv1.c
@@ -1080,17 +1080,25 @@ static int encode_slice(AVCodecContext *c, void *arg){
     return 0;
 }
 
-static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
     FFV1Context *f = avctx->priv_data;
     RangeCoder * const c= &f->slice_context[0]->c;
-    AVFrame *pict = data;
     AVFrame * const p= &f->picture;
     int used_count= 0;
     uint8_t keystate=128;
     uint8_t *buf_p;
-    int i;
+    int i, ret;
 
-    ff_init_range_encoder(c, buf, buf_size);
+    if (!pkt->data &&
+        (ret = av_new_packet(pkt, avctx->width*avctx->height*((8*2+1+1)*4)/8
+                                  + FF_MIN_BUFFER_SIZE)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+        return ret;
+    }
+
+    ff_init_range_encoder(c, pkt->data, pkt->size);
     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
 
     *p = *pict;
@@ -1110,7 +1118,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     if(!f->ac){
         used_count += ff_rac_terminate(c);
 //printf("pos=%d\n", used_count);
-        init_put_bits(&f->slice_context[0]->pb, buf + used_count, buf_size - used_count);
+        init_put_bits(&f->slice_context[0]->pb, pkt->data + used_count, pkt->size - used_count);
     }else if (f->ac>1){
         int i;
         for(i=1; i<256; i++){
@@ -1121,8 +1129,8 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
 
     for(i=1; i<f->slice_count; i++){
         FFV1Context *fs= f->slice_context[i];
-        uint8_t *start= buf + (buf_size-used_count)*i/f->slice_count;
-        int len= buf_size/f->slice_count;
+        uint8_t *start = pkt->data + (pkt->size-used_count)*i/f->slice_count;
+        int len = pkt->size/f->slice_count;
 
         if(fs->ac){
             ff_init_range_encoder(&fs->c, start, len);
@@ -1132,7 +1140,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     }
     avctx->execute(avctx, encode_slice, &f->slice_context[0], NULL, f->slice_count, sizeof(void*));
 
-    buf_p=buf;
+    buf_p = pkt->data;
     for(i=0; i<f->slice_count; i++){
         FFV1Context *fs= f->slice_context[i];
         int bytes;
@@ -1147,7 +1155,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
             used_count= 0;
         }
         if(i>0){
-            av_assert0(bytes < buf_size/f->slice_count);
+            av_assert0(bytes < pkt->size/f->slice_count);
             memmove(buf_p, fs->ac ? fs->c.bytestream_start : fs->pb.buf, bytes);
             av_assert0(bytes < (1<<24));
             AV_WB24(buf_p+bytes, bytes);
@@ -1200,7 +1208,11 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
         avctx->stats_out[0] = '\0';
 
     f->picture_number++;
-    return buf_p-buf;
+    pkt->size   = buf_p - pkt->data;
+    pkt->flags |= AV_PKT_FLAG_KEY*p->key_frame;
+    *got_packet = 1;
+
+    return 0;
 }
 #endif /* CONFIG_FFV1_ENCODER */
 
@@ -1742,7 +1754,7 @@ AVCodec ff_ffv1_encoder = {
     .id             = CODEC_ID_FFV1,
     .priv_data_size = sizeof(FFV1Context),
     .init           = encode_init,
-    .encode         = encode_frame,
+    .encode2        = encode_frame,
     .close          = common_end,
     .capabilities = CODEC_CAP_SLICE_THREADS,
     .pix_fmts= (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_YUV444P, PIX_FMT_YUV422P, PIX_FMT_YUV411P, PIX_FMT_YUV410P, PIX_FMT_RGB32, PIX_FMT_YUV420P16, PIX_FMT_YUV422P16, PIX_FMT_YUV444P16, PIX_FMT_NONE},

From df53a473170399bf7bc94b397a1d9f1d50366049 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Mon, 20 Feb 2012 13:21:58 +0100
Subject: [PATCH 09/40] libschroedingerenc: switch to encode2().

---
 libavcodec/libschroedingerenc.c | 43 ++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/libavcodec/libschroedingerenc.c b/libavcodec/libschroedingerenc.c
index 2aadd3af3f..ed017a1a1f 100644
--- a/libavcodec/libschroedingerenc.c
+++ b/libavcodec/libschroedingerenc.c
@@ -35,6 +35,7 @@
 #include <schroedinger/schrovideoformat.h>
 
 #include "avcodec.h"
+#include "internal.h"
 #include "libdirac_libschro.h"
 #include "libschroedinger.h"
 #include "bytestream.h"
@@ -71,6 +72,9 @@ typedef struct SchroEncoderParams {
 
     /** end of sequence pulled */
     int eos_pulled;
+
+    /* counter for frames submitted to encoder, used as dts */
+    int64_t dts;
 } SchroEncoderParams;
 
 /**
@@ -175,6 +179,7 @@ static int libschroedinger_encode_init(AVCodecContext *avccontext)
         schro_encoder_setting_set_double(p_schro_params->encoder,
                                          "au_distance", avccontext->gop_size);
         avccontext->has_b_frames = 1;
+        p_schro_params->dts = -1;
     }
 
     /* FIXME - Need to handle SCHRO_ENCODER_RATE_CONTROL_LOW_DELAY. */
@@ -236,7 +241,7 @@ static int libschroedinger_encode_init(AVCodecContext *avccontext)
 }
 
 static SchroFrame *libschroedinger_frame_from_data(AVCodecContext *avccontext,
-                                                   void *in_data)
+                                                   const AVFrame *frame)
 {
     SchroEncoderParams *p_schro_params = avccontext->priv_data;
     SchroFrame *in_frame;
@@ -246,7 +251,7 @@ static SchroFrame *libschroedinger_frame_from_data(AVCodecContext *avccontext,
     in_frame = ff_create_schro_frame(avccontext, p_schro_params->frame_format);
 
     if (in_frame)
-        avpicture_layout((AVPicture *)in_data, avccontext->pix_fmt,
+        avpicture_layout((const AVPicture *)frame, avccontext->pix_fmt,
                           avccontext->width, avccontext->height,
                           in_frame->components[0].data,
                           p_schro_params->frame_size);
@@ -262,9 +267,8 @@ static void SchroedingerFreeFrame(void *data)
     av_free(enc_frame);
 }
 
-static int libschroedinger_encode_frame(AVCodecContext *avccontext,
-                                        unsigned char *frame,
-                                        int buf_size, void *data)
+static int libschroedinger_encode_frame(AVCodecContext *avccontext, AVPacket *pkt,
+                                        const AVFrame *frame, int *got_packet)
 {
     int enc_size = 0;
     SchroEncoderParams *p_schro_params = avccontext->priv_data;
@@ -275,8 +279,9 @@ static int libschroedinger_encode_frame(AVCodecContext *avccontext,
     int presentation_frame;
     int parse_code;
     int last_frame_in_sequence = 0;
+    int pkt_size, ret;
 
-    if (!data) {
+    if (!frame) {
         /* Push end of sequence if not already signalled. */
         if (!p_schro_params->eos_signalled) {
             schro_encoder_end_of_stream(encoder);
@@ -285,7 +290,7 @@ static int libschroedinger_encode_frame(AVCodecContext *avccontext,
     } else {
         /* Allocate frame data to schro input buffer. */
         SchroFrame *in_frame = libschroedinger_frame_from_data(avccontext,
-                                                               data);
+                                                               frame);
         /* Load next frame. */
         schro_encoder_push_frame(encoder, in_frame);
     }
@@ -373,28 +378,42 @@ static int libschroedinger_encode_frame(AVCodecContext *avccontext,
     if (!p_frame_output)
         return 0;
 
-    memcpy(frame, p_frame_output->p_encbuf, p_frame_output->size);
+    pkt_size = p_frame_output->size;
+    if (last_frame_in_sequence && p_schro_params->enc_buf_size > 0)
+        pkt_size += p_schro_params->enc_buf_size;
+    if ((ret = ff_alloc_packet(pkt, pkt_size)) < 0) {
+        av_log(avccontext, AV_LOG_ERROR, "Error getting output packet of size %d.\n", pkt_size);
+        goto error;
+    }
+
+    memcpy(pkt->data, p_frame_output->p_encbuf, p_frame_output->size);
     avccontext->coded_frame->key_frame = p_frame_output->key_frame;
     /* Use the frame number of the encoded frame as the pts. It is OK to
      * do so since Dirac is a constant frame rate codec. It expects input
      * to be of constant frame rate. */
+    pkt->pts =
     avccontext->coded_frame->pts = p_frame_output->frame_num;
+    pkt->dts = p_schro_params->dts++;
     enc_size = p_frame_output->size;
 
     /* Append the end of sequence information to the last frame in the
      * sequence. */
     if (last_frame_in_sequence && p_schro_params->enc_buf_size > 0) {
-        memcpy(frame + enc_size, p_schro_params->enc_buf,
+        memcpy(pkt->data + enc_size, p_schro_params->enc_buf,
                p_schro_params->enc_buf_size);
         enc_size += p_schro_params->enc_buf_size;
         av_freep(&p_schro_params->enc_buf);
         p_schro_params->enc_buf_size = 0;
     }
 
+    if (p_frame_output->key_frame)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+error:
     /* free frame */
     SchroedingerFreeFrame(p_frame_output);
-
-    return enc_size;
+    return ret;
 }
 
 
@@ -427,7 +446,7 @@ AVCodec ff_libschroedinger_encoder = {
     .id             = CODEC_ID_DIRAC,
     .priv_data_size = sizeof(SchroEncoderParams),
     .init           = libschroedinger_encode_init,
-    .encode         = libschroedinger_encode_frame,
+    .encode2        = libschroedinger_encode_frame,
     .close          = libschroedinger_encode_close,
    .capabilities = CODEC_CAP_DELAY,
    .pix_fmts     = (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_YUV422P, PIX_FMT_YUV444P, PIX_FMT_NONE},

From 3c0ed7d1a8ccb170c1ce5acacfe5e9c26135541e Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Wed, 22 Feb 2012 07:38:13 +0100
Subject: [PATCH 10/40] asv1enc: switch to encode2().

---
 libavcodec/asv1.c | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/libavcodec/asv1.c b/libavcodec/asv1.c
index 71b6fba9ae..94073c087e 100644
--- a/libavcodec/asv1.c
+++ b/libavcodec/asv1.c
@@ -325,10 +325,12 @@ static inline int decode_mb(ASV1Context *a, DCTELEM block[6][64]){
     return 0;
 }
 
+#define MAX_MB_SIZE (30*16*16*3/2/8)
+
 static inline int encode_mb(ASV1Context *a, DCTELEM block[6][64]){
     int i;
 
-    if(a->pb.buf_end - a->pb.buf - (put_bits_count(&a->pb)>>3) < 30*16*16*3/2/8){
+    if (a->pb.buf_end - a->pb.buf - (put_bits_count(&a->pb)>>3) < MAX_MB_SIZE) {
         av_log(a->avctx, AV_LOG_ERROR, "encoded frame too large\n");
         return -1;
     }
@@ -461,14 +463,22 @@ static int decode_frame(AVCodecContext *avctx,
 }
 
 #if CONFIG_ASV1_ENCODER || CONFIG_ASV2_ENCODER
-static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
     ASV1Context * const a = avctx->priv_data;
-    AVFrame *pict = data;
     AVFrame * const p= &a->picture;
-    int size;
+    int size, ret;
     int mb_x, mb_y;
 
-    init_put_bits(&a->pb, buf, buf_size);
+    if (!pkt->data &&
+        (ret = av_new_packet(pkt, a->mb_height*a->mb_width*MAX_MB_SIZE +
+                                  FF_MIN_BUFFER_SIZE)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+        return ret;
+    }
+
+    init_put_bits(&a->pb, pkt->data, pkt->size);
 
     *p = *pict;
     p->pict_type= AV_PICTURE_TYPE_I;
@@ -505,14 +515,18 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     size= put_bits_count(&a->pb)/32;
 
     if(avctx->codec_id == CODEC_ID_ASV1)
-        a->dsp.bswap_buf((uint32_t*)buf, (uint32_t*)buf, size);
+        a->dsp.bswap_buf((uint32_t*)pkt->data, (uint32_t*)pkt->data, size);
     else{
         int i;
         for(i=0; i<4*size; i++)
-            buf[i]= av_reverse[ buf[i] ];
+            pkt->data[i] = av_reverse[pkt->data[i]];
     }
 
-    return size*4;
+    pkt->size   = size*4;
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
 }
 #endif /* CONFIG_ASV1_ENCODER || CONFIG_ASV2_ENCODER */
 
@@ -634,7 +648,7 @@ AVCodec ff_asv1_encoder = {
     .id             = CODEC_ID_ASV1,
     .priv_data_size = sizeof(ASV1Context),
     .init           = encode_init,
-    .encode         = encode_frame,
+    .encode2        = encode_frame,
     //encode_end,
     .pix_fmts= (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_NONE},
     .long_name= NULL_IF_CONFIG_SMALL("ASUS V1"),
@@ -648,7 +662,7 @@ AVCodec ff_asv2_encoder = {
     .id             = CODEC_ID_ASV2,
     .priv_data_size = sizeof(ASV1Context),
     .init           = encode_init,
-    .encode         = encode_frame,
+    .encode2        = encode_frame,
     //encode_end,
     .pix_fmts= (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_NONE},
     .long_name= NULL_IF_CONFIG_SMALL("ASUS V2"),

From c2ff63e3ac82d5ee501e480d4714e982fc45cf8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 23 Feb 2012 11:38:24 +0200
Subject: [PATCH 11/40] rtpenc: Move the trailing comma into FF_RTP_FLAG_OPTS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This simplifies adding more flags to the macro.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavformat/movenc.c | 2 +-
 libavformat/rtpenc.c | 2 +-
 libavformat/rtpenc.h | 2 +-
 libavformat/rtsp.c   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavformat/movenc.c b/libavformat/movenc.c
index d50a0e045e..2c6a6e1864 100644
--- a/libavformat/movenc.c
+++ b/libavformat/movenc.c
@@ -51,7 +51,7 @@ static const AVOption options[] = {
     { "separate_moof", "Write separate moof/mdat atoms for each track", 0, AV_OPT_TYPE_CONST, {.dbl = FF_MOV_FLAG_SEPARATE_MOOF}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "movflags" },
     { "frag_custom", "Flush fragments on caller requests", 0, AV_OPT_TYPE_CONST, {.dbl = FF_MOV_FLAG_FRAG_CUSTOM}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "movflags" },
     { "isml", "Create a live smooth streaming feed (for pushing to a publishing point)", 0, AV_OPT_TYPE_CONST, {.dbl = FF_MOV_FLAG_ISML}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "movflags" },
-    FF_RTP_FLAG_OPTS(MOVMuxContext, rtp_flags),
+    FF_RTP_FLAG_OPTS(MOVMuxContext, rtp_flags)
     { "skip_iods", "Skip writing iods atom.", offsetof(MOVMuxContext, iods_skip), AV_OPT_TYPE_INT, {.dbl = 0}, 0, 1, AV_OPT_FLAG_ENCODING_PARAM},
     { "iods_audio_profile", "iods audio profile atom.", offsetof(MOVMuxContext, iods_audio_profile), AV_OPT_TYPE_INT, {.dbl = -1}, -1, 255, AV_OPT_FLAG_ENCODING_PARAM},
     { "iods_video_profile", "iods video profile atom.", offsetof(MOVMuxContext, iods_video_profile), AV_OPT_TYPE_INT, {.dbl = -1}, -1, 255, AV_OPT_FLAG_ENCODING_PARAM},
diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
index a4a6987032..e19541798b 100644
--- a/libavformat/rtpenc.c
+++ b/libavformat/rtpenc.c
@@ -31,7 +31,7 @@
 //#define DEBUG
 
 static const AVOption options[] = {
-    FF_RTP_FLAG_OPTS(RTPMuxContext, flags),
+    FF_RTP_FLAG_OPTS(RTPMuxContext, flags)
     { "payload_type", "Specify RTP payload type", offsetof(RTPMuxContext, payload_type), AV_OPT_TYPE_INT, {.dbl = -1 }, -1, 127, AV_OPT_FLAG_ENCODING_PARAM },
     { NULL },
 };
diff --git a/libavformat/rtpenc.h b/libavformat/rtpenc.h
index 72d7f65783..fdad24947b 100644
--- a/libavformat/rtpenc.h
+++ b/libavformat/rtpenc.h
@@ -67,7 +67,7 @@ typedef struct RTPMuxContext RTPMuxContext;
 
 #define FF_RTP_FLAG_OPTS(ctx, fieldname) \
     { "rtpflags", "RTP muxer flags", offsetof(ctx, fieldname), AV_OPT_TYPE_FLAGS, {.dbl = 0}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" }, \
-    { "latm", "Use MP4A-LATM packetization instead of MPEG4-GENERIC for AAC", 0, AV_OPT_TYPE_CONST, {.dbl = FF_RTP_FLAG_MP4A_LATM}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" } \
+    { "latm", "Use MP4A-LATM packetization instead of MPEG4-GENERIC for AAC", 0, AV_OPT_TYPE_CONST, {.dbl = FF_RTP_FLAG_MP4A_LATM}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" }, \
 
 void ff_rtp_send_data(AVFormatContext *s1, const uint8_t *buf1, int len, int m);
 
diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c
index 359d910076..29994e4eb3 100644
--- a/libavformat/rtsp.c
+++ b/libavformat/rtsp.c
@@ -73,7 +73,7 @@
 
 const AVOption ff_rtsp_options[] = {
     { "initial_pause",  "Don't start playing the stream immediately", OFFSET(initial_pause), AV_OPT_TYPE_INT, {0}, 0, 1, DEC },
-    FF_RTP_FLAG_OPTS(RTSPState, rtp_muxer_flags),
+    FF_RTP_FLAG_OPTS(RTSPState, rtp_muxer_flags)
     { "rtsp_transport", "RTSP transport protocols", OFFSET(lower_transport_mask), AV_OPT_TYPE_FLAGS, {0}, INT_MIN, INT_MAX, DEC|ENC, "rtsp_transport" }, \
     { "udp", "UDP", 0, AV_OPT_TYPE_CONST, {1 << RTSP_LOWER_TRANSPORT_UDP}, 0, 0, DEC|ENC, "rtsp_transport" }, \
     { "tcp", "TCP", 0, AV_OPT_TYPE_CONST, {1 << RTSP_LOWER_TRANSPORT_TCP}, 0, 0, DEC|ENC, "rtsp_transport" }, \

From c4584f3c1ff3997fd98c3cc992fe82cb99f6b248 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 7 Feb 2012 16:39:14 +0200
Subject: [PATCH 12/40] rtpenc: Allow packetizing H263 according to the old RFC
 2190
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to newer RFCs, this packetization scheme should only
be used for interfacing with legacy systems.

Implementing this packetization mode properly requires parsing
the full H263 bitstream to find macroblock boundaries (and knowing
their macroblock and gob numbers and motion vector predictors).

This implementation tries to look for GOB headers (which
can be inserted by using -ps <small number>), but if the GOBs
aren't small enough to fit into the MTU, the packetizer blindly
splits packets at any offset and claims it to be a GOB boundary
(by using Mode A from the RFC). While not correct, this seems
to work with some receivers.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavformat/Makefile              |   1 +
 libavformat/rtp.c                 |   4 +-
 libavformat/rtpenc.c              |   5 ++
 libavformat/rtpenc.h              |   6 ++
 libavformat/rtpenc_h263.c         |   7 +-
 libavformat/rtpenc_h263_rfc2190.c | 104 ++++++++++++++++++++++++++++++
 libavformat/sdp.c                 |   3 +
 7 files changed, 126 insertions(+), 4 deletions(-)
 create mode 100644 libavformat/rtpenc_h263_rfc2190.c

diff --git a/libavformat/Makefile b/libavformat/Makefile
index 5c0e34242e..9682ece804 100644
--- a/libavformat/Makefile
+++ b/libavformat/Makefile
@@ -242,6 +242,7 @@ OBJS-$(CONFIG_RTP_MUXER)                 += rtp.o         \
                                             rtpenc_latm.o    \
                                             rtpenc_amr.o     \
                                             rtpenc_h263.o    \
+                                            rtpenc_h263_rfc2190.o \
                                             rtpenc_mpv.o     \
                                             rtpenc.o      \
                                             rtpenc_h264.o \
diff --git a/libavformat/rtp.c b/libavformat/rtp.c
index b6b4b72aa3..6516779feb 100644
--- a/libavformat/rtp.c
+++ b/libavformat/rtp.c
@@ -106,7 +106,9 @@ int ff_rtp_get_payload_type(AVFormatContext *fmt, AVCodecContext *codec)
     /* static payload type */
     for (i = 0; AVRtpPayloadTypes[i].pt >= 0; ++i)
         if (AVRtpPayloadTypes[i].codec_id == codec->codec_id) {
-            if (codec->codec_id == CODEC_ID_H263)
+            if (codec->codec_id == CODEC_ID_H263 && (!fmt ||
+                !fmt->oformat->priv_class ||
+                !av_opt_flag_is_set(fmt->priv_data, "rtpflags", "rfc2190")))
                 continue;
             if (codec->codec_id == CODEC_ID_PCM_S16BE)
                 if (codec->channels != AVRtpPayloadTypes[i].audio_channels)
diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
index e19541798b..4d4e168c97 100644
--- a/libavformat/rtpenc.c
+++ b/libavformat/rtpenc.c
@@ -443,6 +443,11 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt)
         ff_rtp_send_h264(s1, pkt->data, size);
         break;
     case CODEC_ID_H263:
+        if (s->flags & FF_RTP_FLAG_RFC2190) {
+            ff_rtp_send_h263_rfc2190(s1, pkt->data, size);
+            break;
+        }
+        /* Fallthrough */
     case CODEC_ID_H263P:
         ff_rtp_send_h263(s1, pkt->data, size);
         break;
diff --git a/libavformat/rtpenc.h b/libavformat/rtpenc.h
index fdad24947b..ff423a55d1 100644
--- a/libavformat/rtpenc.h
+++ b/libavformat/rtpenc.h
@@ -64,15 +64,18 @@ struct RTPMuxContext {
 typedef struct RTPMuxContext RTPMuxContext;
 
 #define FF_RTP_FLAG_MP4A_LATM 1
+#define FF_RTP_FLAG_RFC2190   2
 
 #define FF_RTP_FLAG_OPTS(ctx, fieldname) \
     { "rtpflags", "RTP muxer flags", offsetof(ctx, fieldname), AV_OPT_TYPE_FLAGS, {.dbl = 0}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" }, \
     { "latm", "Use MP4A-LATM packetization instead of MPEG4-GENERIC for AAC", 0, AV_OPT_TYPE_CONST, {.dbl = FF_RTP_FLAG_MP4A_LATM}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" }, \
+    { "rfc2190", "Use RFC 2190 packetization instead of RFC 4629 for H.263", 0, AV_OPT_TYPE_CONST, {.dbl = FF_RTP_FLAG_RFC2190}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" }, \
 
 void ff_rtp_send_data(AVFormatContext *s1, const uint8_t *buf1, int len, int m);
 
 void ff_rtp_send_h264(AVFormatContext *s1, const uint8_t *buf1, int size);
 void ff_rtp_send_h263(AVFormatContext *s1, const uint8_t *buf1, int size);
+void ff_rtp_send_h263_rfc2190(AVFormatContext *s1, const uint8_t *buf1, int size);
 void ff_rtp_send_aac(AVFormatContext *s1, const uint8_t *buff, int size);
 void ff_rtp_send_latm(AVFormatContext *s1, const uint8_t *buff, int size);
 void ff_rtp_send_amr(AVFormatContext *s1, const uint8_t *buff, int size);
@@ -80,4 +83,7 @@ void ff_rtp_send_mpegvideo(AVFormatContext *s1, const uint8_t *buf1, int size);
 void ff_rtp_send_xiph(AVFormatContext *s1, const uint8_t *buff, int size);
 void ff_rtp_send_vp8(AVFormatContext *s1, const uint8_t *buff, int size);
 
+const uint8_t *ff_h263_find_resync_marker_reverse(const uint8_t *restrict start,
+                                                  const uint8_t *restrict end);
+
 #endif /* AVFORMAT_RTPENC_H */
diff --git a/libavformat/rtpenc_h263.c b/libavformat/rtpenc_h263.c
index fbc696e1b4..87f0bd7981 100644
--- a/libavformat/rtpenc_h263.c
+++ b/libavformat/rtpenc_h263.c
@@ -23,8 +23,8 @@
 #include "avformat.h"
 #include "rtpenc.h"
 
-static const uint8_t *find_resync_marker_reverse(const uint8_t *restrict start,
-                                                 const uint8_t *restrict end)
+const uint8_t *ff_h263_find_resync_marker_reverse(const uint8_t *restrict start,
+                                                  const uint8_t *restrict end)
 {
     const uint8_t *p = end - 1;
     start += 1; /* Make sure we never return the original start. */
@@ -63,7 +63,8 @@ void ff_rtp_send_h263(AVFormatContext *s1, const uint8_t *buf1, int size)
 
         /* Look for a better place to split the frame into packets. */
         if (len < size) {
-            const uint8_t *end = find_resync_marker_reverse(buf1, buf1 + len);
+            const uint8_t *end = ff_h263_find_resync_marker_reverse(buf1,
+                                                                    buf1 + len);
             len = end - buf1;
         }
 
diff --git a/libavformat/rtpenc_h263_rfc2190.c b/libavformat/rtpenc_h263_rfc2190.c
new file mode 100644
index 0000000000..305c1a27c8
--- /dev/null
+++ b/libavformat/rtpenc_h263_rfc2190.c
@@ -0,0 +1,104 @@
+/*
+ * RTP packetization for H.263 video
+ * Copyright (c) 2012 Martin Storsjo
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avformat.h"
+#include "rtpenc.h"
+#include "libavcodec/put_bits.h"
+#include "libavcodec/get_bits.h"
+
+struct H263Info {
+    int src;
+    int i;
+    int u;
+    int s;
+    int a;
+    int pb;
+    int tr;
+};
+
+static void send_mode_a(AVFormatContext *s1, const struct H263Info *info,
+                        const uint8_t *buf, int len, int m)
+{
+    RTPMuxContext *s = s1->priv_data;
+    PutBitContext pb;
+
+    init_put_bits(&pb, s->buf, 32);
+    put_bits(&pb, 1, 0); /* F - 0, mode A */
+    put_bits(&pb, 1, 0); /* P - 0, normal I/P */
+    put_bits(&pb, 3, 0); /* SBIT - 0 bits */
+    put_bits(&pb, 3, 0); /* EBIT - 0 bits */
+    put_bits(&pb, 3, info->src); /* SRC - source format */
+    put_bits(&pb, 1, info->i); /* I - inter/intra */
+    put_bits(&pb, 1, info->u); /* U - unrestricted motion vector */
+    put_bits(&pb, 1, info->s); /* S - syntax-based arithmetic coding */
+    put_bits(&pb, 1, info->a); /* A - advanced prediction */
+    put_bits(&pb, 4, 0); /* R - reserved */
+    put_bits(&pb, 2, 0); /* DBQ - 0 */
+    put_bits(&pb, 3, 0); /* TRB - 0 */
+    put_bits(&pb, 8, info->tr); /* TR */
+    flush_put_bits(&pb);
+    memcpy(s->buf + 4, buf, len);
+
+    ff_rtp_send_data(s1, s->buf, len + 4, m);
+}
+
+void ff_rtp_send_h263_rfc2190(AVFormatContext *s1, const uint8_t *buf, int size)
+{
+    RTPMuxContext *s = s1->priv_data;
+    int len;
+    GetBitContext gb;
+    struct H263Info info = { 0 };
+
+    s->timestamp = s->cur_timestamp;
+
+    init_get_bits(&gb, buf, size*8);
+    if (get_bits(&gb, 22) == 0x20) { /* Picture Start Code */
+        info.tr  = get_bits(&gb, 8);
+        skip_bits(&gb, 2); /* PTYPE start, H261 disambiguation */
+        skip_bits(&gb, 3); /* Split screen, document camera, freeze picture release */
+        info.src = get_bits(&gb, 3);
+        info.i   = get_bits(&gb, 1);
+        info.u   = get_bits(&gb, 1);
+        info.s   = get_bits(&gb, 1);
+        info.a   = get_bits(&gb, 1);
+        info.pb  = get_bits(&gb, 1);
+    }
+
+    while (size > 0) {
+        len = FFMIN(s->max_payload_size - 4, size);
+
+        /* Look for a better place to split the frame into packets. */
+        if (len < size) {
+            const uint8_t *end = ff_h263_find_resync_marker_reverse(buf,
+                                                                    buf + len);
+            len = end - buf;
+            if (len == s->max_payload_size - 4)
+                av_log(s1, AV_LOG_WARNING,
+                       "No GOB boundary found within MTU size, splitting at "
+                       "a random boundary\n");
+        }
+
+        send_mode_a(s1, &info, buf, len, len == size);
+
+        buf  += len;
+        size -= len;
+    }
+}
diff --git a/libavformat/sdp.c b/libavformat/sdp.c
index 5e0bf72050..b2c4f7bcd8 100644
--- a/libavformat/sdp.c
+++ b/libavformat/sdp.c
@@ -404,6 +404,9 @@ static char *sdp_write_media_attributes(char *buff, int size, AVCodecContext *c,
              * actually specifies the maximum video size, but we only know
              * the current size. This is required for playback on Android
              * stagefright and on Samsung bada. */
+            if (!fmt || !fmt->oformat->priv_class ||
+                !av_opt_flag_is_set(fmt->priv_data, "rtpflags", "rfc2190") ||
+                c->codec_id == CODEC_ID_H263P)
             av_strlcatf(buff, size, "a=rtpmap:%d H263-2000/90000\r\n"
                                     "a=framesize:%d %d-%d\r\n",
                                     payload_type,

From ada4e362b9fa8cef033afb040f99b07069a7a5ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Sun, 5 Sep 2010 01:10:54 +0300
Subject: [PATCH 13/40] rtpenc: Add an error message
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Also return a proper error code.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavformat/rtpenc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
index 4d4e168c97..604c4a0552 100644
--- a/libavformat/rtpenc.c
+++ b/libavformat/rtpenc.c
@@ -85,8 +85,10 @@ static int rtp_write_header(AVFormatContext *s1)
     int max_packet_size, n;
     AVStream *st;
 
-    if (s1->nb_streams != 1)
-        return -1;
+    if (s1->nb_streams != 1) {
+        av_log(s1, AV_LOG_ERROR, "Only one stream supported in the RTP muxer\n");
+        return AVERROR(EINVAL);
+    }
     st = s1->streams[0];
     if (!is_supported(st->codec->codec_id)) {
         av_log(s1, AV_LOG_ERROR, "Unsupported codec %x\n", st->codec->codec_id);

From 2b83e8b7005d531bc78b0fd4f699e9faa54ce9bb Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 22 Feb 2012 12:19:52 -0800
Subject: [PATCH 14/40] truemotion2: error out if the huffman tree has no
 nodes.

This prevents crashers and errors further down when reading nodes in the
empty tree.

Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
CC: libav-stable@libav.org
---
 libavcodec/truemotion2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/truemotion2.c b/libavcodec/truemotion2.c
index 9e45045342..7be148c602 100644
--- a/libavcodec/truemotion2.c
+++ b/libavcodec/truemotion2.c
@@ -132,7 +132,7 @@ static int tm2_build_huff_table(TM2Context *ctx, TM2Codes *code)
                huff.val_bits, huff.max_bits);
         return -1;
     }
-    if((huff.nodes < 0) || (huff.nodes > 0x10000)) {
+    if((huff.nodes <= 0) || (huff.nodes > 0x10000)) {
         av_log(ctx->avctx, AV_LOG_ERROR, "Incorrect number of Huffman tree nodes: %i\n", huff.nodes);
         return -1;
     }

From 8e8124e173c8584654d34fd9959c57dac951e36b Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Mon, 20 Feb 2012 13:21:58 +0100
Subject: [PATCH 15/40] ljpegenc: switch to encode2().

---
 libavcodec/ljpegenc.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/libavcodec/ljpegenc.c b/libavcodec/ljpegenc.c
index ac5c716496..d2b9317114 100644
--- a/libavcodec/ljpegenc.c
+++ b/libavcodec/ljpegenc.c
@@ -32,21 +32,37 @@
 
 #include "avcodec.h"
 #include "dsputil.h"
+#include "internal.h"
 #include "mpegvideo.h"
 #include "mjpeg.h"
 #include "mjpegenc.h"
 
 
-static int encode_picture_lossless(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
+static int encode_picture_lossless(AVCodecContext *avctx, AVPacket *pkt,
+                                   const AVFrame *pict, int *got_packet)
+{
     MpegEncContext * const s = avctx->priv_data;
     MJpegContext * const m = s->mjpeg_ctx;
-    AVFrame *pict = data;
     const int width= s->width;
     const int height= s->height;
     AVFrame * const p= (AVFrame*)&s->current_picture;
     const int predictor= avctx->prediction_method+1;
+    const int mb_width  = (width  + s->mjpeg_hsample[0] - 1) / s->mjpeg_hsample[0];
+    const int mb_height = (height + s->mjpeg_vsample[0] - 1) / s->mjpeg_vsample[0];
+    int ret, max_pkt_size = FF_MIN_BUFFER_SIZE;
 
-    init_put_bits(&s->pb, buf, buf_size);
+    if (avctx->pix_fmt == PIX_FMT_BGRA)
+        max_pkt_size += width * height * 3 * 4;
+    else {
+        max_pkt_size += mb_width * mb_height * 3 * 4
+                        * s->mjpeg_hsample[0] * s->mjpeg_vsample[0];
+    }
+    if ((ret = ff_alloc_packet(pkt, max_pkt_size)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", max_pkt_size);
+        return ret;
+    }
+
+    init_put_bits(&s->pb, pkt->data, pkt->size);
 
     *p = *pict;
     p->pict_type= AV_PICTURE_TYPE_I;
@@ -104,8 +120,6 @@ static int encode_picture_lossless(AVCodecContext *avctx, unsigned char *buf, in
         }
     }else{
         int mb_x, mb_y, i;
-        const int mb_width  = (width  + s->mjpeg_hsample[0] - 1) / s->mjpeg_hsample[0];
-        const int mb_height = (height + s->mjpeg_vsample[0] - 1) / s->mjpeg_vsample[0];
 
         for(mb_y = 0; mb_y < mb_height; mb_y++) {
             if(s->pb.buf_end - s->pb.buf - (put_bits_count(&s->pb)>>3) < mb_width * 4 * 3 * s->mjpeg_hsample[0] * s->mjpeg_vsample[0]){
@@ -181,7 +195,11 @@ static int encode_picture_lossless(AVCodecContext *avctx, unsigned char *buf, in
     s->picture_number++;
 
     flush_put_bits(&s->pb);
-    return put_bits_ptr(&s->pb) - s->pb.buf;
+    pkt->size   = put_bits_ptr(&s->pb) - s->pb.buf;
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
 //    return (put_bits_count(&f->pb)+7)/8;
 }
 
@@ -192,7 +210,7 @@ AVCodec ff_ljpeg_encoder = { //FIXME avoid MPV_* lossless JPEG should not need t
     .id             = CODEC_ID_LJPEG,
     .priv_data_size = sizeof(MpegEncContext),
     .init           = ff_MPV_encode_init,
-    .encode         = encode_picture_lossless,
+    .encode2        = encode_picture_lossless,
     .close          = ff_MPV_encode_end,
     .long_name      = NULL_IF_CONFIG_SMALL("Lossless JPEG"),
 };

From a5f848c86db28b0c328593abeea4678903a66f07 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Mon, 6 Feb 2012 07:34:57 +0100
Subject: [PATCH 16/40] libxavs: split extradata writing out of encode_nals().

This is done in preparation for the following patch implementing
encode2().

This commit is analogous to 05d699222dd5af4f5775f9890aa825ede05a144f for
libx264.
---
 libavcodec/libxavs.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/libavcodec/libxavs.c b/libavcodec/libxavs.c
index 8587eae99b..83f74a713c 100644
--- a/libavcodec/libxavs.c
+++ b/libavcodec/libxavs.c
@@ -86,14 +86,6 @@ static int encode_nals(AVCodecContext *ctx, uint8_t *buf,
     }
 
     for (i = 0; i < nnal; i++) {
-        /* Don't put the SEI in extradata. */
-        if (skip_sei && nals[i].i_type == NAL_SEI) {
-            x4->sei = av_malloc( 5 + nals[i].i_payload * 4 / 3 );
-            if (xavs_nal_encode(x4->sei, &x4->sei_size, 1, nals + i) < 0)
-                return -1;
-
-            continue;
-        }
         s = xavs_nal_encode(p, &size, 1, nals + i);
         if (s < 0)
             return -1;
@@ -329,12 +321,27 @@ static av_cold int XAVS_init(AVCodecContext *avctx)
     /* We Have PPS and SPS in AVS */
     if (avctx->flags & CODEC_FLAG_GLOBAL_HEADER) {
         xavs_nal_t *nal;
-        int nnal, s;
+        int nnal, s, i, size;
+        uint8_t *p;
 
         s = xavs_encoder_headers(x4->enc, &nal, &nnal);
 
-        avctx->extradata      = av_malloc(s);
-        avctx->extradata_size = encode_nals(avctx, avctx->extradata, s, nal, nnal, 1);
+        avctx->extradata = p = av_malloc(s);
+        for (i = 0; i < nnal; i++) {
+            /* Don't put the SEI in extradata. */
+            if (nal[i].i_type == NAL_SEI) {
+                x4->sei = av_malloc( 5 + nal[i].i_payload * 4 / 3 );
+                if (xavs_nal_encode(x4->sei, &x4->sei_size, 1, nal + i) < 0)
+                    return -1;
+
+                continue;
+            }
+            size = xavs_nal_encode(p, &s, 1, nal + i);
+            if (size < 0)
+                return -1;
+            p += size;
+        }
+        avctx->extradata_size = p - avctx->extradata;
     }
     return 0;
 }

From d3b577f00d14fb8b8a962163a136dde96d9396b4 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Tue, 21 Feb 2012 22:57:59 +0100
Subject: [PATCH 17/40] libxavs: add an AVClass at the beginning of
 XavsContext.

---
 libavcodec/libxavs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libavcodec/libxavs.c b/libavcodec/libxavs.c
index 83f74a713c..f285ac4e04 100644
--- a/libavcodec/libxavs.c
+++ b/libavcodec/libxavs.c
@@ -37,6 +37,7 @@
 #define XAVS_PART_B8X8 0x100 /* Analyze b16x8, b*/
 
 typedef struct XavsContext {
+    AVClass        *class;
     xavs_param_t    params;
     xavs_t         *enc;
     xavs_picture_t  pic;

From 84db922bf9066af37e38df6d22b6622b730a961f Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Wed, 22 Feb 2012 07:38:13 +0100
Subject: [PATCH 18/40] libxavs: switch to encode2().

---
 libavcodec/libxavs.c | 88 +++++++++++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 25 deletions(-)

diff --git a/libavcodec/libxavs.c b/libavcodec/libxavs.c
index f285ac4e04..9e45a3796b 100644
--- a/libavcodec/libxavs.c
+++ b/libavcodec/libxavs.c
@@ -54,6 +54,9 @@ typedef struct XavsContext {
     int fast_pskip;
     int mbtree;
     int mixed_refs;
+
+    int64_t *pts_buffer;
+    int out_frame_count;
 } XavsContext;
 
 static void XAVS_log(void *p, int level, const char *fmt, va_list args)
@@ -71,13 +74,24 @@ static void XAVS_log(void *p, int level, const char *fmt, va_list args)
     av_vlog(p, level_map[level], fmt, args);
 }
 
-static int encode_nals(AVCodecContext *ctx, uint8_t *buf,
-                       int size, xavs_nal_t *nals,
-                       int nnal, int skip_sei)
+static int encode_nals(AVCodecContext *ctx, AVPacket *pkt,
+                       xavs_nal_t *nals, int nnal)
 {
     XavsContext *x4 = ctx->priv_data;
-    uint8_t *p = buf;
-    int i, s;
+    uint8_t *p;
+    int i, s, ret, size = x4->sei_size + FF_MIN_BUFFER_SIZE;
+
+    if (!nnal)
+        return 0;
+
+    for (i = 0; i < nnal; i++)
+        size += nals[i].i_payload;
+
+    if ((ret = ff_alloc_packet(pkt, size)) < 0) {
+        av_log(ctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", size);
+        return ret;
+    }
+    p = pkt->data;
 
     /* Write the SEI as part of the first frame. */
     if (x4->sei_size > 0 && nnal > 0) {
@@ -92,17 +106,17 @@ static int encode_nals(AVCodecContext *ctx, uint8_t *buf,
             return -1;
         p += s;
     }
+    pkt->size = p - pkt->data;
 
-    return p - buf;
+    return 1;
 }
 
-static int XAVS_frame(AVCodecContext *ctx, uint8_t *buf,
-                      int bufsize, void *data)
+static int XAVS_frame(AVCodecContext *ctx, AVPacket *pkt,
+                      const AVFrame *frame, int *got_packet)
 {
     XavsContext *x4 = ctx->priv_data;
-    AVFrame *frame = data;
     xavs_nal_t *nal;
-    int nnal, i;
+    int nnal, i, ret;
     xavs_picture_t pic_out;
 
     x4->pic.img.i_csp   = XAVS_CSP_I420;
@@ -116,29 +130,44 @@ static int XAVS_frame(AVCodecContext *ctx, uint8_t *buf,
 
         x4->pic.i_pts  = frame->pts;
         x4->pic.i_type = XAVS_TYPE_AUTO;
+        x4->pts_buffer[ctx->frame_number % (ctx->max_b_frames+1)] = frame->pts;
     }
 
     if (xavs_encoder_encode(x4->enc, &nal, &nnal,
                             frame? &x4->pic: NULL, &pic_out) < 0)
     return -1;
 
-    bufsize = encode_nals(ctx, buf, bufsize, nal, nnal, 0);
+    ret = encode_nals(ctx, pkt, nal, nnal);
 
-    if (bufsize < 0)
+    if (ret < 0)
         return -1;
 
-    if (!bufsize && !frame && !(x4->end_of_stream)){
-        buf[bufsize]   = 0x0;
-        buf[bufsize+1] = 0x0;
-        buf[bufsize+2] = 0x01;
-        buf[bufsize+3] = 0xb1;
-        bufsize += 4;
-        x4->end_of_stream = END_OF_STREAM;
-        return bufsize;
+    if (!ret) {
+        if (!frame && !(x4->end_of_stream)) {
+            if ((ret = ff_alloc_packet(pkt, 4)) < 0)
+                return ret;
+
+            pkt->data[0] = 0x0;
+            pkt->data[1] = 0x0;
+            pkt->data[2] = 0x01;
+            pkt->data[3] = 0xb1;
+            pkt->dts = 2*x4->pts_buffer[(x4->out_frame_count-1)%(ctx->max_b_frames+1)] -
+                       x4->pts_buffer[(x4->out_frame_count-2)%(ctx->max_b_frames+1)];
+            x4->end_of_stream = END_OF_STREAM;
+            *got_packet = 1;
+        }
+        return 0;
     }
-    /* FIXME: libxavs now provides DTS */
-    /* but AVFrame doesn't have a field for it. */
+
     x4->out_pic.pts = pic_out.i_pts;
+    pkt->pts = pic_out.i_pts;
+    if (ctx->has_b_frames) {
+        if (!x4->out_frame_count)
+            pkt->dts = pkt->pts - (x4->pts_buffer[1] - x4->pts_buffer[0]);
+        else
+            pkt->dts = x4->pts_buffer[(x4->out_frame_count-1)%(ctx->max_b_frames+1)];
+    } else
+        pkt->dts = pkt->pts;
 
     switch (pic_out.i_type) {
     case XAVS_TYPE_IDR:
@@ -156,11 +185,16 @@ static int XAVS_frame(AVCodecContext *ctx, uint8_t *buf,
 
     /* There is no IDR frame in AVS JiZhun */
     /* Sequence header is used as a flag */
-    x4->out_pic.key_frame = pic_out.i_type == XAVS_TYPE_I;
+    if (pic_out.i_type == XAVS_TYPE_I) {
+        x4->out_pic.key_frame = 1;
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    }
 
     x4->out_pic.quality   = (pic_out.i_qpplus1 - 1) * FF_QP2LAMBDA;
 
-    return bufsize;
+    x4->out_frame_count++;
+    *got_packet = ret;
+    return 0;
 }
 
 static av_cold int XAVS_close(AVCodecContext *avctx)
@@ -169,6 +203,7 @@ static av_cold int XAVS_close(AVCodecContext *avctx)
 
     av_freep(&avctx->extradata);
     av_free(x4->sei);
+    av_freep(&x4->pts_buffer);
 
     if (x4->enc)
         xavs_encoder_close(x4->enc);
@@ -317,6 +352,9 @@ static av_cold int XAVS_init(AVCodecContext *avctx)
     if (!x4->enc)
         return -1;
 
+    if (!(x4->pts_buffer = av_mallocz((avctx->max_b_frames+1) * sizeof(*x4->pts_buffer))))
+        return AVERROR(ENOMEM);
+
     avctx->coded_frame = &x4->out_pic;
     /* TAG: Do we have GLOBAL HEADER in AVS */
     /* We Have PPS and SPS in AVS */
@@ -384,7 +422,7 @@ AVCodec ff_libxavs_encoder = {
     .id             = CODEC_ID_CAVS,
     .priv_data_size = sizeof(XavsContext),
     .init           = XAVS_init,
-    .encode         = XAVS_frame,
+    .encode2        = XAVS_frame,
     .close          = XAVS_close,
     .capabilities   = CODEC_CAP_DELAY | CODEC_CAP_AUTO_THREADS,
     .pix_fmts       = (const enum PixelFormat[]) { PIX_FMT_YUV420P, PIX_FMT_NONE },

From 19a65b5be47944c607a9e979edb098924d95f2e4 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 22 Feb 2012 16:46:31 -0800
Subject: [PATCH 19/40] swscale: fix overflows in filterPos[] calculation for
 large sizes.

Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
CC: libav-stable@libav.org
---
 libswscale/utils.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index 6ae8af64e9..796adb60e1 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -244,7 +244,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
             xDstInSrc+= xInc;
         }
     } else {
-        int xDstInSrc;
+        int64_t xDstInSrc;
         int sizeFactor;
 
         if      (flags&SWS_BICUBIC)      sizeFactor=  4;
@@ -810,8 +810,8 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     if (!dstFilter) dstFilter= &dummyFilter;
     if (!srcFilter) srcFilter= &dummyFilter;
 
-    c->lumXInc= ((srcW<<16) + (dstW>>1))/dstW;
-    c->lumYInc= ((srcH<<16) + (dstH>>1))/dstH;
+    c->lumXInc= (((int64_t)srcW<<16) + (dstW>>1))/dstW;
+    c->lumYInc= (((int64_t)srcH<<16) + (dstH>>1))/dstH;
     c->dstFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[dstFormat]);
     c->srcFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[srcFormat]);
     c->vRounder= 4* 0x0001000100010001ULL;
@@ -897,8 +897,8 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     else
         c->canMMX2BeUsed=0;
 
-    c->chrXInc= ((c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
-    c->chrYInc= ((c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
+    c->chrXInc= (((int64_t)c->chrSrcW<<16) + (c->chrDstW>>1))/c->chrDstW;
+    c->chrYInc= (((int64_t)c->chrSrcH<<16) + (c->chrDstH>>1))/c->chrDstH;
 
     // match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
     // but only for the FAST_BILINEAR mode otherwise do correct scaling
@@ -913,8 +913,8 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         }
         //we don't use the x86 asm scaler if MMX is available
         else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
-            c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
-            c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
+            c->lumXInc = ((int64_t)(srcW-2)<<16)/(dstW-2) - 20;
+            c->chrXInc = ((int64_t)(c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
         }
     }
 

From 491865b57db5fbb3053c221fd6d94b0435cad105 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 22 Feb 2012 16:47:14 -0800
Subject: [PATCH 20/40] swscale: fix underflows in firstline calculations for
 extreme resizes.

Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
CC: libav-stable@libav.org
---
 libswscale/swscale.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index b231302216..bd909bf163 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -407,9 +407,9 @@ static int swScale(SwsContext *c, const uint8_t* src[],
             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
         };
 
-        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
-        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
-        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
+        const int firstLumSrcY= FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]); //First line needed as input
+        const int firstLumSrcY2= FFMAX(1 - vLumFilterSize, vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)]);
+        const int firstChrSrcY= FFMAX(1 - vChrFilterSize, vChrFilterPos[chrDstY]); //First line needed as input
 
         // Last line needed as input
         int lastLumSrcY  = FFMIN(c->srcH,    firstLumSrcY  + vLumFilterSize) - 1;

From 1d8c4af396b6ed84c84b5ebf0bf1163c4a7a3017 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 22 Feb 2012 16:48:38 -0800
Subject: [PATCH 21/40] swscale: take first/lastline over/underflows into
 account for MMX.

Fixes crashes for extremely large resize factors (several hundred-fold).

Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
CC: libav-stable@libav.org
---
 libswscale/x86/swscale_mmx.c | 38 ++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index 764472e95e..64d5f0fc9d 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -117,6 +117,44 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
         const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
         const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
         int i;
+
+        if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
+            const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
+            int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
+            for (i = 0; i < neg;            i++)
+                tmpY[i] = lumSrcPtr[neg];
+            for (     ; i < end;            i++)
+                tmpY[i] = lumSrcPtr[i];
+            for (     ; i < vLumFilterSize; i++)
+                tmpY[i] = tmpY[i-1];
+            lumSrcPtr = tmpY;
+
+            if (alpSrcPtr) {
+                const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
+                for (i = 0; i < neg;            i++)
+                    tmpA[i] = alpSrcPtr[neg];
+                for (     ; i < end;            i++)
+                    tmpA[i] = alpSrcPtr[i];
+                for (     ; i < vLumFilterSize; i++)
+                    tmpA[i] = tmpA[i - 1];
+                alpSrcPtr = tmpA;
+            }
+        }
+        if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
+            const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
+            int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
+            for (i = 0; i < neg;            i++) {
+                tmpU[i] = chrUSrcPtr[neg];
+            }
+            for (     ; i < end;            i++) {
+                tmpU[i] = chrUSrcPtr[i];
+            }
+            for (     ; i < vChrFilterSize; i++) {
+                tmpU[i] = tmpU[i - 1];
+            }
+            chrUSrcPtr = tmpU;
+        }
+
         if (flags & SWS_ACCURATE_RND) {
             int s= APCK_SIZE / 8;
             for (i=0; i<vLumFilterSize; i+=2) {

From 01cb62aba2503b4173f101154f9f840f04f9c7f8 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Wed, 22 Feb 2012 14:22:56 -0800
Subject: [PATCH 22/40] lcl: don't overwrite input memory.

If the PNG filter is enabled, a PNG-style filter runs over the input
buffer and writes its output back into that same buffer. Therefore, if
the frame is stored without zlib compression, copy it into a temporary
buffer first; otherwise the filter overwrites user-provided input data.
---
 libavcodec/lcldec.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/libavcodec/lcldec.c b/libavcodec/lcldec.c
index b66a3ce65b..a7f0bde23e 100644
--- a/libavcodec/lcldec.c
+++ b/libavcodec/lcldec.c
@@ -236,9 +236,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac
          * gives a file with ZLIB fourcc, but frame is really uncompressed.
          * To be sure that's true check also frame size */
         if (c->compression == COMP_ZLIB_NORMAL && c->imgtype == IMGTYPE_RGB24 &&
-            len == width * height * 3)
-            break;
-        if (c->flags & FLAG_MULTITHREAD) {
+            len == width * height * 3) {
+            if (c->flags & FLAG_PNGFILTER) {
+                memcpy(c->decomp_buf, encoded, len);
+                encoded = c->decomp_buf;
+            } else {
+                break;
+            }
+        } else if (c->flags & FLAG_MULTITHREAD) {
             int ret;
             mthread_inlen = AV_RL32(encoded);
             mthread_inlen = FFMIN(mthread_inlen, len - 8);

From b315042c8ce984bec431c5965120853a843cbfa5 Mon Sep 17 00:00:00 2001
From: Diego Biurrun <diego@biurrun.de>
Date: Thu, 23 Feb 2012 17:17:01 +0100
Subject: [PATCH 23/40] Remove libpostproc.

This library does not fit into Libav as a whole and its code is just a
maintenance burden.  Furthermore it is now available as an external project,
which completely obviates any reason to keep it around.

URL: http://git.videolan.org/?p=libpostproc.git
---
 .gitignore                                 |    1 -
 Changelog                                  |    1 +
 LICENSE                                    |    1 -
 Makefile                                   |    3 +-
 cmdutils.c                                 |    6 -
 configure                                  |    8 +-
 doc/APIchanges                             |    1 -
 libavutil/avutil.h                         |    1 -
 libpostproc/Makefile                       |    6 -
 libpostproc/libpostproc.v                  |    4 -
 libpostproc/postprocess.c                  | 1071 ------
 libpostproc/postprocess.h                  |  104 -
 libpostproc/postprocess_altivec_template.c | 1210 -------
 libpostproc/postprocess_internal.h         |  179 -
 libpostproc/postprocess_template.c         | 3634 --------------------
 15 files changed, 3 insertions(+), 6227 deletions(-)
 delete mode 100644 libpostproc/Makefile
 delete mode 100644 libpostproc/libpostproc.v
 delete mode 100644 libpostproc/postprocess.c
 delete mode 100644 libpostproc/postprocess.h
 delete mode 100644 libpostproc/postprocess_altivec_template.c
 delete mode 100644 libpostproc/postprocess_internal.h
 delete mode 100644 libpostproc/postprocess_template.c

diff --git a/.gitignore b/.gitignore
index 754051d79d..e08648ba03 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,7 +24,6 @@ libavfilter/libavfilter*
 libavformat/libavformat*
 libavutil/avconfig.h
 libavutil/libavutil*
-libpostproc/libpostproc*
 libswscale/libswscale*
 tests/audiogen
 tests/base64
diff --git a/Changelog b/Changelog
index 23d9f7eabb..65accca721 100644
--- a/Changelog
+++ b/Changelog
@@ -9,6 +9,7 @@ version <next>:
 - CDXL demuxer and decoder
 - Apple ProRes encoder
 - Sun Rasterfile Encoder
+- remove libpostproc
 
 
 version 0.8:
diff --git a/LICENSE b/LICENSE
index 293be1dcf6..97923b1380 100644
--- a/LICENSE
+++ b/LICENSE
@@ -13,7 +13,6 @@ configure to activate them. In this case, Libav's license changes to GPL v2+.
 
 Specifically, the GPL parts of Libav are
 
-- libpostproc
 - optional x86 optimizations in the files
   libavcodec/x86/idct_mmx.c
 - the X11 grabber in libavdevice/x11grab.c
diff --git a/Makefile b/Makefile
index 006e35cce6..ae9191cd10 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ $(foreach VAR,$(SILENT),$(eval override $(VAR) = @$($(VAR))))
 $(eval INSTALL = @$(call ECHO,INSTALL,$$(^:$(SRC_PATH)/%=%)); $(INSTALL))
 endif
 
-ALLFFLIBS = avcodec avdevice avfilter avformat avutil postproc swscale
+ALLFFLIBS = avcodec avdevice avfilter avformat avutil swscale
 
 IFLAGS     := -I. -I$(SRC_PATH)
 CPPFLAGS   := $(IFLAGS) $(CPPFLAGS)
@@ -72,7 +72,6 @@ FFLIBS-$(CONFIG_AVDEVICE) += avdevice
 FFLIBS-$(CONFIG_AVFILTER) += avfilter
 FFLIBS-$(CONFIG_AVFORMAT) += avformat
 FFLIBS-$(CONFIG_AVCODEC)  += avcodec
-FFLIBS-$(CONFIG_POSTPROC) += postproc
 FFLIBS-$(CONFIG_SWSCALE)  += swscale
 
 FFLIBS := avutil
diff --git a/cmdutils.c b/cmdutils.c
index 8ee2cddf68..985931cce6 100644
--- a/cmdutils.c
+++ b/cmdutils.c
@@ -33,9 +33,6 @@
 #include "libavfilter/avfilter.h"
 #include "libavdevice/avdevice.h"
 #include "libswscale/swscale.h"
-#if CONFIG_POSTPROC
-#include "libpostproc/postprocess.h"
-#endif
 #include "libavutil/avstring.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/parseutils.h"
@@ -496,9 +493,6 @@ static void print_all_libs_info(int flags, int level)
     PRINT_LIB_INFO(avdevice, AVDEVICE, flags, level);
     PRINT_LIB_INFO(avfilter, AVFILTER, flags, level);
     PRINT_LIB_INFO(swscale,  SWSCALE,  flags, level);
-#if CONFIG_POSTPROC
-    PRINT_LIB_INFO(postproc, POSTPROC, flags, level);
-#endif
 }
 
 void show_banner(void)
diff --git a/configure b/configure
index 5e62e22f36..cbeffbb6f6 100755
--- a/configure
+++ b/configure
@@ -88,7 +88,6 @@ Configuration options:
   --disable-avcodec        disable libavcodec build
   --disable-avformat       disable libavformat build
   --disable-swscale        disable libswscale build
-  --enable-postproc        enable libpostproc build (deprecated) [no]
   --disable-avfilter       disable video filter support [no]
   --disable-pthreads       disable pthreads [auto]
   --disable-w32threads     disable Win32 threads [auto]
@@ -974,7 +973,6 @@ CONFIG_LIST="
     nonfree
     openssl
     pic
-    postproc
     rdft
     rtpdec
     runtime_cpudetect
@@ -1532,7 +1530,6 @@ yadif_filter_deps="gpl"
 # libraries
 avdevice_deps="avcodec avformat"
 avformat_deps="avcodec"
-postproc_deps="gpl"
 
 # programs
 avconv_deps="avcodec avformat swscale"
@@ -3066,7 +3063,7 @@ enabled extra_warnings && check_cflags -Winline
 
 # add some linker flags
 check_ldflags -Wl,--warn-common
-check_ldflags -Wl,-rpath-link=libpostproc:libswscale:libavfilter:libavdevice:libavformat:libavcodec:libavutil
+check_ldflags -Wl,-rpath-link=libswscale:libavfilter:libavdevice:libavformat:libavcodec:libavutil
 test_ldflags -Wl,-Bsymbolic && append SHFLAGS -Wl,-Bsymbolic
 
 enabled xmm_clobber_test &&                             \
@@ -3222,7 +3219,6 @@ echo "optimize for size         ${small-no}"
 echo "optimizations             ${optimizations-no}"
 echo "static                    ${static-no}"
 echo "shared                    ${shared-no}"
-echo "postprocessing support    ${postproc-no}"
 echo "new filter support        ${avfilter-no}"
 echo "network support           ${network-no}"
 echo "threading support         ${thread_type-no}"
@@ -3374,7 +3370,6 @@ get_version LIBAVDEVICE libavdevice/avdevice.h
 get_version LIBAVFILTER libavfilter/version.h
 get_version LIBAVFORMAT libavformat/version.h
 get_version LIBAVUTIL   libavutil/avutil.h
-get_version LIBPOSTPROC libpostproc/postprocess.h
 get_version LIBSWSCALE  libswscale/swscale.h
 
 cat > $TMPH <<EOF
@@ -3493,5 +3488,4 @@ pkgconfig_generate libavcodec "Libav codec library" "$LIBAVCODEC_VERSION" "$extr
 pkgconfig_generate libavformat "Libav container format library" "$LIBAVFORMAT_VERSION" "$extralibs" "libavcodec = $LIBAVCODEC_VERSION"
 pkgconfig_generate libavdevice "Libav device handling library" "$LIBAVDEVICE_VERSION" "$extralibs" "libavformat = $LIBAVFORMAT_VERSION"
 pkgconfig_generate libavfilter "Libav video filtering library" "$LIBAVFILTER_VERSION" "$extralibs"
-pkgconfig_generate libpostproc "Libav postprocessing library" "$LIBPOSTPROC_VERSION" "" "libavutil = $LIBAVUTIL_VERSION"
 pkgconfig_generate libswscale "Libav image rescaling library" "$LIBSWSCALE_VERSION" "$LIBM" "libavutil = $LIBAVUTIL_VERSION"
diff --git a/doc/APIchanges b/doc/APIchanges
index 12fa80396a..78e2bfc9a3 100644
--- a/doc/APIchanges
+++ b/doc/APIchanges
@@ -6,7 +6,6 @@ libavcodec:  2012-01-27
 libavdevice: 2011-04-18
 libavfilter: 2011-04-18
 libavformat: 2012-01-27
-libpostproc: 2011-04-18 (deprecated, to be removed later)
 libswscale:  2011-06-20
 libavutil:   2011-04-18
 
diff --git a/libavutil/avutil.h b/libavutil/avutil.h
index b5f9a24f14..8d474e4546 100644
--- a/libavutil/avutil.h
+++ b/libavutil/avutil.h
@@ -39,7 +39,6 @@
  * @li @ref libavf "libavformat" I/O and muxing/demuxing library
  * @li @ref lavd "libavdevice" special devices muxing/demuxing library
  * @li @ref lavu "libavutil" common utility library
- * @li @subpage libpostproc post processing library
  * @li @subpage libswscale  color conversion and scaling library
  *
  */
diff --git a/libpostproc/Makefile b/libpostproc/Makefile
deleted file mode 100644
index 86c9b5f47d..0000000000
--- a/libpostproc/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-NAME = postproc
-FFLIBS = avutil
-
-HEADERS = postprocess.h
-
-OBJS = postprocess.o
diff --git a/libpostproc/libpostproc.v b/libpostproc/libpostproc.v
deleted file mode 100644
index e65d76f4f6..0000000000
--- a/libpostproc/libpostproc.v
+++ /dev/null
@@ -1,4 +0,0 @@
-LIBPOSTPROC_$MAJOR {
-        global: postproc_*; pp_*;
-        local: *;
-};
diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
deleted file mode 100644
index c363fa7965..0000000000
--- a/libpostproc/postprocess.c
+++ /dev/null
@@ -1,1071 +0,0 @@
-/*
- * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
- *
- * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * postprocessing.
- */
-
-/*
-                        C       MMX     MMX2    3DNow   AltiVec
-isVertDC                Ec      Ec                      Ec
-isVertMinMaxOk          Ec      Ec                      Ec
-doVertLowPass           E               e       e       Ec
-doVertDefFilter         Ec      Ec      e       e       Ec
-isHorizDC               Ec      Ec                      Ec
-isHorizMinMaxOk         a       E                       Ec
-doHorizLowPass          E               e       e       Ec
-doHorizDefFilter        Ec      Ec      e       e       Ec
-do_a_deblock            Ec      E       Ec      E
-deRing                  E               e       e*      Ecp
-Vertical RKAlgo1        E               a       a
-Horizontal RKAlgo1                      a       a
-Vertical X1#            a               E       E
-Horizontal X1#          a               E       E
-LinIpolDeinterlace      e               E       E*
-CubicIpolDeinterlace    a               e       e*
-LinBlendDeinterlace     e               E       E*
-MedianDeinterlace#      E       Ec      Ec
-TempDeNoiser#           E               e       e       Ec
-
-* I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
-# more or less selfinvented filters so the exactness is not too meaningful
-E = Exact implementation
-e = almost exact implementation (slightly different rounding,...)
-a = alternative / approximate impl
-c = checked against the other implementations (-vo md5)
-p = partially optimized, still some work to do
-*/
-
-/*
-TODO:
-reduce the time wasted on the mem transfer
-unroll stuff if instructions depend too much on the prior one
-move YScale thing to the end instead of fixing QP
-write a faster and higher quality deblocking filter :)
-make the mainloop more flexible (variable number of blocks at once
-        (the if/else stuff per block is slowing things down)
-compare the quality & speed of all filters
-split this huge file
-optimize c versions
-try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
-...
-*/
-
-//Changelog: use git log
-
-#include "config.h"
-#include "libavutil/avutil.h"
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-//#undef HAVE_MMX2
-//#define HAVE_AMD3DNOW
-//#undef HAVE_MMX
-//#undef ARCH_X86
-//#define DEBUG_BRIGHTNESS
-#include "postprocess.h"
-#include "postprocess_internal.h"
-#include "libavutil/avstring.h"
-
-unsigned postproc_version(void)
-{
-    return LIBPOSTPROC_VERSION_INT;
-}
-
-const char *postproc_configuration(void)
-{
-    return LIBAV_CONFIGURATION;
-}
-
-const char *postproc_license(void)
-{
-#define LICENSE_PREFIX "libpostproc license: "
-    return LICENSE_PREFIX LIBAV_LICENSE + sizeof(LICENSE_PREFIX) - 1;
-}
-
-#if HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
-#define GET_MODE_BUFFER_SIZE 500
-#define OPTIONS_ARRAY_SIZE 10
-#define BLOCK_SIZE 8
-#define TEMP_STRIDE 8
-//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
-
-#if ARCH_X86
-DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
-DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
-DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
-DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
-DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
-DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
-DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
-DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
-#endif
-
-DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
-
-
-static struct PPFilter filters[]=
-{
-    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
-    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
-/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
-    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
-    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
-    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
-    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
-    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
-    {"dr", "dering",                1, 5, 6, DERING},
-    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
-    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
-    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
-    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
-    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
-    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
-    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
-    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
-    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
-    {NULL, NULL,0,0,0,0} //End Marker
-};
-
-static const char *replaceTable[]=
-{
-    "default",      "hb:a,vb:a,dr:a",
-    "de",           "hb:a,vb:a,dr:a",
-    "fast",         "h1:a,v1:a,dr:a",
-    "fa",           "h1:a,v1:a,dr:a",
-    "ac",           "ha:a:128:7,va:a,dr:a",
-    NULL //End Marker
-};
-
-
-#if ARCH_X86
-static inline void prefetchnta(void *p)
-{
-    __asm__ volatile(   "prefetchnta (%0)\n\t"
-        : : "r" (p)
-    );
-}
-
-static inline void prefetcht0(void *p)
-{
-    __asm__ volatile(   "prefetcht0 (%0)\n\t"
-        : : "r" (p)
-    );
-}
-
-static inline void prefetcht1(void *p)
-{
-    __asm__ volatile(   "prefetcht1 (%0)\n\t"
-        : : "r" (p)
-    );
-}
-
-static inline void prefetcht2(void *p)
-{
-    __asm__ volatile(   "prefetcht2 (%0)\n\t"
-        : : "r" (p)
-    );
-}
-#endif
-
-/* The horizontal functions exist only in C because the MMX
- * code is faster with vertical filters and transposing. */
-
-/**
- * Check if the given 8x8 Block is mostly "flat"
- */
-static inline int isHorizDC_C(uint8_t src[], int stride, PPContext *c)
-{
-    int numEq= 0;
-    int y;
-    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-    const int dcThreshold= dcOffset*2 + 1;
-
-    for(y=0; y<BLOCK_SIZE; y++){
-        if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
-        src+= stride;
-    }
-    return numEq > c->ppMode.flatnessThreshold;
-}
-
-/**
- * Check if the middle 8x8 Block in the given 8x16 block is flat
- */
-static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c)
-{
-    int numEq= 0;
-    int y;
-    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-    const int dcThreshold= dcOffset*2 + 1;
-
-    src+= stride*4; // src points to begin of the 8x8 Block
-    for(y=0; y<BLOCK_SIZE-1; y++){
-        if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
-        src+= stride;
-    }
-    return numEq > c->ppMode.flatnessThreshold;
-}
-
-static inline int isHorizMinMaxOk_C(uint8_t src[], int stride, int QP)
-{
-    int i;
-    for(i=0; i<2; i++){
-        if((unsigned)(src[0] - src[5] + 2*QP) > 4*QP) return 0;
-        src += stride;
-        if((unsigned)(src[2] - src[7] + 2*QP) > 4*QP) return 0;
-        src += stride;
-        if((unsigned)(src[4] - src[1] + 2*QP) > 4*QP) return 0;
-        src += stride;
-        if((unsigned)(src[6] - src[3] + 2*QP) > 4*QP) return 0;
-        src += stride;
-    }
-    return 1;
-}
-
-static inline int isVertMinMaxOk_C(uint8_t src[], int stride, int QP)
-{
-    int x;
-    src+= stride*4;
-    for(x=0; x<BLOCK_SIZE; x+=4){
-        if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
-        if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
-        if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
-        if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
-    }
-    return 1;
-}
-
-static inline int horizClassify_C(uint8_t src[], int stride, PPContext *c)
-{
-    if( isHorizDC_C(src, stride, c) ){
-        if( isHorizMinMaxOk_C(src, stride, c->QP) )
-            return 1;
-        else
-            return 0;
-    }else{
-        return 2;
-    }
-}
-
-static inline int vertClassify_C(uint8_t src[], int stride, PPContext *c)
-{
-    if( isVertDC_C(src, stride, c) ){
-        if( isVertMinMaxOk_C(src, stride, c->QP) )
-            return 1;
-        else
-            return 0;
-    }else{
-        return 2;
-    }
-}
-
-static inline void doHorizDefFilter_C(uint8_t dst[], int stride, PPContext *c)
-{
-    int y;
-    for(y=0; y<BLOCK_SIZE; y++){
-        const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
-
-        if(FFABS(middleEnergy) < 8*c->QP){
-            const int q=(dst[3] - dst[4])/2;
-            const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
-            const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
-
-            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
-            d= FFMAX(d, 0);
-
-            d= (5*d + 32) >> 6;
-            d*= FFSIGN(-middleEnergy);
-
-            if(q>0)
-            {
-                d= d<0 ? 0 : d;
-                d= d>q ? q : d;
-            }
-            else
-            {
-                d= d>0 ? 0 : d;
-                d= d<q ? q : d;
-            }
-
-            dst[3]-= d;
-            dst[4]+= d;
-        }
-        dst+= stride;
-    }
-}
-
-/**
- * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
- * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
- */
-static inline void doHorizLowPass_C(uint8_t dst[], int stride, PPContext *c)
-{
-    int y;
-    for(y=0; y<BLOCK_SIZE; y++){
-        const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
-        const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
-
-        int sums[10];
-        sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
-        sums[1] = sums[0] - first  + dst[3];
-        sums[2] = sums[1] - first  + dst[4];
-        sums[3] = sums[2] - first  + dst[5];
-        sums[4] = sums[3] - first  + dst[6];
-        sums[5] = sums[4] - dst[0] + dst[7];
-        sums[6] = sums[5] - dst[1] + last;
-        sums[7] = sums[6] - dst[2] + last;
-        sums[8] = sums[7] - dst[3] + last;
-        sums[9] = sums[8] - dst[4] + last;
-
-        dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
-        dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
-        dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
-        dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
-        dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
-        dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
-        dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
-        dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
-
-        dst+= stride;
-    }
-}
-
-/**
- * Experimental Filter 1 (Horizontal)
- * will not damage linear gradients
- * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
- * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
- * MMX2 version does correct clipping C version does not
- * not identical with the vertical one
- */
-static inline void horizX1Filter(uint8_t *src, int stride, int QP)
-{
-    int y;
-    static uint64_t *lut= NULL;
-    if(lut==NULL)
-    {
-        int i;
-        lut = av_malloc(256*8);
-        for(i=0; i<256; i++)
-        {
-            int v= i < 128 ? 2*i : 2*(i-256);
-/*
-//Simulate 112242211 9-Tap filter
-            uint64_t a= (v/16)  & 0xFF;
-            uint64_t b= (v/8)   & 0xFF;
-            uint64_t c= (v/4)   & 0xFF;
-            uint64_t d= (3*v/8) & 0xFF;
-*/
-//Simulate piecewise linear interpolation
-            uint64_t a= (v/16)   & 0xFF;
-            uint64_t b= (v*3/16) & 0xFF;
-            uint64_t c= (v*5/16) & 0xFF;
-            uint64_t d= (7*v/16) & 0xFF;
-            uint64_t A= (0x100 - a)&0xFF;
-            uint64_t B= (0x100 - b)&0xFF;
-            uint64_t C= (0x100 - c)&0xFF;
-            uint64_t D= (0x100 - c)&0xFF;
-
-            lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
-                       (D<<24) | (C<<16) | (B<<8)  | (A);
-            //lut[i] = (v<<32) | (v<<24);
-        }
-    }
-
-    for(y=0; y<BLOCK_SIZE; y++){
-        int a= src[1] - src[2];
-        int b= src[3] - src[4];
-        int c= src[5] - src[6];
-
-        int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
-
-        if(d < QP){
-            int v = d * FFSIGN(-b);
-
-            src[1] +=v/8;
-            src[2] +=v/4;
-            src[3] +=3*v/8;
-            src[4] -=3*v/8;
-            src[5] -=v/4;
-            src[6] -=v/8;
-        }
-        src+=stride;
-    }
-}
-
-/**
- * accurate deblock filter
- */
-static av_always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPContext *c){
-    int y;
-    const int QP= c->QP;
-    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-    const int dcThreshold= dcOffset*2 + 1;
-//START_TIMER
-    src+= step*4; // src points to begin of the 8x8 Block
-    for(y=0; y<8; y++){
-        int numEq= 0;
-
-        if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
-        if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
-        if(numEq > c->ppMode.flatnessThreshold){
-            int min, max, x;
-
-            if(src[0] > src[step]){
-                max= src[0];
-                min= src[step];
-            }else{
-                max= src[step];
-                min= src[0];
-            }
-            for(x=2; x<8; x+=2){
-                if(src[x*step] > src[(x+1)*step]){
-                        if(src[x    *step] > max) max= src[ x   *step];
-                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
-                }else{
-                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
-                        if(src[ x   *step] < min) min= src[ x   *step];
-                }
-            }
-            if(max-min < 2*QP){
-                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
-                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
-
-                int sums[10];
-                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
-                sums[1] = sums[0] - first       + src[3*step];
-                sums[2] = sums[1] - first       + src[4*step];
-                sums[3] = sums[2] - first       + src[5*step];
-                sums[4] = sums[3] - first       + src[6*step];
-                sums[5] = sums[4] - src[0*step] + src[7*step];
-                sums[6] = sums[5] - src[1*step] + last;
-                sums[7] = sums[6] - src[2*step] + last;
-                sums[8] = sums[7] - src[3*step] + last;
-                sums[9] = sums[8] - src[4*step] + last;
-
-                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
-                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
-                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
-                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
-                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
-                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
-                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
-                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
-            }
-        }else{
-            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
-
-            if(FFABS(middleEnergy) < 8*QP){
-                const int q=(src[3*step] - src[4*step])/2;
-                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
-                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
-
-                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
-                d= FFMAX(d, 0);
-
-                d= (5*d + 32) >> 6;
-                d*= FFSIGN(-middleEnergy);
-
-                if(q>0){
-                    d= d<0 ? 0 : d;
-                    d= d>q ? q : d;
-                }else{
-                    d= d>0 ? 0 : d;
-                    d= d<q ? q : d;
-                }
-
-                src[3*step]-= d;
-                src[4*step]+= d;
-            }
-        }
-
-        src += stride;
-    }
-/*if(step==16){
-    STOP_TIMER("step16")
-}else{
-    STOP_TIMER("stepX")
-}*/
-}
-
-//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
-//Plain C versions
-#if !(HAVE_MMX || HAVE_ALTIVEC) || CONFIG_RUNTIME_CPUDETECT
-#define COMPILE_C
-#endif
-
-#if HAVE_ALTIVEC
-#define COMPILE_ALTIVEC
-#endif //HAVE_ALTIVEC
-
-#if ARCH_X86
-
-#if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
-#define COMPILE_MMX
-#endif
-
-#if HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT
-#define COMPILE_MMX2
-#endif
-
-#if (HAVE_AMD3DNOW && !HAVE_MMX2) || CONFIG_RUNTIME_CPUDETECT
-#define COMPILE_3DNOW
-#endif
-#endif /* ARCH_X86 */
-
-#undef HAVE_MMX
-#define HAVE_MMX 0
-#undef HAVE_MMX2
-#define HAVE_MMX2 0
-#undef HAVE_AMD3DNOW
-#define HAVE_AMD3DNOW 0
-#undef HAVE_ALTIVEC
-#define HAVE_ALTIVEC 0
-
-#ifdef COMPILE_C
-#define RENAME(a) a ## _C
-#include "postprocess_template.c"
-#endif
-
-#ifdef COMPILE_ALTIVEC
-#undef RENAME
-#undef HAVE_ALTIVEC
-#define HAVE_ALTIVEC 1
-#define RENAME(a) a ## _altivec
-#include "postprocess_altivec_template.c"
-#include "postprocess_template.c"
-#endif
-
-//MMX versions
-#ifdef COMPILE_MMX
-#undef RENAME
-#undef HAVE_MMX
-#define HAVE_MMX 1
-#define RENAME(a) a ## _MMX
-#include "postprocess_template.c"
-#endif
-
-//MMX2 versions
-#ifdef COMPILE_MMX2
-#undef RENAME
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#define HAVE_MMX 1
-#define HAVE_MMX2 1
-#define RENAME(a) a ## _MMX2
-#include "postprocess_template.c"
-#endif
-
-//3DNOW versions
-#ifdef COMPILE_3DNOW
-#undef RENAME
-#undef HAVE_MMX
-#undef HAVE_MMX2
-#undef HAVE_AMD3DNOW
-#define HAVE_MMX 1
-#define HAVE_MMX2 0
-#define HAVE_AMD3DNOW 1
-#define RENAME(a) a ## _3DNow
-#include "postprocess_template.c"
-#endif
-
-// minor note: the HAVE_xyz is messed up after that line so do not use it.
-
-static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
-        const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
-{
-    PPContext *c= (PPContext *)vc;
-    PPMode *ppMode= (PPMode *)vm;
-    c->ppMode= *ppMode; //FIXME
-
-    // Using ifs here as they are faster than function pointers although the
-    // difference would not be measurable here but it is much better because
-    // someone might exchange the CPU whithout restarting MPlayer ;)
-#if CONFIG_RUNTIME_CPUDETECT
-#if ARCH_X86
-    // ordered per speed fastest first
-    if(c->cpuCaps & PP_CPU_CAPS_MMX2)
-        postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-    else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
-        postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-    else if(c->cpuCaps & PP_CPU_CAPS_MMX)
-        postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-    else
-        postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-#else
-#if HAVE_ALTIVEC
-    if(c->cpuCaps & PP_CPU_CAPS_ALTIVEC)
-            postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-    else
-#endif
-            postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-#endif
-#else /* CONFIG_RUNTIME_CPUDETECT */
-#if   HAVE_MMX2
-            postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-#elif HAVE_AMD3DNOW
-            postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-#elif HAVE_MMX
-            postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-#elif HAVE_ALTIVEC
-            postProcess_altivec(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-#else
-            postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
-#endif
-#endif /* !CONFIG_RUNTIME_CPUDETECT */
-}
-
-//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
-//        QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
-
-/* -pp Command line Help
-*/
-const char pp_help[] =
-"Available postprocessing filters:\n"
-"Filters                        Options\n"
-"short  long name       short   long option     Description\n"
-"*      *               a       autoq           CPU power dependent enabler\n"
-"                       c       chrom           chrominance filtering enabled\n"
-"                       y       nochrom         chrominance filtering disabled\n"
-"                       n       noluma          luma filtering disabled\n"
-"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
-"       1. difference factor: default=32, higher -> more deblocking\n"
-"       2. flatness threshold: default=39, lower -> more deblocking\n"
-"                       the h & v deblocking filters share these\n"
-"                       so you can't set different thresholds for h / v\n"
-"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
-"ha     hadeblock       (2 threshold)           horizontal deblocking filter\n"
-"va     vadeblock       (2 threshold)           vertical deblocking filter\n"
-"h1     x1hdeblock                              experimental h deblock filter 1\n"
-"v1     x1vdeblock                              experimental v deblock filter 1\n"
-"dr     dering                                  deringing filter\n"
-"al     autolevels                              automatic brightness / contrast\n"
-"                       f        fullyrange     stretch luminance to (0..255)\n"
-"lb     linblenddeint                           linear blend deinterlacer\n"
-"li     linipoldeint                            linear interpolating deinterlace\n"
-"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
-"md     mediandeint                             median deinterlacer\n"
-"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
-"l5     lowpass5                                FIR lowpass deinterlacer\n"
-"de     default                                 hb:a,vb:a,dr:a\n"
-"fa     fast                                    h1:a,v1:a,dr:a\n"
-"ac                                             ha:a:128:7,va:a,dr:a\n"
-"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
-"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
-"fq     forceQuant      <quantizer>             force quantizer\n"
-"Usage:\n"
-"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
-"long form example:\n"
-"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
-"short form example:\n"
-"vb:a/hb:a/lb                                   de,-vb\n"
-"more examples:\n"
-"tn:64:128:256\n"
-"\n"
-;
-
-pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
-{
-    char temp[GET_MODE_BUFFER_SIZE];
-    char *p= temp;
-    static const char filterDelimiters[] = ",/";
-    static const char optionDelimiters[] = ":";
-    struct PPMode *ppMode;
-    char *filterToken;
-
-    ppMode= av_malloc(sizeof(PPMode));
-
-    ppMode->lumMode= 0;
-    ppMode->chromMode= 0;
-    ppMode->maxTmpNoise[0]= 700;
-    ppMode->maxTmpNoise[1]= 1500;
-    ppMode->maxTmpNoise[2]= 3000;
-    ppMode->maxAllowedY= 234;
-    ppMode->minAllowedY= 16;
-    ppMode->baseDcDiff= 256/8;
-    ppMode->flatnessThreshold= 56-16-1;
-    ppMode->maxClippedThreshold= 0.01;
-    ppMode->error=0;
-
-    memset(temp, 0, GET_MODE_BUFFER_SIZE);
-    av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
-
-    av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
-
-    for(;;){
-        char *filterName;
-        int q= 1000000; //PP_QUALITY_MAX;
-        int chrom=-1;
-        int luma=-1;
-        char *option;
-        char *options[OPTIONS_ARRAY_SIZE];
-        int i;
-        int filterNameOk=0;
-        int numOfUnknownOptions=0;
-        int enable=1; //does the user want us to enabled or disabled the filter
-
-        filterToken= strtok(p, filterDelimiters);
-        if(filterToken == NULL) break;
-        p+= strlen(filterToken) + 1; // p points to next filterToken
-        filterName= strtok(filterToken, optionDelimiters);
-        av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
-
-        if(*filterName == '-'){
-            enable=0;
-            filterName++;
-        }
-
-        for(;;){ //for all options
-            option= strtok(NULL, optionDelimiters);
-            if(option == NULL) break;
-
-            av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
-            if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
-            else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
-            else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
-            else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
-            else{
-                options[numOfUnknownOptions] = option;
-                numOfUnknownOptions++;
-            }
-            if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
-        }
-        options[numOfUnknownOptions] = NULL;
-
-        /* replace stuff from the replace Table */
-        for(i=0; replaceTable[2*i]!=NULL; i++){
-            if(!strcmp(replaceTable[2*i], filterName)){
-                int newlen= strlen(replaceTable[2*i + 1]);
-                int plen;
-                int spaceLeft;
-
-                if(p==NULL) p= temp, *p=0;      //last filter
-                else p--, *p=',';               //not last filter
-
-                plen= strlen(p);
-                spaceLeft= p - temp + plen;
-                if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
-                    ppMode->error++;
-                    break;
-                }
-                memmove(p + newlen, p, plen+1);
-                memcpy(p, replaceTable[2*i + 1], newlen);
-                filterNameOk=1;
-            }
-        }
-
-        for(i=0; filters[i].shortName!=NULL; i++){
-            if(   !strcmp(filters[i].longName, filterName)
-               || !strcmp(filters[i].shortName, filterName)){
-                ppMode->lumMode &= ~filters[i].mask;
-                ppMode->chromMode &= ~filters[i].mask;
-
-                filterNameOk=1;
-                if(!enable) break; // user wants to disable it
-
-                if(q >= filters[i].minLumQuality && luma)
-                    ppMode->lumMode|= filters[i].mask;
-                if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
-                    if(q >= filters[i].minChromQuality)
-                            ppMode->chromMode|= filters[i].mask;
-
-                if(filters[i].mask == LEVEL_FIX){
-                    int o;
-                    ppMode->minAllowedY= 16;
-                    ppMode->maxAllowedY= 234;
-                    for(o=0; options[o]!=NULL; o++){
-                        if(  !strcmp(options[o],"fullyrange")
-                           ||!strcmp(options[o],"f")){
-                            ppMode->minAllowedY= 0;
-                            ppMode->maxAllowedY= 255;
-                            numOfUnknownOptions--;
-                        }
-                    }
-                }
-                else if(filters[i].mask == TEMP_NOISE_FILTER)
-                {
-                    int o;
-                    int numOfNoises=0;
-
-                    for(o=0; options[o]!=NULL; o++){
-                        char *tail;
-                        ppMode->maxTmpNoise[numOfNoises]=
-                            strtol(options[o], &tail, 0);
-                        if(tail!=options[o]){
-                            numOfNoises++;
-                            numOfUnknownOptions--;
-                            if(numOfNoises >= 3) break;
-                        }
-                    }
-                }
-                else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
-                     || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
-                    int o;
-
-                    for(o=0; options[o]!=NULL && o<2; o++){
-                        char *tail;
-                        int val= strtol(options[o], &tail, 0);
-                        if(tail==options[o]) break;
-
-                        numOfUnknownOptions--;
-                        if(o==0) ppMode->baseDcDiff= val;
-                        else ppMode->flatnessThreshold= val;
-                    }
-                }
-                else if(filters[i].mask == FORCE_QUANT){
-                    int o;
-                    ppMode->forcedQuant= 15;
-
-                    for(o=0; options[o]!=NULL && o<1; o++){
-                        char *tail;
-                        int val= strtol(options[o], &tail, 0);
-                        if(tail==options[o]) break;
-
-                        numOfUnknownOptions--;
-                        ppMode->forcedQuant= val;
-                    }
-                }
-            }
-        }
-        if(!filterNameOk) ppMode->error++;
-        ppMode->error += numOfUnknownOptions;
-    }
-
-    av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
-    if(ppMode->error){
-        av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
-        av_free(ppMode);
-        return NULL;
-    }
-    return ppMode;
-}
-
-void pp_free_mode(pp_mode *mode){
-    av_free(mode);
-}
-
-static void reallocAlign(void **p, int alignment, int size){
-    av_free(*p);
-    *p= av_mallocz(size);
-}
-
-static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
-    int mbWidth = (width+15)>>4;
-    int mbHeight= (height+15)>>4;
-    int i;
-
-    c->stride= stride;
-    c->qpStride= qpStride;
-
-    reallocAlign((void **)&c->tempDst, 8, stride*24);
-    reallocAlign((void **)&c->tempSrc, 8, stride*24);
-    reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
-    reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
-    for(i=0; i<256; i++)
-            c->yHistogram[i]= width*height/64*15/256;
-
-    for(i=0; i<3; i++){
-        //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
-        reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
-        reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
-    }
-
-    reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
-    reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
-    reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
-    reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
-}
-
-static const char * context_to_name(void * ptr) {
-    return "postproc";
-}
-
-static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
-
-pp_context *pp_get_context(int width, int height, int cpuCaps){
-    PPContext *c= av_malloc(sizeof(PPContext));
-    int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
-    int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
-
-    memset(c, 0, sizeof(PPContext));
-    c->av_class = &av_codec_context_class;
-    c->cpuCaps= cpuCaps;
-    if(cpuCaps&PP_FORMAT){
-        c->hChromaSubSample= cpuCaps&0x3;
-        c->vChromaSubSample= (cpuCaps>>4)&0x3;
-    }else{
-        c->hChromaSubSample= 1;
-        c->vChromaSubSample= 1;
-    }
-
-    reallocBuffers(c, width, height, stride, qpStride);
-
-    c->frameNum=-1;
-
-    return c;
-}
-
-void pp_free_context(void *vc){
-    PPContext *c = (PPContext*)vc;
-    int i;
-
-    for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
-    for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
-
-    av_free(c->tempBlocks);
-    av_free(c->yHistogram);
-    av_free(c->tempDst);
-    av_free(c->tempSrc);
-    av_free(c->deintTemp);
-    av_free(c->stdQPTable);
-    av_free(c->nonBQPTable);
-    av_free(c->forcedQPTable);
-
-    memset(c, 0, sizeof(PPContext));
-
-    av_free(c);
-}
-
-void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
-                     uint8_t * dst[3], const int dstStride[3],
-                     int width, int height,
-                     const QP_STORE_T *QP_store,  int QPStride,
-                     pp_mode *vm,  void *vc, int pict_type)
-{
-    int mbWidth = (width+15)>>4;
-    int mbHeight= (height+15)>>4;
-    PPMode *mode = (PPMode*)vm;
-    PPContext *c = (PPContext*)vc;
-    int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
-    int absQPStride = FFABS(QPStride);
-
-    // c->stride and c->QPStride are always positive
-    if(c->stride < minStride || c->qpStride < absQPStride)
-        reallocBuffers(c, width, height,
-                       FFMAX(minStride, c->stride),
-                       FFMAX(c->qpStride, absQPStride));
-
-    if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)){
-        int i;
-        QP_store= c->forcedQPTable;
-        absQPStride = QPStride = 0;
-        if(mode->lumMode & FORCE_QUANT)
-            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
-        else
-            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
-    }
-
-    if(pict_type & PP_PICT_TYPE_QP2){
-        int i;
-        const int count= mbHeight * absQPStride;
-        for(i=0; i<(count>>2); i++){
-            ((uint32_t*)c->stdQPTable)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
-        }
-        for(i<<=2; i<count; i++){
-            c->stdQPTable[i] = QP_store[i]>>1;
-        }
-        QP_store= c->stdQPTable;
-        QPStride= absQPStride;
-    }
-
-    if(0){
-        int x,y;
-        for(y=0; y<mbHeight; y++){
-            for(x=0; x<mbWidth; x++){
-                av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
-            }
-            av_log(c, AV_LOG_INFO, "\n");
-        }
-        av_log(c, AV_LOG_INFO, "\n");
-    }
-
-    if((pict_type&7)!=3){
-        if (QPStride >= 0){
-            int i;
-            const int count= mbHeight * QPStride;
-            for(i=0; i<(count>>2); i++){
-                ((uint32_t*)c->nonBQPTable)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
-            }
-            for(i<<=2; i<count; i++){
-                c->nonBQPTable[i] = QP_store[i] & 0x3F;
-            }
-        } else {
-            int i,j;
-            for(i=0; i<mbHeight; i++) {
-                for(j=0; j<absQPStride; j++) {
-                    c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
-                }
-            }
-        }
-    }
-
-    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
-           mode->lumMode, mode->chromMode);
-
-    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
-                width, height, QP_store, QPStride, 0, mode, c);
-
-    width  = (width )>>c->hChromaSubSample;
-    height = (height)>>c->vChromaSubSample;
-
-    if(mode->chromMode){
-        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
-                    width, height, QP_store, QPStride, 1, mode, c);
-        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
-                    width, height, QP_store, QPStride, 2, mode, c);
-    }
-    else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
-        linecpy(dst[1], src[1], height, srcStride[1]);
-        linecpy(dst[2], src[2], height, srcStride[2]);
-    }else{
-        int y;
-        for(y=0; y<height; y++){
-            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
-            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
-        }
-    }
-}
diff --git a/libpostproc/postprocess.h b/libpostproc/postprocess.h
deleted file mode 100644
index 4cc6925924..0000000000
--- a/libpostproc/postprocess.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef POSTPROC_POSTPROCESS_H
-#define POSTPROC_POSTPROCESS_H
-
-/**
- * @file
- * @brief
- *     external postprocessing API
- */
-
-#include "libavutil/avutil.h"
-
-#define LIBPOSTPROC_VERSION_MAJOR 52
-#define LIBPOSTPROC_VERSION_MINOR  0
-#define LIBPOSTPROC_VERSION_MICRO  0
-
-#define LIBPOSTPROC_VERSION_INT AV_VERSION_INT(LIBPOSTPROC_VERSION_MAJOR, \
-                                               LIBPOSTPROC_VERSION_MINOR, \
-                                               LIBPOSTPROC_VERSION_MICRO)
-#define LIBPOSTPROC_VERSION     AV_VERSION(LIBPOSTPROC_VERSION_MAJOR, \
-                                           LIBPOSTPROC_VERSION_MINOR, \
-                                           LIBPOSTPROC_VERSION_MICRO)
-#define LIBPOSTPROC_BUILD       LIBPOSTPROC_VERSION_INT
-
-#define LIBPOSTPROC_IDENT       "postproc" AV_STRINGIFY(LIBPOSTPROC_VERSION)
-
-/**
- * Return the LIBPOSTPROC_VERSION_INT constant.
- */
-unsigned postproc_version(void);
-
-/**
- * Return the libpostproc build-time configuration.
- */
-const char *postproc_configuration(void);
-
-/**
- * Return the libpostproc license.
- */
-const char *postproc_license(void);
-
-#define PP_QUALITY_MAX 6
-
-#define QP_STORE_T int8_t
-
-#include <inttypes.h>
-
-typedef void pp_context;
-typedef void pp_mode;
-
-extern const char pp_help[]; ///< a simple help text
-
-void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
-                     uint8_t * dst[3], const int dstStride[3],
-                     int horizontalSize, int verticalSize,
-                     const QP_STORE_T *QP_store,  int QP_stride,
-                     pp_mode *mode, pp_context *ppContext, int pict_type);
-
-
-/**
- * Return a pp_mode or NULL if an error occurred.
- *
- * @param name    the string after "-pp" on the command line
- * @param quality a number from 0 to PP_QUALITY_MAX
- */
-pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality);
-void pp_free_mode(pp_mode *mode);
-
-pp_context *pp_get_context(int width, int height, int flags);
-void pp_free_context(pp_context *ppContext);
-
-#define PP_CPU_CAPS_MMX   0x80000000
-#define PP_CPU_CAPS_MMX2  0x20000000
-#define PP_CPU_CAPS_3DNOW 0x40000000
-#define PP_CPU_CAPS_ALTIVEC 0x10000000
-
-#define PP_FORMAT         0x00000008
-#define PP_FORMAT_420    (0x00000011|PP_FORMAT)
-#define PP_FORMAT_422    (0x00000001|PP_FORMAT)
-#define PP_FORMAT_411    (0x00000002|PP_FORMAT)
-#define PP_FORMAT_444    (0x00000000|PP_FORMAT)
-
-#define PP_PICT_TYPE_QP2  0x00000010 ///< MPEG2 style QScale
-
-#endif /* POSTPROC_POSTPROCESS_H */
diff --git a/libpostproc/postprocess_altivec_template.c b/libpostproc/postprocess_altivec_template.c
deleted file mode 100644
index ac65df897b..0000000000
--- a/libpostproc/postprocess_altivec_template.c
+++ /dev/null
@@ -1,1210 +0,0 @@
-/*
- * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
- *
- * based on code by Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/avutil.h"
-
-#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
-    do {                                                          \
-        __typeof__(src_a) tempA1, tempB1, tempC1, tempD1;         \
-        __typeof__(src_a) tempE1, tempF1, tempG1, tempH1;         \
-        __typeof__(src_a) tempA2, tempB2, tempC2, tempD2;         \
-        __typeof__(src_a) tempE2, tempF2, tempG2, tempH2;         \
-        tempA1 = vec_mergeh (src_a, src_e);                       \
-        tempB1 = vec_mergel (src_a, src_e);                       \
-        tempC1 = vec_mergeh (src_b, src_f);                       \
-        tempD1 = vec_mergel (src_b, src_f);                       \
-        tempE1 = vec_mergeh (src_c, src_g);                       \
-        tempF1 = vec_mergel (src_c, src_g);                       \
-        tempG1 = vec_mergeh (src_d, src_h);                       \
-        tempH1 = vec_mergel (src_d, src_h);                       \
-        tempA2 = vec_mergeh (tempA1, tempE1);                     \
-        tempB2 = vec_mergel (tempA1, tempE1);                     \
-        tempC2 = vec_mergeh (tempB1, tempF1);                     \
-        tempD2 = vec_mergel (tempB1, tempF1);                     \
-        tempE2 = vec_mergeh (tempC1, tempG1);                     \
-        tempF2 = vec_mergel (tempC1, tempG1);                     \
-        tempG2 = vec_mergeh (tempD1, tempH1);                     \
-        tempH2 = vec_mergel (tempD1, tempH1);                     \
-        src_a = vec_mergeh (tempA2, tempE2);                      \
-        src_b = vec_mergel (tempA2, tempE2);                      \
-        src_c = vec_mergeh (tempB2, tempF2);                      \
-        src_d = vec_mergel (tempB2, tempF2);                      \
-        src_e = vec_mergeh (tempC2, tempG2);                      \
-        src_f = vec_mergel (tempC2, tempG2);                      \
-        src_g = vec_mergeh (tempD2, tempH2);                      \
-        src_h = vec_mergel (tempD2, tempH2);                      \
-    } while (0)
-
-
-static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
-    /*
-    this code makes no assumption on src or stride.
-    One could remove the recomputation of the perm
-    vector by assuming (stride % 16) == 0, unfortunately
-    this is not always true.
-    */
-    short data_0 = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-    DECLARE_ALIGNED(16, short, data)[8] =
-                    {
-                        data_0,
-                        data_0 * 2 + 1,
-                        c->QP * 2,
-                        c->QP * 4
-                    };
-    int numEq;
-    uint8_t *src2 = src;
-    vector signed short v_dcOffset;
-    vector signed short v2QP;
-    vector unsigned short v4QP;
-    vector unsigned short v_dcThreshold;
-    const int properStride = (stride % 16);
-    const int srcAlign = ((unsigned long)src2 % 16);
-    const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
-    const vector signed int zero = vec_splat_s32(0);
-    const vector signed short mask = vec_splat_s16(1);
-    vector signed int v_numEq = vec_splat_s32(0);
-    vector signed short v_data = vec_ld(0, data);
-    vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3,
-                        v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
-//FIXME avoid this mess if possible
-    register int j0 = 0,
-                 j1 = stride,
-                 j2 = 2 * stride,
-                 j3 = 3 * stride,
-                 j4 = 4 * stride,
-                 j5 = 5 * stride,
-                 j6 = 6 * stride,
-                 j7 = 7 * stride;
-    vector unsigned char v_srcA0, v_srcA1, v_srcA2, v_srcA3,
-                         v_srcA4, v_srcA5, v_srcA6, v_srcA7;
-
-    v_dcOffset = vec_splat(v_data, 0);
-    v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
-    v2QP = vec_splat(v_data, 2);
-    v4QP = (vector unsigned short)vec_splat(v_data, 3);
-
-    src2 += stride * 4;
-
-#define LOAD_LINE(i)                                                    \
-    {                                                                   \
-    vector unsigned char perm##i = vec_lvsl(j##i, src2);                \
-    vector unsigned char v_srcA2##i;                                    \
-    vector unsigned char v_srcA1##i = vec_ld(j##i, src2);               \
-    if (two_vectors)                                                    \
-        v_srcA2##i = vec_ld(j##i + 16, src2);                           \
-    v_srcA##i =                                                         \
-        vec_perm(v_srcA1##i, v_srcA2##i, perm##i);                      \
-    v_srcAss##i =                                                       \
-        (vector signed short)vec_mergeh((vector signed char)zero,       \
-                                        (vector signed char)v_srcA##i); }
-
-#define LOAD_LINE_ALIGNED(i)                                            \
-    v_srcA##i = vec_ld(j##i, src2);                                     \
-    v_srcAss##i =                                                       \
-        (vector signed short)vec_mergeh((vector signed char)zero,       \
-                                        (vector signed char)v_srcA##i)
-
-    /* Special-casing the aligned case is worthwhile, as all calls from
-     * the (transposed) horizontable deblocks will be aligned, in addition
-     * to the naturally aligned vertical deblocks. */
-    if (properStride && srcAlign) {
-        LOAD_LINE_ALIGNED(0);
-        LOAD_LINE_ALIGNED(1);
-        LOAD_LINE_ALIGNED(2);
-        LOAD_LINE_ALIGNED(3);
-        LOAD_LINE_ALIGNED(4);
-        LOAD_LINE_ALIGNED(5);
-        LOAD_LINE_ALIGNED(6);
-        LOAD_LINE_ALIGNED(7);
-    } else {
-        LOAD_LINE(0);
-        LOAD_LINE(1);
-        LOAD_LINE(2);
-        LOAD_LINE(3);
-        LOAD_LINE(4);
-        LOAD_LINE(5);
-        LOAD_LINE(6);
-        LOAD_LINE(7);
-    }
-#undef LOAD_LINE
-#undef LOAD_LINE_ALIGNED
-
-#define ITER(i, j)                                                      \
-    const vector signed short v_diff##i =                               \
-        vec_sub(v_srcAss##i, v_srcAss##j);                              \
-    const vector signed short v_sum##i =                                \
-        vec_add(v_diff##i, v_dcOffset);                                 \
-    const vector signed short v_comp##i =                               \
-        (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
-                                       v_dcThreshold);                  \
-    const vector signed short v_part##i = vec_and(mask, v_comp##i);
-
-    {
-        ITER(0, 1)
-        ITER(1, 2)
-        ITER(2, 3)
-        ITER(3, 4)
-        ITER(4, 5)
-        ITER(5, 6)
-        ITER(6, 7)
-
-        v_numEq = vec_sum4s(v_part0, v_numEq);
-        v_numEq = vec_sum4s(v_part1, v_numEq);
-        v_numEq = vec_sum4s(v_part2, v_numEq);
-        v_numEq = vec_sum4s(v_part3, v_numEq);
-        v_numEq = vec_sum4s(v_part4, v_numEq);
-        v_numEq = vec_sum4s(v_part5, v_numEq);
-        v_numEq = vec_sum4s(v_part6, v_numEq);
-    }
-
-#undef ITER
-
-    v_numEq = vec_sums(v_numEq, zero);
-
-    v_numEq = vec_splat(v_numEq, 3);
-    vec_ste(v_numEq, 0, &numEq);
-
-    if (numEq > c->ppMode.flatnessThreshold){
-        const vector unsigned char mmoP1 = (const vector unsigned char)
-            {0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
-             0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B};
-        const vector unsigned char mmoP2 = (const vector unsigned char)
-            {0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
-             0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f};
-        const vector unsigned char mmoP = (const vector unsigned char)
-            vec_lvsl(8, (unsigned char*)0);
-
-        vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
-        vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
-        vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
-        vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
-        vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
-        vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
-        vector signed short mmoDiff = vec_sub(mmoL, mmoR);
-        vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
-
-        if (vec_any_gt(mmoSum, v4QP))
-            return 0;
-        else
-            return 1;
-    }
-    else return 2;
-}
-
-static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
-    /*
-    this code makes no assumption on src or stride.
-    One could remove the recomputation of the perm
-    vector by assuming (stride % 16) == 0, unfortunately
-    this is not always true. Quite a lot of load/stores
-    can be removed by assuming proper alignment of
-    src & stride :-(
-    */
-    uint8_t *src2 = src;
-    const vector signed int zero = vec_splat_s32(0);
-    const int properStride = (stride % 16);
-    const int srcAlign = ((unsigned long)src2 % 16);
-    DECLARE_ALIGNED(16, short, qp)[8] = {c->QP};
-    vector signed short vqp = vec_ld(0, qp);
-    vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
-    vector unsigned char vbA0, av_uninit(vbA1), av_uninit(vbA2), av_uninit(vbA3), av_uninit(vbA4), av_uninit(vbA5), av_uninit(vbA6), av_uninit(vbA7), av_uninit(vbA8), vbA9;
-    vector unsigned char vbB0, av_uninit(vbB1), av_uninit(vbB2), av_uninit(vbB3), av_uninit(vbB4), av_uninit(vbB5), av_uninit(vbB6), av_uninit(vbB7), av_uninit(vbB8), vbB9;
-    vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
-    vector unsigned char perml0, perml1, perml2, perml3, perml4,
-                         perml5, perml6, perml7, perml8, perml9;
-    register int j0 = 0,
-                 j1 = stride,
-                 j2 = 2 * stride,
-                 j3 = 3 * stride,
-                 j4 = 4 * stride,
-                 j5 = 5 * stride,
-                 j6 = 6 * stride,
-                 j7 = 7 * stride,
-                 j8 = 8 * stride,
-                 j9 = 9 * stride;
-
-    vqp = vec_splat(vqp, 0);
-
-    src2 += stride*3;
-
-#define LOAD_LINE(i)                                                    \
-    perml##i = vec_lvsl(i * stride, src2);                              \
-    vbA##i = vec_ld(i * stride, src2);                                  \
-    vbB##i = vec_ld(i * stride + 16, src2);                             \
-    vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                        \
-    vb##i =                                                             \
-        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
-                                        (vector unsigned char)vbT##i)
-
-#define LOAD_LINE_ALIGNED(i)                                            \
-    vbT##i = vec_ld(j##i, src2);                                        \
-    vb##i =                                                             \
-        (vector signed short)vec_mergeh((vector signed char)zero,       \
-                                        (vector signed char)vbT##i)
-
-      /* Special-casing the aligned case is worthwhile, as all calls from
-       * the (transposed) horizontable deblocks will be aligned, in addition
-       * to the naturally aligned vertical deblocks. */
-    if (properStride && srcAlign) {
-          LOAD_LINE_ALIGNED(0);
-          LOAD_LINE_ALIGNED(1);
-          LOAD_LINE_ALIGNED(2);
-          LOAD_LINE_ALIGNED(3);
-          LOAD_LINE_ALIGNED(4);
-          LOAD_LINE_ALIGNED(5);
-          LOAD_LINE_ALIGNED(6);
-          LOAD_LINE_ALIGNED(7);
-          LOAD_LINE_ALIGNED(8);
-          LOAD_LINE_ALIGNED(9);
-    } else {
-          LOAD_LINE(0);
-          LOAD_LINE(1);
-          LOAD_LINE(2);
-          LOAD_LINE(3);
-          LOAD_LINE(4);
-          LOAD_LINE(5);
-          LOAD_LINE(6);
-          LOAD_LINE(7);
-          LOAD_LINE(8);
-          LOAD_LINE(9);
-    }
-#undef LOAD_LINE
-#undef LOAD_LINE_ALIGNED
-    {
-        const vector unsigned short v_2 = vec_splat_u16(2);
-        const vector unsigned short v_4 = vec_splat_u16(4);
-
-        const vector signed short v_diff01 = vec_sub(vb0, vb1);
-        const vector unsigned short v_cmp01 =
-            (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
-        const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
-        const vector signed short v_diff89 = vec_sub(vb8, vb9);
-        const vector unsigned short v_cmp89 =
-            (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
-        const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
-
-        const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
-        const vector signed short temp02 = vec_add(vb2, vb3);
-        const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
-        const vector signed short v_sumsB0 = vec_add(temp02, temp03);
-
-        const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
-        const vector signed short v_sumsB1 = vec_add(temp11, vb4);
-
-        const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
-        const vector signed short v_sumsB2 = vec_add(temp21, vb5);
-
-        const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
-        const vector signed short v_sumsB3 = vec_add(temp31, vb6);
-
-        const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
-        const vector signed short v_sumsB4 = vec_add(temp41, vb7);
-
-        const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
-        const vector signed short v_sumsB5 = vec_add(temp51, vb8);
-
-        const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
-        const vector signed short v_sumsB6 = vec_add(temp61, v_last);
-
-        const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
-        const vector signed short v_sumsB7 = vec_add(temp71, v_last);
-
-        const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
-        const vector signed short v_sumsB8 = vec_add(temp81, v_last);
-
-        const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
-        const vector signed short v_sumsB9 = vec_add(temp91, v_last);
-
-    #define COMPUTE_VR(i, j, k)                                             \
-        const vector signed short temps1##i =                               \
-            vec_add(v_sumsB##i, v_sumsB##k);                                \
-        const vector signed short temps2##i =                               \
-            vec_mladd(vb##j, (vector signed short)v_2, temps1##i);          \
-        const vector signed short  vr##j = vec_sra(temps2##i, v_4)
-
-        COMPUTE_VR(0, 1, 2);
-        COMPUTE_VR(1, 2, 3);
-        COMPUTE_VR(2, 3, 4);
-        COMPUTE_VR(3, 4, 5);
-        COMPUTE_VR(4, 5, 6);
-        COMPUTE_VR(5, 6, 7);
-        COMPUTE_VR(6, 7, 8);
-        COMPUTE_VR(7, 8, 9);
-
-        const vector signed char neg1 = vec_splat_s8(-1);
-        const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                                                                         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
-
-#define PACK_AND_STORE(i)                                       \
-{   const vector unsigned char perms##i =                       \
-        vec_lvsr(i * stride, src2);                             \
-    const vector unsigned char vf##i =                          \
-        vec_packsu(vr##i, (vector signed short)zero);           \
-    const vector unsigned char vg##i =                          \
-        vec_perm(vf##i, vbT##i, permHH);                        \
-    const vector unsigned char mask##i =                        \
-        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
-    const vector unsigned char vg2##i =                         \
-        vec_perm(vg##i, vg##i, perms##i);                       \
-    const vector unsigned char svA##i =                         \
-        vec_sel(vbA##i, vg2##i, mask##i);                       \
-    const vector unsigned char svB##i =                         \
-        vec_sel(vg2##i, vbB##i, mask##i);                       \
-    vec_st(svA##i, i * stride, src2);                           \
-    vec_st(svB##i, i * stride + 16, src2);}
-
-#define PACK_AND_STORE_ALIGNED(i)                               \
-{   const vector unsigned char vf##i =                          \
-        vec_packsu(vr##i, (vector signed short)zero);           \
-    const vector unsigned char vg##i =                          \
-        vec_perm(vf##i, vbT##i, permHH);                        \
-    vec_st(vg##i, i * stride, src2);}
-
-        /* Special-casing the aligned case is worthwhile, as all calls from
-         * the (transposed) horizontable deblocks will be aligned, in addition
-         * to the naturally aligned vertical deblocks. */
-        if (properStride && srcAlign) {
-            PACK_AND_STORE_ALIGNED(1)
-            PACK_AND_STORE_ALIGNED(2)
-            PACK_AND_STORE_ALIGNED(3)
-            PACK_AND_STORE_ALIGNED(4)
-            PACK_AND_STORE_ALIGNED(5)
-            PACK_AND_STORE_ALIGNED(6)
-            PACK_AND_STORE_ALIGNED(7)
-            PACK_AND_STORE_ALIGNED(8)
-        } else {
-            PACK_AND_STORE(1)
-            PACK_AND_STORE(2)
-            PACK_AND_STORE(3)
-            PACK_AND_STORE(4)
-            PACK_AND_STORE(5)
-            PACK_AND_STORE(6)
-            PACK_AND_STORE(7)
-            PACK_AND_STORE(8)
-        }
-    #undef PACK_AND_STORE
-    #undef PACK_AND_STORE_ALIGNED
-    }
-}
-
-
-
-static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
-    /*
-    this code makes no assumption on src or stride.
-    One could remove the recomputation of the perm
-    vector by assuming (stride % 16) == 0, unfortunately
-    this is not always true. Quite a lot of load/stores
-    can be removed by assuming proper alignment of
-    src & stride :-(
-    */
-    uint8_t *src2 = src + stride*3;
-    const vector signed int zero = vec_splat_s32(0);
-    DECLARE_ALIGNED(16, short, qp)[8] = {8*c->QP};
-    vector signed short vqp = vec_splat(
-                                (vector signed short)vec_ld(0, qp), 0);
-
-#define LOAD_LINE(i)                                                    \
-    const vector unsigned char perm##i =                                \
-        vec_lvsl(i * stride, src2);                                     \
-    const vector unsigned char vbA##i =                                 \
-        vec_ld(i * stride, src2);                                       \
-    const vector unsigned char vbB##i =                                 \
-        vec_ld(i * stride + 16, src2);                                  \
-    const vector unsigned char vbT##i =                                 \
-        vec_perm(vbA##i, vbB##i, perm##i);                              \
-    const vector signed short vb##i =                                   \
-        (vector signed short)vec_mergeh((vector unsigned char)zero,     \
-                                        (vector unsigned char)vbT##i)
-
-     LOAD_LINE(1);
-     LOAD_LINE(2);
-     LOAD_LINE(3);
-     LOAD_LINE(4);
-     LOAD_LINE(5);
-     LOAD_LINE(6);
-     LOAD_LINE(7);
-     LOAD_LINE(8);
-#undef LOAD_LINE
-
-     const vector signed short v_1 = vec_splat_s16(1);
-     const vector signed short v_2 = vec_splat_s16(2);
-     const vector signed short v_5 = vec_splat_s16(5);
-     const vector signed short v_32 = vec_sl(v_1,
-                                             (vector unsigned short)v_5);
-     /* middle energy */
-     const vector signed short l3minusl6 = vec_sub(vb3, vb6);
-     const vector signed short l5minusl4 = vec_sub(vb5, vb4);
-     const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
-     const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
-     const vector signed short absmE = vec_abs(mE);
-     /* left & right energy */
-     const vector signed short l1minusl4 = vec_sub(vb1, vb4);
-     const vector signed short l3minusl2 = vec_sub(vb3, vb2);
-     const vector signed short l5minusl8 = vec_sub(vb5, vb8);
-     const vector signed short l7minusl6 = vec_sub(vb7, vb6);
-     const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
-     const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
-     const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
-     const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
-     /* d */
-     const vector signed short ddiff = vec_sub(absmE,
-                                               vec_min(vec_abs(lE),
-                                                       vec_abs(rE)));
-     const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
-     const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
-     const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
-     const vector signed short minusd = vec_sub((vector signed short)zero, d);
-     const vector signed short finald = vec_sel(minusd,
-                                                d,
-                                                vec_cmpgt(vec_sub((vector signed short)zero, mE),
-                                                          (vector signed short)zero));
-     /* q */
-     const vector signed short qtimes2 = vec_sub(vb4, vb5);
-     /* for a shift right to behave like /2, we need to add one
-        to all negative integer */
-     const vector signed short rounddown = vec_sel((vector signed short)zero,
-                                                   v_1,
-                                                   vec_cmplt(qtimes2, (vector signed short)zero));
-     const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
-     /* clamp */
-     const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
-     const vector signed short dclamp_P = vec_min(dclamp_P1, q);
-     const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
-     const vector signed short dclamp_N = vec_max(dclamp_N1, q);
-
-     const vector signed short dclampedfinal = vec_sel(dclamp_N,
-                                                       dclamp_P,
-                                                       vec_cmpgt(q, (vector signed short)zero));
-     const vector signed short dornotd = vec_sel((vector signed short)zero,
-                                                 dclampedfinal,
-                                                 vec_cmplt(absmE, vqp));
-     /* add/subtract to l4 and l5 */
-     const vector signed short vb4minusd = vec_sub(vb4, dornotd);
-     const vector signed short vb5plusd  = vec_add(vb5, dornotd);
-     /* finally, stores */
-     const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
-     const vector unsigned char st5 = vec_packsu(vb5plusd,  (vector signed short)zero);
-
-     const vector signed char neg1 = vec_splat_s8(-1);
-     const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                                                                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
-
-#define STORE(i)                                                \
-{    const vector unsigned char perms##i =                      \
-         vec_lvsr(i * stride, src2);                            \
-     const vector unsigned char vg##i =                         \
-         vec_perm(st##i, vbT##i, permHH);                       \
-     const vector unsigned char mask##i =                       \
-         vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
-     const vector unsigned char vg2##i =                        \
-         vec_perm(vg##i, vg##i, perms##i);                      \
-     const vector unsigned char svA##i =                        \
-         vec_sel(vbA##i, vg2##i, mask##i);                      \
-     const vector unsigned char svB##i =                        \
-         vec_sel(vg2##i, vbB##i, mask##i);                      \
-     vec_st(svA##i, i * stride, src2);                          \
-     vec_st(svB##i, i * stride + 16, src2);}
-
-     STORE(4)
-     STORE(5)
-}
-
-static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
-    const vector signed int vsint32_8 = vec_splat_s32(8);
-    const vector unsigned int vuint32_4 = vec_splat_u32(4);
-    const vector signed char neg1 = vec_splat_s8(-1);
-
-    const vector unsigned char permA1 = (vector unsigned char)
-        {0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
-         0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
-    const vector unsigned char permA2 = (vector unsigned char)
-        {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
-         0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
-    const vector unsigned char permA1inc = (vector unsigned char)
-        {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-    const vector unsigned char permA2inc = (vector unsigned char)
-        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
-         0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-    const vector unsigned char magic = (vector unsigned char)
-        {0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
-         0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-    const vector unsigned char extractPerm = (vector unsigned char)
-        {0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
-         0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01};
-    const vector unsigned char extractPermInc = (vector unsigned char)
-        {0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
-         0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01};
-    const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
-    const vector unsigned char tenRight = (vector unsigned char)
-        {0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
-    const vector unsigned char eightLeft = (vector unsigned char)
-        {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08};
-
-    /*
-    this code makes no assumption on src or stride.
-    One could remove the recomputation of the perm
-    vector by assuming (stride % 16) == 0, unfortunately
-    this is not always true. Quite a lot of load/stores
-    can be removed by assuming proper alignment of
-    src & stride :-(
-    */
-    uint8_t *srcCopy = src;
-    DECLARE_ALIGNED(16, uint8_t, dt)[16] = { deringThreshold };
-    const vector signed int zero = vec_splat_s32(0);
-    vector unsigned char v_dt = vec_splat(vec_ld(0, dt), 0);
-
-#define LOAD_LINE(i)                                                  \
-    const vector unsigned char perm##i =                              \
-        vec_lvsl(i * stride, srcCopy);                                \
-    vector unsigned char sA##i = vec_ld(i * stride, srcCopy);         \
-    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy);    \
-    vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
-
-    LOAD_LINE(0);
-    LOAD_LINE(1);
-    LOAD_LINE(2);
-    LOAD_LINE(3);
-    LOAD_LINE(4);
-    LOAD_LINE(5);
-    LOAD_LINE(6);
-    LOAD_LINE(7);
-    LOAD_LINE(8);
-    LOAD_LINE(9);
-#undef LOAD_LINE
-
-    vector unsigned char v_avg;
-    DECLARE_ALIGNED(16, signed int, S)[8];
-    DECLARE_ALIGNED(16, int, tQP2)[4] = { c->QP/2 + 1 };
-    vector signed int vQP2 = vec_ld(0, tQP2);
-    vQP2 = vec_splat(vQP2, 0);
-
-    {
-    const vector unsigned char trunc_perm = (vector unsigned char)
-        {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
-         0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
-    const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
-    const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
-    const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
-    const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
-
-#define EXTRACT(op) do {                                                \
-    const vector unsigned char s_1   = vec_##op(trunc_src12, trunc_src34); \
-    const vector unsigned char s_2   = vec_##op(trunc_src56, trunc_src78); \
-    const vector unsigned char s_6   = vec_##op(s_1, s_2);     \
-    const vector unsigned char s_8h  = vec_mergeh(s_6, s_6);   \
-    const vector unsigned char s_8l  = vec_mergel(s_6, s_6);   \
-    const vector unsigned char s_9   = vec_##op(s_8h, s_8l);   \
-    const vector unsigned char s_9h  = vec_mergeh(s_9, s_9);   \
-    const vector unsigned char s_9l  = vec_mergel(s_9, s_9);   \
-    const vector unsigned char s_10  = vec_##op(s_9h, s_9l);   \
-    const vector unsigned char s_10h = vec_mergeh(s_10, s_10); \
-    const vector unsigned char s_10l = vec_mergel(s_10, s_10); \
-    const vector unsigned char s_11  = vec_##op(s_10h, s_10l); \
-    const vector unsigned char s_11h = vec_mergeh(s_11, s_11); \
-    const vector unsigned char s_11l = vec_mergel(s_11, s_11); \
-    v_##op = vec_##op(s_11h, s_11l);                           \
-} while (0)
-
-    vector unsigned char v_min;
-    vector unsigned char v_max;
-    EXTRACT(min);
-    EXTRACT(max);
-#undef EXTRACT
-
-    if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
-        return;
-
-    v_avg = vec_avg(v_min, v_max);
-    }
-
-    {
-    const vector unsigned short mask1 = (vector unsigned short)
-                                        {0x0001, 0x0002, 0x0004, 0x0008,
-                                         0x0010, 0x0020, 0x0040, 0x0080};
-    const vector unsigned short mask2 = (vector unsigned short)
-                                        {0x0100, 0x0200, 0x0000, 0x0000,
-                                         0x0000, 0x0000, 0x0000, 0x0000};
-
-    const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
-    const vector unsigned int vuint32_1 = vec_splat_u32(1);
-
-    vector signed int sumA2;
-    vector signed int sumB2;
-    vector signed int sum0, sum1, sum2, sum3, sum4;
-    vector signed int sum5, sum6, sum7, sum8, sum9;
-
-#define COMPARE(i)                                                      \
-    do {                                                                \
-        const vector unsigned char cmp =                                \
-            (vector unsigned char)vec_cmpgt(src##i, v_avg);             \
-        const vector unsigned short cmpHi =                             \
-            (vector unsigned short)vec_mergeh(cmp, cmp);                \
-        const vector unsigned short cmpLi =                             \
-            (vector unsigned short)vec_mergel(cmp, cmp);                \
-        const vector signed short cmpHf =                               \
-            (vector signed short)vec_and(cmpHi, mask1);                 \
-        const vector signed short cmpLf =                               \
-            (vector signed short)vec_and(cmpLi, mask2);                 \
-        const vector signed int sump = vec_sum4s(cmpHf, zero);          \
-        const vector signed int sumq = vec_sum4s(cmpLf, sump);          \
-        sum##i  = vec_sums(sumq, zero);                                 \
-    } while (0)
-
-    COMPARE(0);
-    COMPARE(1);
-    COMPARE(2);
-    COMPARE(3);
-    COMPARE(4);
-    COMPARE(5);
-    COMPARE(6);
-    COMPARE(7);
-    COMPARE(8);
-    COMPARE(9);
-#undef COMPARE
-
-    {
-    const vector signed int sump02 = vec_mergel(sum0, sum2);
-    const vector signed int sump13 = vec_mergel(sum1, sum3);
-    const vector signed int sumA = vec_mergel(sump02, sump13);
-
-    const vector signed int sump46 = vec_mergel(sum4, sum6);
-    const vector signed int sump57 = vec_mergel(sum5, sum7);
-    const vector signed int sumB = vec_mergel(sump46, sump57);
-
-    const vector signed int sump8A = vec_mergel(sum8, zero);
-    const vector signed int sump9B = vec_mergel(sum9, zero);
-    const vector signed int sumC = vec_mergel(sump8A, sump9B);
-
-    const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
-    const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
-    const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
-    const vector signed int t2A = vec_or(sumA, tA);
-    const vector signed int t2B = vec_or(sumB, tB);
-    const vector signed int t2C = vec_or(sumC, tC);
-    const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
-                                          vec_sl(t2A, vuint32_1));
-    const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
-                                          vec_sl(t2B, vuint32_1));
-    const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
-                                          vec_sl(t2C, vuint32_1));
-    const vector signed int yA = vec_and(t2A, t3A);
-    const vector signed int yB = vec_and(t2B, t3B);
-    const vector signed int yC = vec_and(t2C, t3C);
-
-    const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
-    const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
-    const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
-    const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
-    const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
-    const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
-    const vector signed int sumAp = vec_and(yA,
-                                            vec_and(sumAd4,sumAd8));
-    const vector signed int sumBp = vec_and(yB,
-                                            vec_and(sumBd4,sumBd8));
-    sumA2 = vec_or(sumAp,
-                   vec_sra(sumAp,
-                           vuint32_16));
-    sumB2  = vec_or(sumBp,
-                    vec_sra(sumBp,
-                            vuint32_16));
-    }
-    vec_st(sumA2, 0, S);
-    vec_st(sumB2, 16, S);
-    }
-
-    /* I'm not sure the following is actually faster
-       than straight, unvectorized C code :-( */
-
-#define F_INIT()                                       \
-    vector unsigned char tenRightM = tenRight;         \
-    vector unsigned char permA1M = permA1;             \
-    vector unsigned char permA2M = permA2;             \
-    vector unsigned char extractPermM = extractPerm
-
-#define F2(i, j, k, l)                                                  \
-    if (S[i] & (1 << (l+1))) {                                          \
-        const vector unsigned char a_A = vec_perm(src##i, src##j, permA1M); \
-        const vector unsigned char a_B = vec_perm(a_A, src##k, permA2M); \
-        const vector signed int a_sump =                                \
-            (vector signed int)vec_msum(a_B, magic, (vector unsigned int)zero);\
-        vector signed int F = vec_sr(vec_sums(a_sump, vsint32_8), vuint32_4); \
-        const vector signed int p =                                     \
-            (vector signed int)vec_perm(src##j, (vector unsigned char)zero, \
-                                        extractPermM);                  \
-        const vector signed int sum  = vec_add(p, vQP2);                \
-        const vector signed int diff = vec_sub(p, vQP2);                \
-        vector signed int newpm;                                        \
-        vector unsigned char newpm2, mask;                              \
-        F = vec_splat(F, 3);                                            \
-        if (vec_all_lt(sum, F))                                         \
-            newpm = sum;                                                \
-        else if (vec_all_gt(diff, F))                                   \
-            newpm = diff;                                               \
-        else newpm = F;                                                 \
-        newpm2 = vec_splat((vector unsigned char)newpm, 15);            \
-        mask = vec_add(identity, tenRightM);                            \
-        src##j = vec_perm(src##j, newpm2, mask);                        \
-    }                                                                   \
-    permA1M = vec_add(permA1M, permA1inc);                              \
-    permA2M = vec_add(permA2M, permA2inc);                              \
-    tenRightM = vec_sro(tenRightM, eightLeft);                          \
-    extractPermM = vec_add(extractPermM, extractPermInc)
-
-#define ITER(i, j, k) do {                      \
-    F_INIT();                                   \
-    F2(i, j, k, 0);                             \
-    F2(i, j, k, 1);                             \
-    F2(i, j, k, 2);                             \
-    F2(i, j, k, 3);                             \
-    F2(i, j, k, 4);                             \
-    F2(i, j, k, 5);                             \
-    F2(i, j, k, 6);                             \
-    F2(i, j, k, 7);                             \
-} while (0)
-
-    ITER(0, 1, 2);
-    ITER(1, 2, 3);
-    ITER(2, 3, 4);
-    ITER(3, 4, 5);
-    ITER(4, 5, 6);
-    ITER(5, 6, 7);
-    ITER(6, 7, 8);
-    ITER(7, 8, 9);
-
-#define STORE_LINE(i) do {                              \
-    const vector unsigned char permST =                 \
-        vec_lvsr(i * stride, srcCopy);                  \
-    const vector unsigned char maskST =                 \
-        vec_perm((vector unsigned char)zero,            \
-                 (vector unsigned char)neg1, permST);   \
-    src##i = vec_perm(src##i ,src##i, permST);          \
-    sA##i= vec_sel(sA##i, src##i, maskST);              \
-    sB##i= vec_sel(src##i, sB##i, maskST);              \
-    vec_st(sA##i, i * stride, srcCopy);                 \
-    vec_st(sB##i, i * stride + 16, srcCopy);            \
-} while (0)
-
-    STORE_LINE(1);
-    STORE_LINE(2);
-    STORE_LINE(3);
-    STORE_LINE(4);
-    STORE_LINE(5);
-    STORE_LINE(6);
-    STORE_LINE(7);
-    STORE_LINE(8);
-
-#undef STORE_LINE
-#undef ITER
-#undef F2
-}
-
-#define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
-#define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
-#define do_a_deblock_altivec(a...) do_a_deblock_C(a)
-
-static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
-                                            uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
-{
-    const vector signed char neg1 = vec_splat_s8(-1);
-    const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                                                                     0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
-
-    const vector signed int zero = vec_splat_s32(0);
-    const vector signed short vsint16_1 = vec_splat_s16(1);
-    vector signed int v_dp = zero;
-    vector signed int v_sysdp = zero;
-    int d, sysd, i;
-
-#define LOAD_LINE(src, i)                                               \
-    register int j##src##i = i * stride;                                \
-    vector unsigned char perm##src##i = vec_lvsl(j##src##i, src);       \
-    const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
-    const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
-    const vector unsigned char v_##src##A##i =                          \
-        vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i);         \
-    vector signed short v_##src##Ass##i =                               \
-        (vector signed short)vec_mergeh((vector signed char)zero,       \
-                                        (vector signed char)v_##src##A##i)
-
-    LOAD_LINE(src, 0);
-    LOAD_LINE(src, 1);
-    LOAD_LINE(src, 2);
-    LOAD_LINE(src, 3);
-    LOAD_LINE(src, 4);
-    LOAD_LINE(src, 5);
-    LOAD_LINE(src, 6);
-    LOAD_LINE(src, 7);
-
-    LOAD_LINE(tempBlurred, 0);
-    LOAD_LINE(tempBlurred, 1);
-    LOAD_LINE(tempBlurred, 2);
-    LOAD_LINE(tempBlurred, 3);
-    LOAD_LINE(tempBlurred, 4);
-    LOAD_LINE(tempBlurred, 5);
-    LOAD_LINE(tempBlurred, 6);
-    LOAD_LINE(tempBlurred, 7);
-#undef LOAD_LINE
-
-#define ACCUMULATE_DIFFS(i) do {                                \
-        vector signed short v_d = vec_sub(v_tempBlurredAss##i,  \
-                                          v_srcAss##i);         \
-        v_dp = vec_msums(v_d, v_d, v_dp);                       \
-        v_sysdp = vec_msums(v_d, vsint16_1, v_sysdp);           \
-    } while (0)
-
-    ACCUMULATE_DIFFS(0);
-    ACCUMULATE_DIFFS(1);
-    ACCUMULATE_DIFFS(2);
-    ACCUMULATE_DIFFS(3);
-    ACCUMULATE_DIFFS(4);
-    ACCUMULATE_DIFFS(5);
-    ACCUMULATE_DIFFS(6);
-    ACCUMULATE_DIFFS(7);
-#undef ACCUMULATE_DIFFS
-
-    tempBlurredPast[127]= maxNoise[0];
-    tempBlurredPast[128]= maxNoise[1];
-    tempBlurredPast[129]= maxNoise[2];
-
-    v_dp = vec_sums(v_dp, zero);
-    v_sysdp = vec_sums(v_sysdp, zero);
-
-    v_dp = vec_splat(v_dp, 3);
-    v_sysdp = vec_splat(v_sysdp, 3);
-
-    vec_ste(v_dp, 0, &d);
-    vec_ste(v_sysdp, 0, &sysd);
-
-    i = d;
-    d = (4*d
-         +(*(tempBlurredPast-256))
-         +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
-         +(*(tempBlurredPast+256))
-         +4)>>3;
-
-    *tempBlurredPast=i;
-
-    if (d > maxNoise[1]) {
-        if (d < maxNoise[2]) {
-#define OP(i) v_tempBlurredAss##i = vec_avg(v_tempBlurredAss##i, v_srcAss##i);
-
-            OP(0);
-            OP(1);
-            OP(2);
-            OP(3);
-            OP(4);
-            OP(5);
-            OP(6);
-            OP(7);
-#undef OP
-        } else {
-#define OP(i) v_tempBlurredAss##i = v_srcAss##i;
-
-            OP(0);
-            OP(1);
-            OP(2);
-            OP(3);
-            OP(4);
-            OP(5);
-            OP(6);
-            OP(7);
-#undef OP
-        }
-    } else {
-        if (d < maxNoise[0]) {
-            const vector signed short vsint16_7 = vec_splat_s16(7);
-            const vector signed short vsint16_4 = vec_splat_s16(4);
-            const vector unsigned short vuint16_3 = vec_splat_u16(3);
-
-#define OP(i) do {                                                      \
-            const vector signed short v_temp =                          \
-                vec_mladd(v_tempBlurredAss##i, vsint16_7, v_srcAss##i); \
-            const vector signed short v_temp2 = vec_add(v_temp, vsint16_4); \
-            v_tempBlurredAss##i = vec_sr(v_temp2, vuint16_3);           \
-        } while (0)
-
-            OP(0);
-            OP(1);
-            OP(2);
-            OP(3);
-            OP(4);
-            OP(5);
-            OP(6);
-            OP(7);
-#undef OP
-        } else {
-            const vector signed short vsint16_3 = vec_splat_s16(3);
-            const vector signed short vsint16_2 = vec_splat_s16(2);
-
-#define OP(i) do {                                              \
-            const vector signed short v_temp =                  \
-                vec_mladd(v_tempBlurredAss##i, vsint16_3, v_srcAss##i); \
-            const vector signed short v_temp2 = vec_add(v_temp, vsint16_2); \
-            v_tempBlurredAss##i =                                       \
-                vec_sr(v_temp2, (vector unsigned short)vsint16_2);      \
-        } while (0)
-
-            OP(0);
-            OP(1);
-            OP(2);
-            OP(3);
-            OP(4);
-            OP(5);
-            OP(6);
-            OP(7);
-#undef OP
-        }
-    }
-
-#define PACK_AND_STORE(src, i) do {                                      \
-    const vector unsigned char perms = vec_lvsr(i * stride, src);        \
-    const vector unsigned char vf =                                      \
-        vec_packsu(v_tempBlurredAss##1, (vector signed short)zero);     \
-    const vector unsigned char vg = vec_perm(vf, v_##src##A##i, permHH); \
-    const vector unsigned char mask =                                    \
-        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms); \
-    const vector unsigned char vg2 = vec_perm(vg, vg, perms);            \
-    const vector unsigned char svA = vec_sel(v_##src##A1##i, vg2, mask); \
-    const vector unsigned char svB = vec_sel(vg2, v_##src##A2##i, mask); \
-    vec_st(svA, i * stride, src);                                        \
-    vec_st(svB, i * stride + 16, src);                                   \
-} while (0)
-
-    PACK_AND_STORE(src, 0);
-    PACK_AND_STORE(src, 1);
-    PACK_AND_STORE(src, 2);
-    PACK_AND_STORE(src, 3);
-    PACK_AND_STORE(src, 4);
-    PACK_AND_STORE(src, 5);
-    PACK_AND_STORE(src, 6);
-    PACK_AND_STORE(src, 7);
-    PACK_AND_STORE(tempBlurred, 0);
-    PACK_AND_STORE(tempBlurred, 1);
-    PACK_AND_STORE(tempBlurred, 2);
-    PACK_AND_STORE(tempBlurred, 3);
-    PACK_AND_STORE(tempBlurred, 4);
-    PACK_AND_STORE(tempBlurred, 5);
-    PACK_AND_STORE(tempBlurred, 6);
-    PACK_AND_STORE(tempBlurred, 7);
-#undef PACK_AND_STORE
-}
-
-static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
-    const vector unsigned char zero = vec_splat_u8(0);
-
-#define LOAD_DOUBLE_LINE(i, j)                                          \
-    vector unsigned char perm1##i = vec_lvsl(i * stride, src);          \
-    vector unsigned char perm2##i = vec_lvsl(j * stride, src);          \
-    vector unsigned char srcA##i = vec_ld(i * stride, src);             \
-    vector unsigned char srcB##i = vec_ld(i * stride + 16, src);        \
-    vector unsigned char srcC##i = vec_ld(j * stride, src);             \
-    vector unsigned char srcD##i = vec_ld(j * stride+ 16, src);         \
-    vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
-    vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
-
-    LOAD_DOUBLE_LINE(0, 1);
-    LOAD_DOUBLE_LINE(2, 3);
-    LOAD_DOUBLE_LINE(4, 5);
-    LOAD_DOUBLE_LINE(6, 7);
-#undef LOAD_DOUBLE_LINE
-
-    vector unsigned char tempA = vec_mergeh(src0, zero);
-    vector unsigned char tempB = vec_mergel(src0, zero);
-    vector unsigned char tempC = vec_mergeh(src1, zero);
-    vector unsigned char tempD = vec_mergel(src1, zero);
-    vector unsigned char tempE = vec_mergeh(src2, zero);
-    vector unsigned char tempF = vec_mergel(src2, zero);
-    vector unsigned char tempG = vec_mergeh(src3, zero);
-    vector unsigned char tempH = vec_mergel(src3, zero);
-    vector unsigned char tempI = vec_mergeh(src4, zero);
-    vector unsigned char tempJ = vec_mergel(src4, zero);
-    vector unsigned char tempK = vec_mergeh(src5, zero);
-    vector unsigned char tempL = vec_mergel(src5, zero);
-    vector unsigned char tempM = vec_mergeh(src6, zero);
-    vector unsigned char tempN = vec_mergel(src6, zero);
-    vector unsigned char tempO = vec_mergeh(src7, zero);
-    vector unsigned char tempP = vec_mergel(src7, zero);
-
-    vector unsigned char temp0  = vec_mergeh(tempA, tempI);
-    vector unsigned char temp1  = vec_mergel(tempA, tempI);
-    vector unsigned char temp2  = vec_mergeh(tempB, tempJ);
-    vector unsigned char temp3  = vec_mergel(tempB, tempJ);
-    vector unsigned char temp4  = vec_mergeh(tempC, tempK);
-    vector unsigned char temp5  = vec_mergel(tempC, tempK);
-    vector unsigned char temp6  = vec_mergeh(tempD, tempL);
-    vector unsigned char temp7  = vec_mergel(tempD, tempL);
-    vector unsigned char temp8  = vec_mergeh(tempE, tempM);
-    vector unsigned char temp9  = vec_mergel(tempE, tempM);
-    vector unsigned char temp10 = vec_mergeh(tempF, tempN);
-    vector unsigned char temp11 = vec_mergel(tempF, tempN);
-    vector unsigned char temp12 = vec_mergeh(tempG, tempO);
-    vector unsigned char temp13 = vec_mergel(tempG, tempO);
-    vector unsigned char temp14 = vec_mergeh(tempH, tempP);
-    vector unsigned char temp15 = vec_mergel(tempH, tempP);
-
-    tempA = vec_mergeh(temp0, temp8);
-    tempB = vec_mergel(temp0, temp8);
-    tempC = vec_mergeh(temp1, temp9);
-    tempD = vec_mergel(temp1, temp9);
-    tempE = vec_mergeh(temp2, temp10);
-    tempF = vec_mergel(temp2, temp10);
-    tempG = vec_mergeh(temp3, temp11);
-    tempH = vec_mergel(temp3, temp11);
-    tempI = vec_mergeh(temp4, temp12);
-    tempJ = vec_mergel(temp4, temp12);
-    tempK = vec_mergeh(temp5, temp13);
-    tempL = vec_mergel(temp5, temp13);
-    tempM = vec_mergeh(temp6, temp14);
-    tempN = vec_mergel(temp6, temp14);
-    tempO = vec_mergeh(temp7, temp15);
-    tempP = vec_mergel(temp7, temp15);
-
-    temp0  = vec_mergeh(tempA, tempI);
-    temp1  = vec_mergel(tempA, tempI);
-    temp2  = vec_mergeh(tempB, tempJ);
-    temp3  = vec_mergel(tempB, tempJ);
-    temp4  = vec_mergeh(tempC, tempK);
-    temp5  = vec_mergel(tempC, tempK);
-    temp6  = vec_mergeh(tempD, tempL);
-    temp7  = vec_mergel(tempD, tempL);
-    temp8  = vec_mergeh(tempE, tempM);
-    temp9  = vec_mergel(tempE, tempM);
-    temp10 = vec_mergeh(tempF, tempN);
-    temp11 = vec_mergel(tempF, tempN);
-    temp12 = vec_mergeh(tempG, tempO);
-    temp13 = vec_mergel(tempG, tempO);
-    temp14 = vec_mergeh(tempH, tempP);
-    temp15 = vec_mergel(tempH, tempP);
-
-    vec_st(temp0,    0, dst);
-    vec_st(temp1,   16, dst);
-    vec_st(temp2,   32, dst);
-    vec_st(temp3,   48, dst);
-    vec_st(temp4,   64, dst);
-    vec_st(temp5,   80, dst);
-    vec_st(temp6,   96, dst);
-    vec_st(temp7,  112, dst);
-    vec_st(temp8,  128, dst);
-    vec_st(temp9,  144, dst);
-    vec_st(temp10, 160, dst);
-    vec_st(temp11, 176, dst);
-    vec_st(temp12, 192, dst);
-    vec_st(temp13, 208, dst);
-    vec_st(temp14, 224, dst);
-    vec_st(temp15, 240, dst);
-}
-
-static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
-    const vector unsigned char zero = vec_splat_u8(0);
-    const vector signed   char neg1 = vec_splat_s8(-1);
-
-#define LOAD_DOUBLE_LINE(i, j)                                  \
-    vector unsigned char src##i = vec_ld(i * 16, src);            \
-    vector unsigned char src##j = vec_ld(j * 16, src)
-
-    LOAD_DOUBLE_LINE(0, 1);
-    LOAD_DOUBLE_LINE(2, 3);
-    LOAD_DOUBLE_LINE(4, 5);
-    LOAD_DOUBLE_LINE(6, 7);
-    LOAD_DOUBLE_LINE(8, 9);
-    LOAD_DOUBLE_LINE(10, 11);
-    LOAD_DOUBLE_LINE(12, 13);
-    LOAD_DOUBLE_LINE(14, 15);
-#undef LOAD_DOUBLE_LINE
-
-    vector unsigned char tempA = vec_mergeh(src0, src8);
-    vector unsigned char tempB;
-    vector unsigned char tempC = vec_mergeh(src1, src9);
-    vector unsigned char tempD;
-    vector unsigned char tempE = vec_mergeh(src2, src10);
-    vector unsigned char tempG = vec_mergeh(src3, src11);
-    vector unsigned char tempI = vec_mergeh(src4, src12);
-    vector unsigned char tempJ;
-    vector unsigned char tempK = vec_mergeh(src5, src13);
-    vector unsigned char tempL;
-    vector unsigned char tempM = vec_mergeh(src6, src14);
-    vector unsigned char tempO = vec_mergeh(src7, src15);
-
-    vector unsigned char temp0 = vec_mergeh(tempA, tempI);
-    vector unsigned char temp1 = vec_mergel(tempA, tempI);
-    vector unsigned char temp2;
-    vector unsigned char temp3;
-    vector unsigned char temp4 = vec_mergeh(tempC, tempK);
-    vector unsigned char temp5 = vec_mergel(tempC, tempK);
-    vector unsigned char temp6;
-    vector unsigned char temp7;
-    vector unsigned char temp8 = vec_mergeh(tempE, tempM);
-    vector unsigned char temp9 = vec_mergel(tempE, tempM);
-    vector unsigned char temp12 = vec_mergeh(tempG, tempO);
-    vector unsigned char temp13 = vec_mergel(tempG, tempO);
-
-    tempA = vec_mergeh(temp0, temp8);
-    tempB = vec_mergel(temp0, temp8);
-    tempC = vec_mergeh(temp1, temp9);
-    tempD = vec_mergel(temp1, temp9);
-    tempI = vec_mergeh(temp4, temp12);
-    tempJ = vec_mergel(temp4, temp12);
-    tempK = vec_mergeh(temp5, temp13);
-    tempL = vec_mergel(temp5, temp13);
-
-    temp0 = vec_mergeh(tempA, tempI);
-    temp1 = vec_mergel(tempA, tempI);
-    temp2 = vec_mergeh(tempB, tempJ);
-    temp3 = vec_mergel(tempB, tempJ);
-    temp4 = vec_mergeh(tempC, tempK);
-    temp5 = vec_mergel(tempC, tempK);
-    temp6 = vec_mergeh(tempD, tempL);
-    temp7 = vec_mergel(tempD, tempL);
-
-
-#define STORE_DOUBLE_LINE(i, j) do {                                    \
-    vector unsigned char dstAi = vec_ld(i * stride, dst);               \
-    vector unsigned char dstBi = vec_ld(i * stride + 16, dst);          \
-    vector unsigned char dstAj = vec_ld(j * stride, dst);               \
-    vector unsigned char dstBj = vec_ld(j * stride+ 16, dst);           \
-    vector unsigned char aligni = vec_lvsr(i * stride, dst);            \
-    vector unsigned char alignj = vec_lvsr(j * stride, dst);            \
-    vector unsigned char maski =                                        \
-        vec_perm(zero, (vector unsigned char)neg1, aligni);             \
-    vector unsigned char maskj =                                        \
-        vec_perm(zero, (vector unsigned char)neg1, alignj);             \
-    vector unsigned char dstRi = vec_perm(temp##i, temp##i, aligni);    \
-    vector unsigned char dstRj = vec_perm(temp##j, temp##j, alignj);    \
-    vector unsigned char dstAFi = vec_sel(dstAi, dstRi, maski);         \
-    vector unsigned char dstBFi = vec_sel(dstRi, dstBi, maski);         \
-    vector unsigned char dstAFj = vec_sel(dstAj, dstRj, maskj);         \
-    vector unsigned char dstBFj = vec_sel(dstRj, dstBj, maskj);         \
-    vec_st(dstAFi, i * stride, dst);                                    \
-    vec_st(dstBFi, i * stride + 16, dst);                               \
-    vec_st(dstAFj, j * stride, dst);                                    \
-    vec_st(dstBFj, j * stride + 16, dst);                               \
-} while (0)
-
-    STORE_DOUBLE_LINE(0,1);
-    STORE_DOUBLE_LINE(2,3);
-    STORE_DOUBLE_LINE(4,5);
-    STORE_DOUBLE_LINE(6,7);
-}
diff --git a/libpostproc/postprocess_internal.h b/libpostproc/postprocess_internal.h
deleted file mode 100644
index d2c6708d76..0000000000
--- a/libpostproc/postprocess_internal.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * internal api header.
- */
-
-#ifndef POSTPROC_POSTPROCESS_INTERNAL_H
-#define POSTPROC_POSTPROCESS_INTERNAL_H
-
-#include <string.h>
-#include "libavutil/avutil.h"
-#include "libavutil/log.h"
-#include "postprocess.h"
-
-#define V_DEBLOCK       0x01
-#define H_DEBLOCK       0x02
-#define DERING          0x04
-#define LEVEL_FIX       0x08 ///< Brightness & Contrast
-
-#define LUM_V_DEBLOCK   V_DEBLOCK               //   1
-#define LUM_H_DEBLOCK   H_DEBLOCK               //   2
-#define CHROM_V_DEBLOCK (V_DEBLOCK<<4)          //  16
-#define CHROM_H_DEBLOCK (H_DEBLOCK<<4)          //  32
-#define LUM_DERING      DERING                  //   4
-#define CHROM_DERING    (DERING<<4)             //  64
-#define LUM_LEVEL_FIX   LEVEL_FIX               //   8
-#define CHROM_LEVEL_FIX (LEVEL_FIX<<4)          // 128 (not implemented yet)
-
-// Experimental vertical filters
-#define V_X1_FILTER     0x0200                  // 512
-#define V_A_DEBLOCK     0x0400
-
-// Experimental horizontal filters
-#define H_X1_FILTER     0x2000                  // 8192
-#define H_A_DEBLOCK     0x4000
-
-/// select between full y range (255-0) or standart one (234-16)
-#define FULL_Y_RANGE    0x8000                  // 32768
-
-//Deinterlacing Filters
-#define LINEAR_IPOL_DEINT_FILTER        0x10000 // 65536
-#define LINEAR_BLEND_DEINT_FILTER       0x20000 // 131072
-#define CUBIC_BLEND_DEINT_FILTER        0x8000  // (not implemented yet)
-#define CUBIC_IPOL_DEINT_FILTER         0x40000 // 262144
-#define MEDIAN_DEINT_FILTER             0x80000 // 524288
-#define FFMPEG_DEINT_FILTER             0x400000
-#define LOWPASS5_DEINT_FILTER           0x800000
-
-#define TEMP_NOISE_FILTER               0x100000
-#define FORCE_QUANT                     0x200000
-
-//use if you want a faster postprocessing code
-//cannot differentiate between chroma & luma filters (both on or both off)
-//obviously the -pp option on the command line has no effect except turning the here selected
-//filters on
-//#define COMPILE_TIME_MODE 0x77
-
-static inline int CLIP(int a){
-    if(a&256) return ((a)>>31)^(-1);
-    else      return a;
-}
-/**
- * Postprocessng filter.
- */
-struct PPFilter{
-    const char *shortName;
-    const char *longName;
-    int chromDefault;       ///< is chrominance filtering on by default if this filter is manually activated
-    int minLumQuality;      ///< minimum quality to turn luminance filtering on
-    int minChromQuality;    ///< minimum quality to turn chrominance filtering on
-    int mask;               ///< Bitmask to turn this filter on
-};
-
-/**
- * Postprocessng mode.
- */
-typedef struct PPMode{
-    int lumMode;                    ///< acivates filters for luminance
-    int chromMode;                  ///< acivates filters for chrominance
-    int error;                      ///< non zero on error
-
-    int minAllowedY;                ///< for brigtness correction
-    int maxAllowedY;                ///< for brihtness correction
-    float maxClippedThreshold;      ///< amount of "black" you are willing to lose to get a brightness-corrected picture
-
-    int maxTmpNoise[3];             ///< for Temporal Noise Reducing filter (Maximal sum of abs differences)
-
-    int baseDcDiff;
-    int flatnessThreshold;
-
-    int forcedQuant;                ///< quantizer if FORCE_QUANT is used
-} PPMode;
-
-/**
- * postprocess context.
- */
-typedef struct PPContext{
-    /**
-     * info on struct for av_log
-     */
-    const AVClass *av_class;
-
-    uint8_t *tempBlocks; ///<used for the horizontal code
-
-    /**
-     * luma histogram.
-     * we need 64bit here otherwise we'll going to have a problem
-     * after watching a black picture for 5 hours
-     */
-    uint64_t *yHistogram;
-
-    DECLARE_ALIGNED(8, uint64_t, packedYOffset);
-    DECLARE_ALIGNED(8, uint64_t, packedYScale);
-
-    /** Temporal noise reducing buffers */
-    uint8_t *tempBlurred[3];
-    int32_t *tempBlurredPast[3];
-
-    /** Temporary buffers for handling the last row(s) */
-    uint8_t *tempDst;
-    uint8_t *tempSrc;
-
-    uint8_t *deintTemp;
-
-    DECLARE_ALIGNED(8, uint64_t, pQPb);
-    DECLARE_ALIGNED(8, uint64_t, pQPb2);
-
-    DECLARE_ALIGNED(8, uint64_t, mmxDcOffset)[64];
-    DECLARE_ALIGNED(8, uint64_t, mmxDcThreshold)[64];
-
-    QP_STORE_T *stdQPTable;       ///< used to fix MPEG2 style qscale
-    QP_STORE_T *nonBQPTable;
-    QP_STORE_T *forcedQPTable;
-
-    int QP;
-    int nonBQP;
-
-    int frameNum;
-
-    int cpuCaps;
-
-    int qpStride; ///<size of qp buffers (needed to realloc them if needed)
-    int stride;   ///<size of some buffers (needed to realloc them if needed)
-
-    int hChromaSubSample;
-    int vChromaSubSample;
-
-    PPMode ppMode;
-} PPContext;
-
-
-static inline void linecpy(void *dest, const void *src, int lines, int stride) {
-    if (stride > 0) {
-        memcpy(dest, src, lines*stride);
-    } else {
-        memcpy((uint8_t*)dest+(lines-1)*stride, (const uint8_t*)src+(lines-1)*stride, -lines*stride);
-    }
-}
-
-#endif /* POSTPROC_POSTPROCESS_INTERNAL_H */
diff --git a/libpostproc/postprocess_template.c b/libpostproc/postprocess_template.c
deleted file mode 100644
index a609ce8121..0000000000
--- a/libpostproc/postprocess_template.c
+++ /dev/null
@@ -1,3634 +0,0 @@
-/*
- * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * mmx/mmx2/3dnow postprocess code.
- */
-
-#include "libavutil/x86_cpu.h"
-
-#define ALIGN_MASK "$-8"
-
-#undef REAL_PAVGB
-#undef PAVGB
-#undef PMINUB
-#undef PMAXUB
-
-#if   HAVE_MMX2
-#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
-#elif HAVE_AMD3DNOW
-#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
-#endif
-#define PAVGB(a,b)  REAL_PAVGB(a,b)
-
-#if   HAVE_MMX2
-#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
-#elif HAVE_MMX
-#define PMINUB(b,a,t) \
-    "movq " #a ", " #t " \n\t"\
-    "psubusb " #b ", " #t " \n\t"\
-    "psubb " #t ", " #a " \n\t"
-#endif
-
-#if   HAVE_MMX2
-#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
-#elif HAVE_MMX
-#define PMAXUB(a,b) \
-    "psubusb " #a ", " #b " \n\t"\
-    "paddb " #a ", " #b " \n\t"
-#endif
-
-//FIXME? |255-0| = 1 (should not be a problem ...)
-#if HAVE_MMX
-/**
- * Check if the middle 8x8 Block in the given 8x16 block is flat
- */
-static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
-    int numEq= 0, dcOk;
-    src+= stride*4; // src points to begin of the 8x8 Block
-    __asm__ volatile(
-        "movq %0, %%mm7                         \n\t"
-        "movq %1, %%mm6                         \n\t"
-        : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
-        );
-
-    __asm__ volatile(
-        "lea (%2, %3), %%"REG_a"                \n\t"
-//      0       1       2       3       4       5       6       7       8       9
-//      %1      eax     eax+%2  eax+2%2 %1+4%2  ecx     ecx+%2  ecx+2%2 %1+8%2  ecx+4%2
-
-        "movq (%2), %%mm0                       \n\t"
-        "movq (%%"REG_a"), %%mm1                \n\t"
-        "movq %%mm0, %%mm3                      \n\t"
-        "movq %%mm0, %%mm4                      \n\t"
-        PMAXUB(%%mm1, %%mm4)
-        PMINUB(%%mm1, %%mm3, %%mm5)
-        "psubb %%mm1, %%mm0                     \n\t" // mm0 = differnece
-        "paddb %%mm7, %%mm0                     \n\t"
-        "pcmpgtb %%mm6, %%mm0                   \n\t"
-
-        "movq (%%"REG_a",%3), %%mm2             \n\t"
-        PMAXUB(%%mm2, %%mm4)
-        PMINUB(%%mm2, %%mm3, %%mm5)
-        "psubb %%mm2, %%mm1                     \n\t"
-        "paddb %%mm7, %%mm1                     \n\t"
-        "pcmpgtb %%mm6, %%mm1                   \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-
-        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
-        PMAXUB(%%mm1, %%mm4)
-        PMINUB(%%mm1, %%mm3, %%mm5)
-        "psubb %%mm1, %%mm2                     \n\t"
-        "paddb %%mm7, %%mm2                     \n\t"
-        "pcmpgtb %%mm6, %%mm2                   \n\t"
-        "paddb %%mm2, %%mm0                     \n\t"
-
-        "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
-
-        "movq (%2, %3, 4), %%mm2                \n\t"
-        PMAXUB(%%mm2, %%mm4)
-        PMINUB(%%mm2, %%mm3, %%mm5)
-        "psubb %%mm2, %%mm1                     \n\t"
-        "paddb %%mm7, %%mm1                     \n\t"
-        "pcmpgtb %%mm6, %%mm1                   \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-
-        "movq (%%"REG_a"), %%mm1                \n\t"
-        PMAXUB(%%mm1, %%mm4)
-        PMINUB(%%mm1, %%mm3, %%mm5)
-        "psubb %%mm1, %%mm2                     \n\t"
-        "paddb %%mm7, %%mm2                     \n\t"
-        "pcmpgtb %%mm6, %%mm2                   \n\t"
-        "paddb %%mm2, %%mm0                     \n\t"
-
-        "movq (%%"REG_a", %3), %%mm2            \n\t"
-        PMAXUB(%%mm2, %%mm4)
-        PMINUB(%%mm2, %%mm3, %%mm5)
-        "psubb %%mm2, %%mm1                     \n\t"
-        "paddb %%mm7, %%mm1                     \n\t"
-        "pcmpgtb %%mm6, %%mm1                   \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-
-        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
-        PMAXUB(%%mm1, %%mm4)
-        PMINUB(%%mm1, %%mm3, %%mm5)
-        "psubb %%mm1, %%mm2                     \n\t"
-        "paddb %%mm7, %%mm2                     \n\t"
-        "pcmpgtb %%mm6, %%mm2                   \n\t"
-        "paddb %%mm2, %%mm0                     \n\t"
-        "psubusb %%mm3, %%mm4                   \n\t"
-
-        "                                       \n\t"
-#if HAVE_MMX2
-        "pxor %%mm7, %%mm7                      \n\t"
-        "psadbw %%mm7, %%mm0                    \n\t"
-#else
-        "movq %%mm0, %%mm1                      \n\t"
-        "psrlw $8, %%mm0                        \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-        "movq %%mm0, %%mm1                      \n\t"
-        "psrlq $16, %%mm0                       \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-        "movq %%mm0, %%mm1                      \n\t"
-        "psrlq $32, %%mm0                       \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-#endif
-        "movq %4, %%mm7                         \n\t" // QP,..., QP
-        "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
-        "psubusb %%mm7, %%mm4                   \n\t" // Diff <= 2QP -> 0
-        "packssdw %%mm4, %%mm4                  \n\t"
-        "movd %%mm0, %0                         \n\t"
-        "movd %%mm4, %1                         \n\t"
-
-        : "=r" (numEq), "=r" (dcOk)
-        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
-        : "%"REG_a
-        );
-
-    numEq= (-numEq) &0xFF;
-    if(numEq > c->ppMode.flatnessThreshold){
-        if(dcOk) return 0;
-        else     return 1;
-    }else{
-        return 2;
-    }
-}
-#endif //HAVE_MMX
-
-/**
- * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
- * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
- */
-#if !HAVE_ALTIVEC
-static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
-{
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-    src+= stride*3;
-    __asm__ volatile(        //"movv %0 %1 %2\n\t"
-        "movq %2, %%mm0                         \n\t"  // QP,..., QP
-        "pxor %%mm4, %%mm4                      \n\t"
-
-        "movq (%0), %%mm6                       \n\t"
-        "movq (%0, %1), %%mm5                   \n\t"
-        "movq %%mm5, %%mm1                      \n\t"
-        "movq %%mm6, %%mm2                      \n\t"
-        "psubusb %%mm6, %%mm5                   \n\t"
-        "psubusb %%mm1, %%mm2                   \n\t"
-        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
-        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
-        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
-
-        "pand %%mm2, %%mm6                      \n\t"
-        "pandn %%mm1, %%mm2                     \n\t"
-        "por %%mm2, %%mm6                       \n\t"// First Line to Filter
-
-        "movq (%0, %1, 8), %%mm5                \n\t"
-        "lea (%0, %1, 4), %%"REG_a"             \n\t"
-        "lea (%0, %1, 8), %%"REG_c"             \n\t"
-        "sub %1, %%"REG_c"                      \n\t"
-        "add %1, %0                             \n\t" // %0 points to line 1 not 0
-        "movq (%0, %1, 8), %%mm7                \n\t"
-        "movq %%mm5, %%mm1                      \n\t"
-        "movq %%mm7, %%mm2                      \n\t"
-        "psubusb %%mm7, %%mm5                   \n\t"
-        "psubusb %%mm1, %%mm2                   \n\t"
-        "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
-        "psubusb %%mm0, %%mm2                   \n\t" // diff <= QP -> 0
-        "pcmpeqb %%mm4, %%mm2                   \n\t" // diff <= QP -> FF
-
-        "pand %%mm2, %%mm7                      \n\t"
-        "pandn %%mm1, %%mm2                     \n\t"
-        "por %%mm2, %%mm7                       \n\t" // First Line to Filter
-
-
-        //      1       2       3       4       5       6       7       8
-        //      %0      %0+%1   %0+2%1  eax     %0+4%1  eax+2%1 ecx     eax+4%1
-        // 6 4 2 2 1 1
-        // 6 4 4 2
-        // 6 8 2
-
-        "movq (%0, %1), %%mm0                   \n\t" //  1
-        "movq %%mm0, %%mm1                      \n\t" //  1
-        PAVGB(%%mm6, %%mm0)                           //1 1        /2
-        PAVGB(%%mm6, %%mm0)                           //3 1        /4
-
-        "movq (%0, %1, 4), %%mm2                \n\t" //     1
-        "movq %%mm2, %%mm5                      \n\t" //     1
-        PAVGB((%%REGa), %%mm2)                        //    11        /2
-        PAVGB((%0, %1, 2), %%mm2)                     //   211        /4
-        "movq %%mm2, %%mm3                      \n\t" //   211        /4
-        "movq (%0), %%mm4                       \n\t" // 1
-        PAVGB(%%mm4, %%mm3)                           // 4 211        /8
-        PAVGB(%%mm0, %%mm3)                           //642211        /16
-        "movq %%mm3, (%0)                       \n\t" // X
-        // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
-        "movq %%mm1, %%mm0                      \n\t" //  1
-        PAVGB(%%mm6, %%mm0)                           //1 1        /2
-        "movq %%mm4, %%mm3                      \n\t" // 1
-        PAVGB((%0,%1,2), %%mm3)                       // 1 1        /2
-        PAVGB((%%REGa,%1,2), %%mm5)                   //     11        /2
-        PAVGB((%%REGa), %%mm5)                        //    211 /4
-        PAVGB(%%mm5, %%mm3)                           // 2 2211 /8
-        PAVGB(%%mm0, %%mm3)                           //4242211 /16
-        "movq %%mm3, (%0,%1)                    \n\t" //  X
-        // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
-        PAVGB(%%mm4, %%mm6)                                   //11        /2
-        "movq (%%"REG_c"), %%mm0                \n\t" //       1
-        PAVGB((%%REGa, %1, 2), %%mm0)                 //      11/2
-        "movq %%mm0, %%mm3                      \n\t" //      11/2
-        PAVGB(%%mm1, %%mm0)                           //  2   11/4
-        PAVGB(%%mm6, %%mm0)                           //222   11/8
-        PAVGB(%%mm2, %%mm0)                           //22242211/16
-        "movq (%0, %1, 2), %%mm2                \n\t" //   1
-        "movq %%mm0, (%0, %1, 2)                \n\t" //   X
-        // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
-        "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
-        PAVGB((%%REGc), %%mm0)                        //       11        /2
-        PAVGB(%%mm0, %%mm6)                           //11     11        /4
-        PAVGB(%%mm1, %%mm4)                           // 11                /2
-        PAVGB(%%mm2, %%mm1)                           //  11                /2
-        PAVGB(%%mm1, %%mm6)                           //1122   11        /8
-        PAVGB(%%mm5, %%mm6)                           //112242211        /16
-        "movq (%%"REG_a"), %%mm5                \n\t" //    1
-        "movq %%mm6, (%%"REG_a")                \n\t" //    X
-        // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
-        "movq (%%"REG_a", %1, 4), %%mm6         \n\t" //        1
-        PAVGB(%%mm7, %%mm6)                           //        11        /2
-        PAVGB(%%mm4, %%mm6)                           // 11     11        /4
-        PAVGB(%%mm3, %%mm6)                           // 11   2211        /8
-        PAVGB(%%mm5, %%mm2)                           //   11                /2
-        "movq (%0, %1, 4), %%mm4                \n\t" //     1
-        PAVGB(%%mm4, %%mm2)                           //   112                /4
-        PAVGB(%%mm2, %%mm6)                           // 112242211        /16
-        "movq %%mm6, (%0, %1, 4)                \n\t" //     X
-        // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
-        PAVGB(%%mm7, %%mm1)                           //  11     2        /4
-        PAVGB(%%mm4, %%mm5)                           //    11                /2
-        PAVGB(%%mm5, %%mm0)                           //    11 11        /4
-        "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //      1
-        PAVGB(%%mm6, %%mm1)                           //  11  4  2        /8
-        PAVGB(%%mm0, %%mm1)                           //  11224222        /16
-        "movq %%mm1, (%%"REG_a", %1, 2)         \n\t" //      X
-        // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
-        PAVGB((%%REGc), %%mm2)                        //   112 4        /8
-        "movq (%%"REG_a", %1, 4), %%mm0         \n\t" //        1
-        PAVGB(%%mm0, %%mm6)                           //      1 1        /2
-        PAVGB(%%mm7, %%mm6)                           //      1 12        /4
-        PAVGB(%%mm2, %%mm6)                           //   1122424        /4
-        "movq %%mm6, (%%"REG_c")                \n\t" //       X
-        // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
-        PAVGB(%%mm7, %%mm5)                           //    11   2        /4
-        PAVGB(%%mm7, %%mm5)                           //    11   6        /8
-
-        PAVGB(%%mm3, %%mm0)                           //      112        /4
-        PAVGB(%%mm0, %%mm5)                           //    112246        /16
-        "movq %%mm5, (%%"REG_a", %1, 4)         \n\t" //        X
-        "sub %1, %0                             \n\t"
-
-        :
-        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
-        : "%"REG_a, "%"REG_c
-    );
-#else //HAVE_MMX2 || HAVE_AMD3DNOW
-    const int l1= stride;
-    const int l2= stride + l1;
-    const int l3= stride + l2;
-    const int l4= stride + l3;
-    const int l5= stride + l4;
-    const int l6= stride + l5;
-    const int l7= stride + l6;
-    const int l8= stride + l7;
-    const int l9= stride + l8;
-    int x;
-    src+= stride*3;
-    for(x=0; x<BLOCK_SIZE; x++){
-        const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
-        const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
-
-        int sums[10];
-        sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
-        sums[1] = sums[0] - first  + src[l4];
-        sums[2] = sums[1] - first  + src[l5];
-        sums[3] = sums[2] - first  + src[l6];
-        sums[4] = sums[3] - first  + src[l7];
-        sums[5] = sums[4] - src[l1] + src[l8];
-        sums[6] = sums[5] - src[l2] + last;
-        sums[7] = sums[6] - src[l3] + last;
-        sums[8] = sums[7] - src[l4] + last;
-        sums[9] = sums[8] - src[l5] + last;
-
-        src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
-        src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
-        src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
-        src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
-        src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
-        src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
-        src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
-        src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
-
-        src++;
-    }
-#endif //HAVE_MMX2 || HAVE_AMD3DNOW
-}
-#endif //HAVE_ALTIVEC
-
-/**
- * Experimental Filter 1
- * will not damage linear gradients
- * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
- * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
- * MMX2 version does correct clipping C version does not
- */
-static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
-{
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-    src+= stride*3;
-
-    __asm__ volatile(
-        "pxor %%mm7, %%mm7                      \n\t" // 0
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
-//      0       1       2       3       4       5       6       7       8       9
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
-        "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
-        "movq (%0, %1, 4), %%mm1                \n\t" // line 4
-        "movq %%mm1, %%mm2                      \n\t" // line 4
-        "psubusb %%mm0, %%mm1                   \n\t"
-        "psubusb %%mm2, %%mm0                   \n\t"
-        "por %%mm1, %%mm0                       \n\t" // |l2 - l3|
-        "movq (%%"REG_c"), %%mm3                \n\t" // line 5
-        "movq (%%"REG_c", %1), %%mm4            \n\t" // line 6
-        "movq %%mm3, %%mm5                      \n\t" // line 5
-        "psubusb %%mm4, %%mm3                   \n\t"
-        "psubusb %%mm5, %%mm4                   \n\t"
-        "por %%mm4, %%mm3                       \n\t" // |l5 - l6|
-        PAVGB(%%mm3, %%mm0)                           // (|l2 - l3| + |l5 - l6|)/2
-        "movq %%mm2, %%mm1                      \n\t" // line 4
-        "psubusb %%mm5, %%mm2                   \n\t"
-        "movq %%mm2, %%mm4                      \n\t"
-        "pcmpeqb %%mm7, %%mm2                   \n\t" // (l4 - l5) <= 0 ? -1 : 0
-        "psubusb %%mm1, %%mm5                   \n\t"
-        "por %%mm5, %%mm4                       \n\t" // |l4 - l5|
-        "psubusb %%mm0, %%mm4                   \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2)
-        "movq %%mm4, %%mm3                      \n\t" // d
-        "movq %2, %%mm0                         \n\t"
-        "paddusb %%mm0, %%mm0                   \n\t"
-        "psubusb %%mm0, %%mm4                   \n\t"
-        "pcmpeqb %%mm7, %%mm4                   \n\t" // d <= QP ? -1 : 0
-        "psubusb "MANGLE(b01)", %%mm3           \n\t"
-        "pand %%mm4, %%mm3                      \n\t" // d <= QP ? d : 0
-
-        PAVGB(%%mm7, %%mm3)                           // d/2
-        "movq %%mm3, %%mm1                      \n\t" // d/2
-        PAVGB(%%mm7, %%mm3)                           // d/4
-        PAVGB(%%mm1, %%mm3)                           // 3*d/8
-
-        "movq (%0, %1, 4), %%mm0                \n\t" // line 4
-        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
-        "psubusb %%mm3, %%mm0                   \n\t"
-        "pxor %%mm2, %%mm0                      \n\t"
-        "movq %%mm0, (%0, %1, 4)                \n\t" // line 4
-
-        "movq (%%"REG_c"), %%mm0                \n\t" // line 5
-        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
-        "paddusb %%mm3, %%mm0                   \n\t"
-        "pxor %%mm2, %%mm0                      \n\t"
-        "movq %%mm0, (%%"REG_c")                \n\t" // line 5
-
-        PAVGB(%%mm7, %%mm1)                           // d/4
-
-        "movq (%%"REG_a", %1, 2), %%mm0         \n\t" // line 3
-        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
-        "psubusb %%mm1, %%mm0                   \n\t"
-        "pxor %%mm2, %%mm0                      \n\t"
-        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t" // line 3
-
-        "movq (%%"REG_c", %1), %%mm0            \n\t" // line 6
-        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
-        "paddusb %%mm1, %%mm0                   \n\t"
-        "pxor %%mm2, %%mm0                      \n\t"
-        "movq %%mm0, (%%"REG_c", %1)            \n\t" // line 6
-
-        PAVGB(%%mm7, %%mm1)                           // d/8
-
-        "movq (%%"REG_a", %1), %%mm0            \n\t" // line 2
-        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
-        "psubusb %%mm1, %%mm0                   \n\t"
-        "pxor %%mm2, %%mm0                      \n\t"
-        "movq %%mm0, (%%"REG_a", %1)            \n\t" // line 2
-
-        "movq (%%"REG_c", %1, 2), %%mm0         \n\t" // line 7
-        "pxor %%mm2, %%mm0                      \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
-        "paddusb %%mm1, %%mm0                   \n\t"
-        "pxor %%mm2, %%mm0                      \n\t"
-        "movq %%mm0, (%%"REG_c", %1, 2)         \n\t" // line 7
-
-        :
-        : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
-        : "%"REG_a, "%"REG_c
-    );
-#else //HAVE_MMX2 || HAVE_AMD3DNOW
-
-    const int l1= stride;
-    const int l2= stride + l1;
-    const int l3= stride + l2;
-    const int l4= stride + l3;
-    const int l5= stride + l4;
-    const int l6= stride + l5;
-    const int l7= stride + l6;
-//    const int l8= stride + l7;
-//    const int l9= stride + l8;
-    int x;
-
-    src+= stride*3;
-    for(x=0; x<BLOCK_SIZE; x++){
-        int a= src[l3] - src[l4];
-        int b= src[l4] - src[l5];
-        int c= src[l5] - src[l6];
-
-        int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
-        d= FFMAX(d, 0);
-
-        if(d < co->QP*2){
-            int v = d * FFSIGN(-b);
-
-            src[l2] +=v>>3;
-            src[l3] +=v>>2;
-            src[l4] +=(3*v)>>3;
-            src[l5] -=(3*v)>>3;
-            src[l6] -=v>>2;
-            src[l7] -=v>>3;
-        }
-        src++;
-    }
-#endif //HAVE_MMX2 || HAVE_AMD3DNOW
-}
-
-#if !HAVE_ALTIVEC
-static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
-{
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-/*
-    uint8_t tmp[16];
-    const int l1= stride;
-    const int l2= stride + l1;
-    const int l3= stride + l2;
-    const int l4= (int)tmp - (int)src - stride*3;
-    const int l5= (int)tmp - (int)src - stride*3 + 8;
-    const int l6= stride*3 + l3;
-    const int l7= stride + l6;
-    const int l8= stride + l7;
-
-    memcpy(tmp, src+stride*7, 8);
-    memcpy(tmp+8, src+stride*8, 8);
-*/
-    src+= stride*4;
-    __asm__ volatile(
-
-#if 0 //slightly more accurate and slightly slower
-        "pxor %%mm7, %%mm7                      \n\t" // 0
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
-//      0       1       2       3       4       5       6       7
-//      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
-
-
-        "movq (%0, %1, 2), %%mm0                \n\t" // l2
-        "movq (%0), %%mm1                       \n\t" // l0
-        "movq %%mm0, %%mm2                      \n\t" // l2
-        PAVGB(%%mm7, %%mm0)                           // ~l2/2
-        PAVGB(%%mm1, %%mm0)                           // ~(l2 + 2l0)/4
-        PAVGB(%%mm2, %%mm0)                           // ~(5l2 + 2l0)/8
-
-        "movq (%%"REG_a"), %%mm1                \n\t" // l1
-        "movq (%%"REG_a", %1, 2), %%mm3         \n\t" // l3
-        "movq %%mm1, %%mm4                      \n\t" // l1
-        PAVGB(%%mm7, %%mm1)                           // ~l1/2
-        PAVGB(%%mm3, %%mm1)                           // ~(l1 + 2l3)/4
-        PAVGB(%%mm4, %%mm1)                           // ~(5l1 + 2l3)/8
-
-        "movq %%mm0, %%mm4                      \n\t" // ~(5l2 + 2l0)/8
-        "psubusb %%mm1, %%mm0                   \n\t"
-        "psubusb %%mm4, %%mm1                   \n\t"
-        "por %%mm0, %%mm1                       \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
-// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
-
-        "movq (%0, %1, 4), %%mm0                \n\t" // l4
-        "movq %%mm0, %%mm4                      \n\t" // l4
-        PAVGB(%%mm7, %%mm0)                           // ~l4/2
-        PAVGB(%%mm2, %%mm0)                           // ~(l4 + 2l2)/4
-        PAVGB(%%mm4, %%mm0)                           // ~(5l4 + 2l2)/8
-
-        "movq (%%"REG_c"), %%mm2                \n\t" // l5
-        "movq %%mm3, %%mm5                      \n\t" // l3
-        PAVGB(%%mm7, %%mm3)                           // ~l3/2
-        PAVGB(%%mm2, %%mm3)                           // ~(l3 + 2l5)/4
-        PAVGB(%%mm5, %%mm3)                           // ~(5l3 + 2l5)/8
-
-        "movq %%mm0, %%mm6                      \n\t" // ~(5l4 + 2l2)/8
-        "psubusb %%mm3, %%mm0                   \n\t"
-        "psubusb %%mm6, %%mm3                   \n\t"
-        "por %%mm0, %%mm3                       \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
-        "pcmpeqb %%mm7, %%mm0                   \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
-// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
-
-        "movq (%%"REG_c", %1), %%mm6            \n\t" // l6
-        "movq %%mm6, %%mm5                      \n\t" // l6
-        PAVGB(%%mm7, %%mm6)                           // ~l6/2
-        PAVGB(%%mm4, %%mm6)                           // ~(l6 + 2l4)/4
-        PAVGB(%%mm5, %%mm6)                           // ~(5l6 + 2l4)/8
-
-        "movq (%%"REG_c", %1, 2), %%mm5         \n\t" // l7
-        "movq %%mm2, %%mm4                      \n\t" // l5
-        PAVGB(%%mm7, %%mm2)                           // ~l5/2
-        PAVGB(%%mm5, %%mm2)                           // ~(l5 + 2l7)/4
-        PAVGB(%%mm4, %%mm2)                           // ~(5l5 + 2l7)/8
-
-        "movq %%mm6, %%mm4                      \n\t" // ~(5l6 + 2l4)/8
-        "psubusb %%mm2, %%mm6                   \n\t"
-        "psubusb %%mm4, %%mm2                   \n\t"
-        "por %%mm6, %%mm2                       \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
-// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
-
-
-        PMINUB(%%mm2, %%mm1, %%mm4)                   // MIN(|lenergy|,|renergy|)/8
-        "movq %2, %%mm4                         \n\t" // QP //FIXME QP+1 ?
-        "paddusb "MANGLE(b01)", %%mm4           \n\t"
-        "pcmpgtb %%mm3, %%mm4                   \n\t" // |menergy|/8 < QP
-        "psubusb %%mm1, %%mm3                   \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
-        "pand %%mm4, %%mm3                      \n\t"
-
-        "movq %%mm3, %%mm1                      \n\t"
-//        "psubusb "MANGLE(b01)", %%mm3           \n\t"
-        PAVGB(%%mm7, %%mm3)
-        PAVGB(%%mm7, %%mm3)
-        "paddusb %%mm1, %%mm3                   \n\t"
-//        "paddusb "MANGLE(b01)", %%mm3           \n\t"
-
-        "movq (%%"REG_a", %1, 2), %%mm6         \n\t" //l3
-        "movq (%0, %1, 4), %%mm5                \n\t" //l4
-        "movq (%0, %1, 4), %%mm4                \n\t" //l4
-        "psubusb %%mm6, %%mm5                   \n\t"
-        "psubusb %%mm4, %%mm6                   \n\t"
-        "por %%mm6, %%mm5                       \n\t" // |l3-l4|
-        "pcmpeqb %%mm7, %%mm6                   \n\t" // SIGN(l3-l4)
-        "pxor %%mm6, %%mm0                      \n\t"
-        "pand %%mm0, %%mm3                      \n\t"
-        PMINUB(%%mm5, %%mm3, %%mm0)
-
-        "psubusb "MANGLE(b01)", %%mm3           \n\t"
-        PAVGB(%%mm7, %%mm3)
-
-        "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
-        "movq (%0, %1, 4), %%mm2                \n\t"
-        "pxor %%mm6, %%mm0                      \n\t"
-        "pxor %%mm6, %%mm2                      \n\t"
-        "psubb %%mm3, %%mm0                     \n\t"
-        "paddb %%mm3, %%mm2                     \n\t"
-        "pxor %%mm6, %%mm0                      \n\t"
-        "pxor %%mm6, %%mm2                      \n\t"
-        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
-        "movq %%mm2, (%0, %1, 4)                \n\t"
-#endif //0
-
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "pcmpeqb %%mm6, %%mm6                   \n\t" // -1
-//      0       1       2       3       4       5       6       7
-//      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 ecx+%1  ecx+2%1
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1
-
-
-        "movq (%%"REG_a", %1, 2), %%mm1         \n\t" // l3
-        "movq (%0, %1, 4), %%mm0                \n\t" // l4
-        "pxor %%mm6, %%mm1                      \n\t" // -l3-1
-        PAVGB(%%mm1, %%mm0)                           // -q+128 = (l4-l3+256)/2
-// mm1=-l3-1, mm0=128-q
-
-        "movq (%%"REG_a", %1, 4), %%mm2         \n\t" // l5
-        "movq (%%"REG_a", %1), %%mm3            \n\t" // l2
-        "pxor %%mm6, %%mm2                      \n\t" // -l5-1
-        "movq %%mm2, %%mm5                      \n\t" // -l5-1
-        "movq "MANGLE(b80)", %%mm4              \n\t" // 128
-        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
-        PAVGB(%%mm3, %%mm2)                           // (l2-l5+256)/2
-        PAVGB(%%mm0, %%mm4)                           // ~(l4-l3)/4 + 128
-        PAVGB(%%mm2, %%mm4)                           // ~(l2-l5)/4 +(l4-l3)/8 + 128
-        PAVGB(%%mm0, %%mm4)                           // ~(l2-l5)/8 +5(l4-l3)/16 + 128
-// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
-
-        "movq (%%"REG_a"), %%mm2                \n\t" // l1
-        "pxor %%mm6, %%mm2                      \n\t" // -l1-1
-        PAVGB(%%mm3, %%mm2)                           // (l2-l1+256)/2
-        PAVGB((%0), %%mm1)                            // (l0-l3+256)/2
-        "movq "MANGLE(b80)", %%mm3              \n\t" // 128
-        PAVGB(%%mm2, %%mm3)                           // ~(l2-l1)/4 + 128
-        PAVGB(%%mm1, %%mm3)                           // ~(l0-l3)/4 +(l2-l1)/8 + 128
-        PAVGB(%%mm2, %%mm3)                           // ~(l0-l3)/8 +5(l2-l1)/16 + 128
-// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
-
-        PAVGB((%%REGc, %1), %%mm5)                    // (l6-l5+256)/2
-        "movq (%%"REG_c", %1, 2), %%mm1         \n\t" // l7
-        "pxor %%mm6, %%mm1                      \n\t" // -l7-1
-        PAVGB((%0, %1, 4), %%mm1)                     // (l4-l7+256)/2
-        "movq "MANGLE(b80)", %%mm2              \n\t" // 128
-        PAVGB(%%mm5, %%mm2)                           // ~(l6-l5)/4 + 128
-        PAVGB(%%mm1, %%mm2)                           // ~(l4-l7)/4 +(l6-l5)/8 + 128
-        PAVGB(%%mm5, %%mm2)                           // ~(l4-l7)/8 +5(l6-l5)/16 + 128
-// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
-
-        "movq "MANGLE(b00)", %%mm1              \n\t" // 0
-        "movq "MANGLE(b00)", %%mm5              \n\t" // 0
-        "psubb %%mm2, %%mm1                     \n\t" // 128 - renergy/16
-        "psubb %%mm3, %%mm5                     \n\t" // 128 - lenergy/16
-        PMAXUB(%%mm1, %%mm2)                          // 128 + |renergy/16|
-        PMAXUB(%%mm5, %%mm3)                          // 128 + |lenergy/16|
-        PMINUB(%%mm2, %%mm3, %%mm1)                   // 128 + MIN(|lenergy|,|renergy|)/16
-
-// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
-
-        "movq "MANGLE(b00)", %%mm7              \n\t" // 0
-        "movq %2, %%mm2                         \n\t" // QP
-        PAVGB(%%mm6, %%mm2)                           // 128 + QP/2
-        "psubb %%mm6, %%mm2                     \n\t"
-
-        "movq %%mm4, %%mm1                      \n\t"
-        "pcmpgtb %%mm7, %%mm1                   \n\t" // SIGN(menergy)
-        "pxor %%mm1, %%mm4                      \n\t"
-        "psubb %%mm1, %%mm4                     \n\t" // 128 + |menergy|/16
-        "pcmpgtb %%mm4, %%mm2                   \n\t" // |menergy|/16 < QP/2
-        "psubusb %%mm3, %%mm4                   \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
-// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
-
-        "movq %%mm4, %%mm3                      \n\t" // d
-        "psubusb "MANGLE(b01)", %%mm4           \n\t"
-        PAVGB(%%mm7, %%mm4)                           // d/32
-        PAVGB(%%mm7, %%mm4)                           // (d + 32)/64
-        "paddb %%mm3, %%mm4                     \n\t" // 5d/64
-        "pand %%mm2, %%mm4                      \n\t"
-
-        "movq "MANGLE(b80)", %%mm5              \n\t" // 128
-        "psubb %%mm0, %%mm5                     \n\t" // q
-        "paddsb %%mm6, %%mm5                    \n\t" // fix bad rounding
-        "pcmpgtb %%mm5, %%mm7                   \n\t" // SIGN(q)
-        "pxor %%mm7, %%mm5                      \n\t"
-
-        PMINUB(%%mm5, %%mm4, %%mm3)                   // MIN(|q|, 5d/64)
-        "pxor %%mm1, %%mm7                      \n\t" // SIGN(d*q)
-
-        "pand %%mm7, %%mm4                      \n\t"
-        "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
-        "movq (%0, %1, 4), %%mm2                \n\t"
-        "pxor %%mm1, %%mm0                      \n\t"
-        "pxor %%mm1, %%mm2                      \n\t"
-        "paddb %%mm4, %%mm0                     \n\t"
-        "psubb %%mm4, %%mm2                     \n\t"
-        "pxor %%mm1, %%mm0                      \n\t"
-        "pxor %%mm1, %%mm2                      \n\t"
-        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
-        "movq %%mm2, (%0, %1, 4)                \n\t"
-
-        :
-        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
-        : "%"REG_a, "%"REG_c
-    );
-
-/*
-    {
-    int x;
-    src-= stride;
-    for(x=0; x<BLOCK_SIZE; x++){
-        const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
-        if(FFABS(middleEnergy)< 8*QP){
-            const int q=(src[l4] - src[l5])/2;
-            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
-            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
-
-            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
-            d= FFMAX(d, 0);
-
-            d= (5*d + 32) >> 6;
-            d*= FFSIGN(-middleEnergy);
-
-            if(q>0){
-                d= d<0 ? 0 : d;
-                d= d>q ? q : d;
-            }else{
-                d= d>0 ? 0 : d;
-                d= d<q ? q : d;
-            }
-
-            src[l4]-= d;
-            src[l5]+= d;
-        }
-        src++;
-    }
-    src-=8;
-    for(x=0; x<8; x++){
-        int y;
-        for(y=4; y<6; y++){
-            int d= src[x+y*stride] - tmp[x+(y-4)*8];
-            int ad= FFABS(d);
-            static int max=0;
-            static int sum=0;
-            static int num=0;
-            static int bias=0;
-
-            if(max<ad) max=ad;
-            sum+= ad>3 ? 1 : 0;
-            if(ad>3){
-                src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
-            }
-            if(y==4) bias+=d;
-            num++;
-            if(num%1000000 == 0){
-                av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
-            }
-        }
-    }
-}
-*/
-#elif HAVE_MMX
-    src+= stride*4;
-    __asm__ volatile(
-        "pxor %%mm7, %%mm7                      \n\t"
-        "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
-        "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
-//      0       1       2       3       4       5       6       7
-//      %0      %0+%1   %0+2%1  eax+2%1 %0+4%1  eax+4%1 edx+%1  edx+2%1
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1
-
-        "movq (%0), %%mm0                       \n\t"
-        "movq %%mm0, %%mm1                      \n\t"
-        "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
-        "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
-
-        "movq (%0, %1), %%mm2                   \n\t"
-        "lea (%0, %1, 2), %%"REG_a"             \n\t"
-        "movq %%mm2, %%mm3                      \n\t"
-        "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
-        "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
-
-        "movq (%%"REG_a"), %%mm4                \n\t"
-        "movq %%mm4, %%mm5                      \n\t"
-        "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
-        "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
-
-        "paddw %%mm0, %%mm0                     \n\t" // 2L0
-        "paddw %%mm1, %%mm1                     \n\t" // 2H0
-        "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
-        "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
-        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
-        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
-
-        "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
-        "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
-        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
-        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
-
-        "movq (%%"REG_a", %1), %%mm2            \n\t"
-        "movq %%mm2, %%mm3                      \n\t"
-        "punpcklbw %%mm7, %%mm2                 \n\t" // L3
-        "punpckhbw %%mm7, %%mm3                 \n\t" // H3
-
-        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
-        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
-        "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
-        "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
-        "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
-        "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
-
-        "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
-        "movq %%mm0, %%mm1                      \n\t"
-        "punpcklbw %%mm7, %%mm0                 \n\t" // L4
-        "punpckhbw %%mm7, %%mm1                 \n\t" // H4
-
-        "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
-        "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
-        "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
-        "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
-        "paddw %%mm4, %%mm4                     \n\t" // 2L2
-        "paddw %%mm5, %%mm5                     \n\t" // 2H2
-        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
-        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
-
-        "lea (%%"REG_a", %1), %0                \n\t"
-        "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
-        "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
-        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
-        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
-//50 opcodes so far
-        "movq (%0, %1, 2), %%mm2                \n\t"
-        "movq %%mm2, %%mm3                      \n\t"
-        "punpcklbw %%mm7, %%mm2                 \n\t" // L5
-        "punpckhbw %%mm7, %%mm3                 \n\t" // H5
-        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
-        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
-        "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
-        "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
-
-        "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
-        "punpcklbw %%mm7, %%mm6                 \n\t" // L6
-        "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
-        "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
-        "punpckhbw %%mm7, %%mm6                 \n\t" // H6
-        "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
-
-        "paddw %%mm0, %%mm0                     \n\t" // 2L4
-        "paddw %%mm1, %%mm1                     \n\t" // 2H4
-        "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
-        "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
-
-        "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
-        "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
-        "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
-        "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
-
-        "movq (%0, %1, 4), %%mm2                \n\t"
-        "movq %%mm2, %%mm3                      \n\t"
-        "punpcklbw %%mm7, %%mm2                 \n\t" // L7
-        "punpckhbw %%mm7, %%mm3                 \n\t" // H7
-
-        "paddw %%mm2, %%mm2                     \n\t" // 2L7
-        "paddw %%mm3, %%mm3                     \n\t" // 2H7
-        "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
-        "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
-
-        "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
-        "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
-
-#if HAVE_MMX2
-        "movq %%mm7, %%mm6                      \n\t" // 0
-        "psubw %%mm0, %%mm6                     \n\t"
-        "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
-        "movq %%mm7, %%mm6                      \n\t" // 0
-        "psubw %%mm1, %%mm6                     \n\t"
-        "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
-        "movq %%mm7, %%mm6                      \n\t" // 0
-        "psubw %%mm2, %%mm6                     \n\t"
-        "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
-        "movq %%mm7, %%mm6                      \n\t" // 0
-        "psubw %%mm3, %%mm6                     \n\t"
-        "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
-#else
-        "movq %%mm7, %%mm6                      \n\t" // 0
-        "pcmpgtw %%mm0, %%mm6                   \n\t"
-        "pxor %%mm6, %%mm0                      \n\t"
-        "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
-        "movq %%mm7, %%mm6                      \n\t" // 0
-        "pcmpgtw %%mm1, %%mm6                   \n\t"
-        "pxor %%mm6, %%mm1                      \n\t"
-        "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
-        "movq %%mm7, %%mm6                      \n\t" // 0
-        "pcmpgtw %%mm2, %%mm6                   \n\t"
-        "pxor %%mm6, %%mm2                      \n\t"
-        "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
-        "movq %%mm7, %%mm6                      \n\t" // 0
-        "pcmpgtw %%mm3, %%mm6                   \n\t"
-        "pxor %%mm6, %%mm3                      \n\t"
-        "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
-#endif
-
-#if HAVE_MMX2
-        "pminsw %%mm2, %%mm0                    \n\t"
-        "pminsw %%mm3, %%mm1                    \n\t"
-#else
-        "movq %%mm0, %%mm6                      \n\t"
-        "psubusw %%mm2, %%mm6                   \n\t"
-        "psubw %%mm6, %%mm0                     \n\t"
-        "movq %%mm1, %%mm6                      \n\t"
-        "psubusw %%mm3, %%mm6                   \n\t"
-        "psubw %%mm6, %%mm1                     \n\t"
-#endif
-
-        "movd %2, %%mm2                         \n\t" // QP
-        "punpcklbw %%mm7, %%mm2                 \n\t"
-
-        "movq %%mm7, %%mm6                      \n\t" // 0
-        "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
-        "pxor %%mm6, %%mm4                      \n\t"
-        "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
-        "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
-        "pxor %%mm7, %%mm5                      \n\t"
-        "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
-// 100 opcodes
-        "psllw $3, %%mm2                        \n\t" // 8QP
-        "movq %%mm2, %%mm3                      \n\t" // 8QP
-        "pcmpgtw %%mm4, %%mm2                   \n\t"
-        "pcmpgtw %%mm5, %%mm3                   \n\t"
-        "pand %%mm2, %%mm4                      \n\t"
-        "pand %%mm3, %%mm5                      \n\t"
-
-
-        "psubusw %%mm0, %%mm4                   \n\t" // hd
-        "psubusw %%mm1, %%mm5                   \n\t" // ld
-
-
-        "movq "MANGLE(w05)", %%mm2              \n\t" // 5
-        "pmullw %%mm2, %%mm4                    \n\t"
-        "pmullw %%mm2, %%mm5                    \n\t"
-        "movq "MANGLE(w20)", %%mm2              \n\t" // 32
-        "paddw %%mm2, %%mm4                     \n\t"
-        "paddw %%mm2, %%mm5                     \n\t"
-        "psrlw $6, %%mm4                        \n\t"
-        "psrlw $6, %%mm5                        \n\t"
-
-        "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
-        "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
-
-        "pxor %%mm2, %%mm2                      \n\t"
-        "pxor %%mm3, %%mm3                      \n\t"
-
-        "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
-        "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
-        "pxor %%mm2, %%mm0                      \n\t"
-        "pxor %%mm3, %%mm1                      \n\t"
-        "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
-        "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
-        "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
-        "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
-
-        "pxor %%mm6, %%mm2                      \n\t"
-        "pxor %%mm7, %%mm3                      \n\t"
-        "pand %%mm2, %%mm4                      \n\t"
-        "pand %%mm3, %%mm5                      \n\t"
-
-#if HAVE_MMX2
-        "pminsw %%mm0, %%mm4                    \n\t"
-        "pminsw %%mm1, %%mm5                    \n\t"
-#else
-        "movq %%mm4, %%mm2                      \n\t"
-        "psubusw %%mm0, %%mm2                   \n\t"
-        "psubw %%mm2, %%mm4                     \n\t"
-        "movq %%mm5, %%mm2                      \n\t"
-        "psubusw %%mm1, %%mm2                   \n\t"
-        "psubw %%mm2, %%mm5                     \n\t"
-#endif
-        "pxor %%mm6, %%mm4                      \n\t"
-        "pxor %%mm7, %%mm5                      \n\t"
-        "psubw %%mm6, %%mm4                     \n\t"
-        "psubw %%mm7, %%mm5                     \n\t"
-        "packsswb %%mm5, %%mm4                  \n\t"
-        "movq (%0), %%mm0                       \n\t"
-        "paddb   %%mm4, %%mm0                   \n\t"
-        "movq %%mm0, (%0)                       \n\t"
-        "movq (%0, %1), %%mm0                   \n\t"
-        "psubb %%mm4, %%mm0                     \n\t"
-        "movq %%mm0, (%0, %1)                   \n\t"
-
-        : "+r" (src)
-        : "r" ((x86_reg)stride), "m" (c->pQPb)
-        : "%"REG_a, "%"REG_c
-    );
-#else //HAVE_MMX2 || HAVE_AMD3DNOW
-    const int l1= stride;
-    const int l2= stride + l1;
-    const int l3= stride + l2;
-    const int l4= stride + l3;
-    const int l5= stride + l4;
-    const int l6= stride + l5;
-    const int l7= stride + l6;
-    const int l8= stride + l7;
-//    const int l9= stride + l8;
-    int x;
-    src+= stride*3;
-    for(x=0; x<BLOCK_SIZE; x++){
-        const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
-        if(FFABS(middleEnergy) < 8*c->QP){
-            const int q=(src[l4] - src[l5])/2;
-            const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
-            const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
-
-            int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
-            d= FFMAX(d, 0);
-
-            d= (5*d + 32) >> 6;
-            d*= FFSIGN(-middleEnergy);
-
-            if(q>0){
-                d= d<0 ? 0 : d;
-                d= d>q ? q : d;
-            }else{
-                d= d>0 ? 0 : d;
-                d= d<q ? q : d;
-            }
-
-            src[l4]-= d;
-            src[l5]+= d;
-        }
-        src++;
-    }
-#endif //HAVE_MMX2 || HAVE_AMD3DNOW
-}
-#endif //HAVE_ALTIVEC
-
-#if !HAVE_ALTIVEC
-static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
-{
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-    __asm__ volatile(
-        "pxor %%mm6, %%mm6                      \n\t"
-        "pcmpeqb %%mm7, %%mm7                   \n\t"
-        "movq %2, %%mm0                         \n\t"
-        "punpcklbw %%mm6, %%mm0                 \n\t"
-        "psrlw $1, %%mm0                        \n\t"
-        "psubw %%mm7, %%mm0                     \n\t"
-        "packuswb %%mm0, %%mm0                  \n\t"
-        "movq %%mm0, %3                         \n\t"
-
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
-
-//        0        1        2        3        4        5        6        7        8        9
-//        %0        eax        eax+%1        eax+2%1        %0+4%1        edx        edx+%1        edx+2%1        %0+8%1        edx+4%1
-
-#undef REAL_FIND_MIN_MAX
-#undef FIND_MIN_MAX
-#if HAVE_MMX2
-#define REAL_FIND_MIN_MAX(addr)\
-        "movq " #addr ", %%mm0                  \n\t"\
-        "pminub %%mm0, %%mm7                    \n\t"\
-        "pmaxub %%mm0, %%mm6                    \n\t"
-#else
-#define REAL_FIND_MIN_MAX(addr)\
-        "movq " #addr ", %%mm0                  \n\t"\
-        "movq %%mm7, %%mm1                      \n\t"\
-        "psubusb %%mm0, %%mm6                   \n\t"\
-        "paddb %%mm0, %%mm6                     \n\t"\
-        "psubusb %%mm0, %%mm1                   \n\t"\
-        "psubb %%mm1, %%mm7                     \n\t"
-#endif
-#define FIND_MIN_MAX(addr)  REAL_FIND_MIN_MAX(addr)
-
-FIND_MIN_MAX((%%REGa))
-FIND_MIN_MAX((%%REGa, %1))
-FIND_MIN_MAX((%%REGa, %1, 2))
-FIND_MIN_MAX((%0, %1, 4))
-FIND_MIN_MAX((%%REGd))
-FIND_MIN_MAX((%%REGd, %1))
-FIND_MIN_MAX((%%REGd, %1, 2))
-FIND_MIN_MAX((%0, %1, 8))
-
-        "movq %%mm7, %%mm4                      \n\t"
-        "psrlq $8, %%mm7                        \n\t"
-#if HAVE_MMX2
-        "pminub %%mm4, %%mm7                    \n\t" // min of pixels
-        "pshufw $0xF9, %%mm7, %%mm4             \n\t"
-        "pminub %%mm4, %%mm7                    \n\t" // min of pixels
-        "pshufw $0xFE, %%mm7, %%mm4             \n\t"
-        "pminub %%mm4, %%mm7                    \n\t"
-#else
-        "movq %%mm7, %%mm1                      \n\t"
-        "psubusb %%mm4, %%mm1                   \n\t"
-        "psubb %%mm1, %%mm7                     \n\t"
-        "movq %%mm7, %%mm4                      \n\t"
-        "psrlq $16, %%mm7                       \n\t"
-        "movq %%mm7, %%mm1                      \n\t"
-        "psubusb %%mm4, %%mm1                   \n\t"
-        "psubb %%mm1, %%mm7                     \n\t"
-        "movq %%mm7, %%mm4                      \n\t"
-        "psrlq $32, %%mm7                       \n\t"
-        "movq %%mm7, %%mm1                      \n\t"
-        "psubusb %%mm4, %%mm1                   \n\t"
-        "psubb %%mm1, %%mm7                     \n\t"
-#endif
-
-
-        "movq %%mm6, %%mm4                      \n\t"
-        "psrlq $8, %%mm6                        \n\t"
-#if HAVE_MMX2
-        "pmaxub %%mm4, %%mm6                    \n\t" // max of pixels
-        "pshufw $0xF9, %%mm6, %%mm4             \n\t"
-        "pmaxub %%mm4, %%mm6                    \n\t"
-        "pshufw $0xFE, %%mm6, %%mm4             \n\t"
-        "pmaxub %%mm4, %%mm6                    \n\t"
-#else
-        "psubusb %%mm4, %%mm6                   \n\t"
-        "paddb %%mm4, %%mm6                     \n\t"
-        "movq %%mm6, %%mm4                      \n\t"
-        "psrlq $16, %%mm6                       \n\t"
-        "psubusb %%mm4, %%mm6                   \n\t"
-        "paddb %%mm4, %%mm6                     \n\t"
-        "movq %%mm6, %%mm4                      \n\t"
-        "psrlq $32, %%mm6                       \n\t"
-        "psubusb %%mm4, %%mm6                   \n\t"
-        "paddb %%mm4, %%mm6                     \n\t"
-#endif
-        "movq %%mm6, %%mm0                      \n\t" // max
-        "psubb %%mm7, %%mm6                     \n\t" // max - min
-        "movd %%mm6, %%ecx                      \n\t"
-        "cmpb "MANGLE(deringThreshold)", %%cl   \n\t"
-        " jb 1f                                 \n\t"
-        "lea -24(%%"REG_SP"), %%"REG_c"         \n\t"
-        "and "ALIGN_MASK", %%"REG_c"            \n\t"
-        PAVGB(%%mm0, %%mm7)                           // a=(max + min)/2
-        "punpcklbw %%mm7, %%mm7                 \n\t"
-        "punpcklbw %%mm7, %%mm7                 \n\t"
-        "punpcklbw %%mm7, %%mm7                 \n\t"
-        "movq %%mm7, (%%"REG_c")                \n\t"
-
-        "movq (%0), %%mm0                       \n\t" // L10
-        "movq %%mm0, %%mm1                      \n\t" // L10
-        "movq %%mm0, %%mm2                      \n\t" // L10
-        "psllq $8, %%mm1                        \n\t"
-        "psrlq $8, %%mm2                        \n\t"
-        "movd -4(%0), %%mm3                     \n\t"
-        "movd 8(%0), %%mm4                      \n\t"
-        "psrlq $24, %%mm3                       \n\t"
-        "psllq $56, %%mm4                       \n\t"
-        "por %%mm3, %%mm1                       \n\t" // L00
-        "por %%mm4, %%mm2                       \n\t" // L20
-        "movq %%mm1, %%mm3                      \n\t" // L00
-        PAVGB(%%mm2, %%mm1)                           // (L20 + L00)/2
-        PAVGB(%%mm0, %%mm1)                           // (L20 + L00 + 2L10)/4
-        "psubusb %%mm7, %%mm0                   \n\t"
-        "psubusb %%mm7, %%mm2                   \n\t"
-        "psubusb %%mm7, %%mm3                   \n\t"
-        "pcmpeqb "MANGLE(b00)", %%mm0           \n\t" // L10 > a ? 0 : -1
-        "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L20 > a ? 0 : -1
-        "pcmpeqb "MANGLE(b00)", %%mm3           \n\t" // L00 > a ? 0 : -1
-        "paddb %%mm2, %%mm0                     \n\t"
-        "paddb %%mm3, %%mm0                     \n\t"
-
-        "movq (%%"REG_a"), %%mm2                \n\t" // L11
-        "movq %%mm2, %%mm3                      \n\t" // L11
-        "movq %%mm2, %%mm4                      \n\t" // L11
-        "psllq $8, %%mm3                        \n\t"
-        "psrlq $8, %%mm4                        \n\t"
-        "movd -4(%%"REG_a"), %%mm5              \n\t"
-        "movd 8(%%"REG_a"), %%mm6               \n\t"
-        "psrlq $24, %%mm5                       \n\t"
-        "psllq $56, %%mm6                       \n\t"
-        "por %%mm5, %%mm3                       \n\t" // L01
-        "por %%mm6, %%mm4                       \n\t" // L21
-        "movq %%mm3, %%mm5                      \n\t" // L01
-        PAVGB(%%mm4, %%mm3)                           // (L21 + L01)/2
-        PAVGB(%%mm2, %%mm3)                           // (L21 + L01 + 2L11)/4
-        "psubusb %%mm7, %%mm2                   \n\t"
-        "psubusb %%mm7, %%mm4                   \n\t"
-        "psubusb %%mm7, %%mm5                   \n\t"
-        "pcmpeqb "MANGLE(b00)", %%mm2           \n\t" // L11 > a ? 0 : -1
-        "pcmpeqb "MANGLE(b00)", %%mm4           \n\t" // L21 > a ? 0 : -1
-        "pcmpeqb "MANGLE(b00)", %%mm5           \n\t" // L01 > a ? 0 : -1
-        "paddb %%mm4, %%mm2                     \n\t"
-        "paddb %%mm5, %%mm2                     \n\t"
-// 0, 2, 3, 1
-#define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
-        "movq " #src ", " #sx "                 \n\t" /* src[0] */\
-        "movq " #sx ", " #lx "                  \n\t" /* src[0] */\
-        "movq " #sx ", " #t0 "                  \n\t" /* src[0] */\
-        "psllq $8, " #lx "                      \n\t"\
-        "psrlq $8, " #t0 "                      \n\t"\
-        "movd -4" #src ", " #t1 "               \n\t"\
-        "psrlq $24, " #t1 "                     \n\t"\
-        "por " #t1 ", " #lx "                   \n\t" /* src[-1] */\
-        "movd 8" #src ", " #t1 "                \n\t"\
-        "psllq $56, " #t1 "                     \n\t"\
-        "por " #t1 ", " #t0 "                   \n\t" /* src[+1] */\
-        "movq " #lx ", " #t1 "                  \n\t" /* src[-1] */\
-        PAVGB(t0, lx)                                 /* (src[-1] + src[+1])/2 */\
-        PAVGB(sx, lx)                                 /* (src[-1] + 2src[0] + src[+1])/4 */\
-        PAVGB(lx, pplx)                                     \
-        "movq " #lx ", 8(%%"REG_c")             \n\t"\
-        "movq (%%"REG_c"), " #lx "              \n\t"\
-        "psubusb " #lx ", " #t1 "               \n\t"\
-        "psubusb " #lx ", " #t0 "               \n\t"\
-        "psubusb " #lx ", " #sx "               \n\t"\
-        "movq "MANGLE(b00)", " #lx "            \n\t"\
-        "pcmpeqb " #lx ", " #t1 "               \n\t" /* src[-1] > a ? 0 : -1*/\
-        "pcmpeqb " #lx ", " #t0 "               \n\t" /* src[+1] > a ? 0 : -1*/\
-        "pcmpeqb " #lx ", " #sx "               \n\t" /* src[0]  > a ? 0 : -1*/\
-        "paddb " #t1 ", " #t0 "                 \n\t"\
-        "paddb " #t0 ", " #sx "                 \n\t"\
-\
-        PAVGB(plx, pplx)                              /* filtered */\
-        "movq " #dst ", " #t0 "                 \n\t" /* dst */\
-        "movq " #t0 ", " #t1 "                  \n\t" /* dst */\
-        "psubusb %3, " #t0 "                    \n\t"\
-        "paddusb %3, " #t1 "                    \n\t"\
-        PMAXUB(t0, pplx)\
-        PMINUB(t1, pplx, t0)\
-        "paddb " #sx ", " #ppsx "               \n\t"\
-        "paddb " #psx ", " #ppsx "              \n\t"\
-        "#paddb "MANGLE(b02)", " #ppsx "        \n\t"\
-        "pand "MANGLE(b08)", " #ppsx "          \n\t"\
-        "pcmpeqb " #lx ", " #ppsx "             \n\t"\
-        "pand " #ppsx ", " #pplx "              \n\t"\
-        "pandn " #dst ", " #ppsx "              \n\t"\
-        "por " #pplx ", " #ppsx "               \n\t"\
-        "movq " #ppsx ", " #dst "               \n\t"\
-        "movq 8(%%"REG_c"), " #lx "             \n\t"
-
-#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
-   REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
-/*
-0000000
-1111111
-
-1111110
-1111101
-1111100
-1111011
-1111010
-1111001
-
-1111000
-1110111
-
-*/
-//DERING_CORE(dst          ,src            ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
-DERING_CORE((%%REGa)       ,(%%REGa, %1)   ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
-DERING_CORE((%%REGa, %1)   ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
-DERING_CORE((%%REGa, %1, 2),(%0, %1, 4)    ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
-DERING_CORE((%0, %1, 4)    ,(%%REGd)       ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
-DERING_CORE((%%REGd)       ,(%%REGd, %1)   ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
-DERING_CORE((%%REGd, %1)   ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
-DERING_CORE((%%REGd, %1, 2),(%0, %1, 8)    ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
-DERING_CORE((%0, %1, 8)    ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
-
-        "1:                        \n\t"
-        : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2)
-        : "%"REG_a, "%"REG_d, "%"REG_c
-    );
-#else //HAVE_MMX2 || HAVE_AMD3DNOW
-    int y;
-    int min=255;
-    int max=0;
-    int avg;
-    uint8_t *p;
-    int s[10];
-    const int QP2= c->QP/2 + 1;
-
-    for(y=1; y<9; y++){
-        int x;
-        p= src + stride*y;
-        for(x=1; x<9; x++){
-            p++;
-            if(*p > max) max= *p;
-            if(*p < min) min= *p;
-        }
-    }
-    avg= (min + max + 1)>>1;
-
-    if(max - min <deringThreshold) return;
-
-    for(y=0; y<10; y++){
-        int t = 0;
-
-        if(src[stride*y + 0] > avg) t+= 1;
-        if(src[stride*y + 1] > avg) t+= 2;
-        if(src[stride*y + 2] > avg) t+= 4;
-        if(src[stride*y + 3] > avg) t+= 8;
-        if(src[stride*y + 4] > avg) t+= 16;
-        if(src[stride*y + 5] > avg) t+= 32;
-        if(src[stride*y + 6] > avg) t+= 64;
-        if(src[stride*y + 7] > avg) t+= 128;
-        if(src[stride*y + 8] > avg) t+= 256;
-        if(src[stride*y + 9] > avg) t+= 512;
-
-        t |= (~t)<<16;
-        t &= (t<<1) & (t>>1);
-        s[y] = t;
-    }
-
-    for(y=1; y<9; y++){
-        int t = s[y-1] & s[y] & s[y+1];
-        t|= t>>16;
-        s[y-1]= t;
-    }
-
-    for(y=1; y<9; y++){
-        int x;
-        int t = s[y-1];
-
-        p= src + stride*y;
-        for(x=1; x<9; x++){
-            p++;
-            if(t & (1<<x)){
-                int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
-                      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
-                      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
-                f= (f + 8)>>4;
-
-#ifdef DEBUG_DERING_THRESHOLD
-                    __asm__ volatile("emms\n\t":);
-                    {
-                    static long long numPixels=0;
-                    if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
-//                    if((max-min)<20 || (max-min)*QP<200)
-//                    if((max-min)*QP < 500)
-//                    if(max-min<QP/2)
-                    if(max-min < 20){
-                        static int numSkipped=0;
-                        static int errorSum=0;
-                        static int worstQP=0;
-                        static int worstRange=0;
-                        static int worstDiff=0;
-                        int diff= (f - *p);
-                        int absDiff= FFABS(diff);
-                        int error= diff*diff;
-
-                        if(x==1 || x==8 || y==1 || y==8) continue;
-
-                        numSkipped++;
-                        if(absDiff > worstDiff){
-                            worstDiff= absDiff;
-                            worstQP= QP;
-                            worstRange= max-min;
-                        }
-                        errorSum+= error;
-
-                        if(1024LL*1024LL*1024LL % numSkipped == 0){
-                            av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
-                                   "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
-                                   (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
-                                   worstDiff, (float)numSkipped/numPixels);
-                        }
-                    }
-                    }
-#endif
-                    if     (*p + QP2 < f) *p= *p + QP2;
-                    else if(*p - QP2 > f) *p= *p - QP2;
-                    else *p=f;
-            }
-        }
-    }
-#ifdef DEBUG_DERING_THRESHOLD
-    if(max-min < 20){
-        for(y=1; y<9; y++){
-            int x;
-            int t = 0;
-            p= src + stride*y;
-            for(x=1; x<9; x++){
-                p++;
-                *p = FFMIN(*p + 20, 255);
-            }
-        }
-//        src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
-    }
-#endif
-#endif //HAVE_MMX2 || HAVE_AMD3DNOW
-}
-#endif //HAVE_ALTIVEC
-
-/**
- * Deinterlace the given block by linearly interpolating every second line.
- * will be called for every 8x8 block and can read & write from line 4-15
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
- */
-static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
-{
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-    src+= 4*stride;
-    __asm__ volatile(
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_c"      \n\t"
-//      0       1       2       3       4       5       6       7       8       9
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1
-
-        "movq (%0), %%mm0                       \n\t"
-        "movq (%%"REG_a", %1), %%mm1            \n\t"
-        PAVGB(%%mm1, %%mm0)
-        "movq %%mm0, (%%"REG_a")                \n\t"
-        "movq (%0, %1, 4), %%mm0                \n\t"
-        PAVGB(%%mm0, %%mm1)
-        "movq %%mm1, (%%"REG_a", %1, 2)         \n\t"
-        "movq (%%"REG_c", %1), %%mm1            \n\t"
-        PAVGB(%%mm1, %%mm0)
-        "movq %%mm0, (%%"REG_c")                \n\t"
-        "movq (%0, %1, 8), %%mm0                \n\t"
-        PAVGB(%%mm0, %%mm1)
-        "movq %%mm1, (%%"REG_c", %1, 2)         \n\t"
-
-        : : "r" (src), "r" ((x86_reg)stride)
-        : "%"REG_a, "%"REG_c
-    );
-#else
-    int a, b, x;
-    src+= 4*stride;
-
-    for(x=0; x<2; x++){
-        a= *(uint32_t*)&src[stride*0];
-        b= *(uint32_t*)&src[stride*2];
-        *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
-        a= *(uint32_t*)&src[stride*4];
-        *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
-        b= *(uint32_t*)&src[stride*6];
-        *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
-        a= *(uint32_t*)&src[stride*8];
-        *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
-        src += 4;
-    }
-#endif
-}
-
-/**
- * Deinterlace the given block by cubic interpolating every second line.
- * will be called for every 8x8 block and can read & write from line 4-15
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
- * this filter will read lines 3-15 and write 7-13
- */
-static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
-{
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-    src+= stride*3;
-    __asm__ volatile(
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
-        "lea (%%"REG_d", %1, 4), %%"REG_c"      \n\t"
-        "add %1, %%"REG_c"                      \n\t"
-        "pxor %%mm7, %%mm7                      \n\t"
-//      0       1       2       3       4       5       6       7       8       9       10
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
-
-#define REAL_DEINT_CUBIC(a,b,c,d,e)\
-        "movq " #a ", %%mm0                     \n\t"\
-        "movq " #b ", %%mm1                     \n\t"\
-        "movq " #d ", %%mm2                     \n\t"\
-        "movq " #e ", %%mm3                     \n\t"\
-        PAVGB(%%mm2, %%mm1)                             /* (b+d) /2 */\
-        PAVGB(%%mm3, %%mm0)                             /* a(a+e) /2 */\
-        "movq %%mm0, %%mm2                      \n\t"\
-        "punpcklbw %%mm7, %%mm0                 \n\t"\
-        "punpckhbw %%mm7, %%mm2                 \n\t"\
-        "movq %%mm1, %%mm3                      \n\t"\
-        "punpcklbw %%mm7, %%mm1                 \n\t"\
-        "punpckhbw %%mm7, %%mm3                 \n\t"\
-        "psubw %%mm1, %%mm0                     \n\t"   /* L(a+e - (b+d))/2 */\
-        "psubw %%mm3, %%mm2                     \n\t"   /* H(a+e - (b+d))/2 */\
-        "psraw $3, %%mm0                        \n\t"   /* L(a+e - (b+d))/16 */\
-        "psraw $3, %%mm2                        \n\t"   /* H(a+e - (b+d))/16 */\
-        "psubw %%mm0, %%mm1                     \n\t"   /* L(9b + 9d - a - e)/16 */\
-        "psubw %%mm2, %%mm3                     \n\t"   /* H(9b + 9d - a - e)/16 */\
-        "packuswb %%mm3, %%mm1                  \n\t"\
-        "movq %%mm1, " #c "                     \n\t"
-#define DEINT_CUBIC(a,b,c,d,e)  REAL_DEINT_CUBIC(a,b,c,d,e)
-
-DEINT_CUBIC((%0)        , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
-DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%0, %1, 8))
-DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
-DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc)    , (%%REGc, %1, 2))
-
-        : : "r" (src), "r" ((x86_reg)stride)
-        : "%"REG_a, "%"REG_d, "%"REG_c
-    );
-#else //HAVE_MMX2 || HAVE_AMD3DNOW
-    int x;
-    src+= stride*3;
-    for(x=0; x<8; x++){
-        src[stride*3] = CLIP((-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
-        src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
-        src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
-        src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
-        src++;
-    }
-#endif //HAVE_MMX2 || HAVE_AMD3DNOW
-}
-
-/**
- * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
- * will be called for every 8x8 block and can read & write from line 4-15
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
- * this filter will read lines 4-13 and write 5-11
- */
-static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
-{
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-    src+= stride*4;
-    __asm__ volatile(
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
-        "pxor %%mm7, %%mm7                      \n\t"
-        "movq (%2), %%mm0                       \n\t"
-//      0       1       2       3       4       5       6       7       8       9       10
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
-
-#define REAL_DEINT_FF(a,b,c,d)\
-        "movq " #a ", %%mm1                     \n\t"\
-        "movq " #b ", %%mm2                     \n\t"\
-        "movq " #c ", %%mm3                     \n\t"\
-        "movq " #d ", %%mm4                     \n\t"\
-        PAVGB(%%mm3, %%mm1)                          \
-        PAVGB(%%mm4, %%mm0)                          \
-        "movq %%mm0, %%mm3                      \n\t"\
-        "punpcklbw %%mm7, %%mm0                 \n\t"\
-        "punpckhbw %%mm7, %%mm3                 \n\t"\
-        "movq %%mm1, %%mm4                      \n\t"\
-        "punpcklbw %%mm7, %%mm1                 \n\t"\
-        "punpckhbw %%mm7, %%mm4                 \n\t"\
-        "psllw $2, %%mm1                        \n\t"\
-        "psllw $2, %%mm4                        \n\t"\
-        "psubw %%mm0, %%mm1                     \n\t"\
-        "psubw %%mm3, %%mm4                     \n\t"\
-        "movq %%mm2, %%mm5                      \n\t"\
-        "movq %%mm2, %%mm0                      \n\t"\
-        "punpcklbw %%mm7, %%mm2                 \n\t"\
-        "punpckhbw %%mm7, %%mm5                 \n\t"\
-        "paddw %%mm2, %%mm1                     \n\t"\
-        "paddw %%mm5, %%mm4                     \n\t"\
-        "psraw $2, %%mm1                        \n\t"\
-        "psraw $2, %%mm4                        \n\t"\
-        "packuswb %%mm4, %%mm1                  \n\t"\
-        "movq %%mm1, " #b "                     \n\t"\
-
-#define DEINT_FF(a,b,c,d)  REAL_DEINT_FF(a,b,c,d)
-
-DEINT_FF((%0)        , (%%REGa)       , (%%REGa, %1), (%%REGa, %1, 2))
-DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd)       )
-DEINT_FF((%0, %1, 4) , (%%REGd)       , (%%REGd, %1), (%%REGd, %1, 2))
-DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
-
-        "movq %%mm0, (%2)                       \n\t"
-        : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
-        : "%"REG_a, "%"REG_d
-    );
-#else //HAVE_MMX2 || HAVE_AMD3DNOW
-    int x;
-    src+= stride*4;
-    for(x=0; x<8; x++){
-        int t1= tmp[x];
-        int t2= src[stride*1];
-
-        src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
-        t1= src[stride*4];
-        src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
-        t2= src[stride*6];
-        src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
-        t1= src[stride*8];
-        src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
-        tmp[x]= t1;
-
-        src++;
-    }
-#endif //HAVE_MMX2 || HAVE_AMD3DNOW
-}
-
-/**
- * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
- * will be called for every 8x8 block and can read & write from line 4-15
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
- * this filter will read lines 4-13 and write 4-11
- */
-static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
-{
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-    src+= stride*4;
-    __asm__ volatile(
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
-        "pxor %%mm7, %%mm7                      \n\t"
-        "movq (%2), %%mm0                       \n\t"
-        "movq (%3), %%mm1                       \n\t"
-//      0       1       2       3       4       5       6       7       8       9       10
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1 ecx
-
-#define REAL_DEINT_L5(t1,t2,a,b,c)\
-        "movq " #a ", %%mm2                     \n\t"\
-        "movq " #b ", %%mm3                     \n\t"\
-        "movq " #c ", %%mm4                     \n\t"\
-        PAVGB(t2, %%mm3)                             \
-        PAVGB(t1, %%mm4)                             \
-        "movq %%mm2, %%mm5                      \n\t"\
-        "movq %%mm2, " #t1 "                    \n\t"\
-        "punpcklbw %%mm7, %%mm2                 \n\t"\
-        "punpckhbw %%mm7, %%mm5                 \n\t"\
-        "movq %%mm2, %%mm6                      \n\t"\
-        "paddw %%mm2, %%mm2                     \n\t"\
-        "paddw %%mm6, %%mm2                     \n\t"\
-        "movq %%mm5, %%mm6                      \n\t"\
-        "paddw %%mm5, %%mm5                     \n\t"\
-        "paddw %%mm6, %%mm5                     \n\t"\
-        "movq %%mm3, %%mm6                      \n\t"\
-        "punpcklbw %%mm7, %%mm3                 \n\t"\
-        "punpckhbw %%mm7, %%mm6                 \n\t"\
-        "paddw %%mm3, %%mm3                     \n\t"\
-        "paddw %%mm6, %%mm6                     \n\t"\
-        "paddw %%mm3, %%mm2                     \n\t"\
-        "paddw %%mm6, %%mm5                     \n\t"\
-        "movq %%mm4, %%mm6                      \n\t"\
-        "punpcklbw %%mm7, %%mm4                 \n\t"\
-        "punpckhbw %%mm7, %%mm6                 \n\t"\
-        "psubw %%mm4, %%mm2                     \n\t"\
-        "psubw %%mm6, %%mm5                     \n\t"\
-        "psraw $2, %%mm2                        \n\t"\
-        "psraw $2, %%mm5                        \n\t"\
-        "packuswb %%mm5, %%mm2                  \n\t"\
-        "movq %%mm2, " #a "                     \n\t"\
-
-#define DEINT_L5(t1,t2,a,b,c)  REAL_DEINT_L5(t1,t2,a,b,c)
-
-DEINT_L5(%%mm0, %%mm1, (%0)           , (%%REGa)       , (%%REGa, %1)   )
-DEINT_L5(%%mm1, %%mm0, (%%REGa)       , (%%REGa, %1)   , (%%REGa, %1, 2))
-DEINT_L5(%%mm0, %%mm1, (%%REGa, %1)   , (%%REGa, %1, 2), (%0, %1, 4)   )
-DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4)    , (%%REGd)       )
-DEINT_L5(%%mm0, %%mm1, (%0, %1, 4)    , (%%REGd)       , (%%REGd, %1)   )
-DEINT_L5(%%mm1, %%mm0, (%%REGd)       , (%%REGd, %1)   , (%%REGd, %1, 2))
-DEINT_L5(%%mm0, %%mm1, (%%REGd, %1)   , (%%REGd, %1, 2), (%0, %1, 8)   )
-DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8)    , (%%REGd, %1, 4))
-
-        "movq %%mm0, (%2)                       \n\t"
-        "movq %%mm1, (%3)                       \n\t"
-        : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
-        : "%"REG_a, "%"REG_d
-    );
-#else //HAVE_MMX2 || HAVE_AMD3DNOW
-    int x;
-    src+= stride*4;
-    for(x=0; x<8; x++){
-        int t1= tmp[x];
-        int t2= tmp2[x];
-        int t3= src[0];
-
-        src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
-        t1= src[stride*1];
-        src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
-        t2= src[stride*2];
-        src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
-        t3= src[stride*3];
-        src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
-        t1= src[stride*4];
-        src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
-        t2= src[stride*5];
-        src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
-        t3= src[stride*6];
-        src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
-        t1= src[stride*7];
-        src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
-
-        tmp[x]= t3;
-        tmp2[x]= t1;
-
-        src++;
-    }
-#endif //HAVE_MMX2 || HAVE_AMD3DNOW
-}
-
-/**
- * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
- * will be called for every 8x8 block and can read & write from line 4-15
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
- * this filter will read lines 4-13 and write 4-11
- */
-static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
-{
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-    src+= 4*stride;
-    __asm__ volatile(
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
-//      0       1       2       3       4       5       6       7       8       9
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
-
-        "movq (%2), %%mm0                       \n\t" // L0
-        "movq (%%"REG_a"), %%mm1                \n\t" // L2
-        PAVGB(%%mm1, %%mm0)                           // L0+L2
-        "movq (%0), %%mm2                       \n\t" // L1
-        PAVGB(%%mm2, %%mm0)
-        "movq %%mm0, (%0)                       \n\t"
-        "movq (%%"REG_a", %1), %%mm0            \n\t" // L3
-        PAVGB(%%mm0, %%mm2)                           // L1+L3
-        PAVGB(%%mm1, %%mm2)                           // 2L2 + L1 + L3
-        "movq %%mm2, (%%"REG_a")                \n\t"
-        "movq (%%"REG_a", %1, 2), %%mm2         \n\t" // L4
-        PAVGB(%%mm2, %%mm1)                           // L2+L4
-        PAVGB(%%mm0, %%mm1)                           // 2L3 + L2 + L4
-        "movq %%mm1, (%%"REG_a", %1)            \n\t"
-        "movq (%0, %1, 4), %%mm1                \n\t" // L5
-        PAVGB(%%mm1, %%mm0)                           // L3+L5
-        PAVGB(%%mm2, %%mm0)                           // 2L4 + L3 + L5
-        "movq %%mm0, (%%"REG_a", %1, 2)         \n\t"
-        "movq (%%"REG_d"), %%mm0                \n\t" // L6
-        PAVGB(%%mm0, %%mm2)                           // L4+L6
-        PAVGB(%%mm1, %%mm2)                           // 2L5 + L4 + L6
-        "movq %%mm2, (%0, %1, 4)                \n\t"
-        "movq (%%"REG_d", %1), %%mm2            \n\t" // L7
-        PAVGB(%%mm2, %%mm1)                           // L5+L7
-        PAVGB(%%mm0, %%mm1)                           // 2L6 + L5 + L7
-        "movq %%mm1, (%%"REG_d")                \n\t"
-        "movq (%%"REG_d", %1, 2), %%mm1         \n\t" // L8
-        PAVGB(%%mm1, %%mm0)                           // L6+L8
-        PAVGB(%%mm2, %%mm0)                           // 2L7 + L6 + L8
-        "movq %%mm0, (%%"REG_d", %1)            \n\t"
-        "movq (%0, %1, 8), %%mm0                \n\t" // L9
-        PAVGB(%%mm0, %%mm2)                           // L7+L9
-        PAVGB(%%mm1, %%mm2)                           // 2L8 + L7 + L9
-        "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
-        "movq %%mm1, (%2)                       \n\t"
-
-        : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
-        : "%"REG_a, "%"REG_d
-    );
-#else //HAVE_MMX2 || HAVE_AMD3DNOW
-    int a, b, c, x;
-    src+= 4*stride;
-
-    for(x=0; x<2; x++){
-        a= *(uint32_t*)&tmp[stride*0];
-        b= *(uint32_t*)&src[stride*0];
-        c= *(uint32_t*)&src[stride*1];
-        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
-        *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
-
-        a= *(uint32_t*)&src[stride*2];
-        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
-        *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
-
-        b= *(uint32_t*)&src[stride*3];
-        c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
-        *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
-
-        c= *(uint32_t*)&src[stride*4];
-        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
-        *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
-
-        a= *(uint32_t*)&src[stride*5];
-        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
-        *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
-
-        b= *(uint32_t*)&src[stride*6];
-        c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
-        *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
-
-        c= *(uint32_t*)&src[stride*7];
-        a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
-        *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
-
-        a= *(uint32_t*)&src[stride*8];
-        b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
-        *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
-
-        *(uint32_t*)&tmp[stride*0]= c;
-        src += 4;
-        tmp += 4;
-    }
-#endif //HAVE_MMX2 || HAVE_AMD3DNOW
-}
-
-/**
- * Deinterlace the given block by applying a median filter to every second line.
- * will be called for every 8x8 block and can read & write from line 4-15,
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
- */
-static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
-{
-#if HAVE_MMX
-    src+= 4*stride;
-#if HAVE_MMX2
-    __asm__ volatile(
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
-//      0       1       2       3       4       5       6       7       8       9
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
-
-        "movq (%0), %%mm0                       \n\t" //
-        "movq (%%"REG_a", %1), %%mm2            \n\t" //
-        "movq (%%"REG_a"), %%mm1                \n\t" //
-        "movq %%mm0, %%mm3                      \n\t"
-        "pmaxub %%mm1, %%mm0                    \n\t" //
-        "pminub %%mm3, %%mm1                    \n\t" //
-        "pmaxub %%mm2, %%mm1                    \n\t" //
-        "pminub %%mm1, %%mm0                    \n\t"
-        "movq %%mm0, (%%"REG_a")                \n\t"
-
-        "movq (%0, %1, 4), %%mm0                \n\t" //
-        "movq (%%"REG_a", %1, 2), %%mm1         \n\t" //
-        "movq %%mm2, %%mm3                      \n\t"
-        "pmaxub %%mm1, %%mm2                    \n\t" //
-        "pminub %%mm3, %%mm1                    \n\t" //
-        "pmaxub %%mm0, %%mm1                    \n\t" //
-        "pminub %%mm1, %%mm2                    \n\t"
-        "movq %%mm2, (%%"REG_a", %1, 2)         \n\t"
-
-        "movq (%%"REG_d"), %%mm2                \n\t" //
-        "movq (%%"REG_d", %1), %%mm1            \n\t" //
-        "movq %%mm2, %%mm3                      \n\t"
-        "pmaxub %%mm0, %%mm2                    \n\t" //
-        "pminub %%mm3, %%mm0                    \n\t" //
-        "pmaxub %%mm1, %%mm0                    \n\t" //
-        "pminub %%mm0, %%mm2                    \n\t"
-        "movq %%mm2, (%%"REG_d")                \n\t"
-
-        "movq (%%"REG_d", %1, 2), %%mm2         \n\t" //
-        "movq (%0, %1, 8), %%mm0                \n\t" //
-        "movq %%mm2, %%mm3                      \n\t"
-        "pmaxub %%mm0, %%mm2                    \n\t" //
-        "pminub %%mm3, %%mm0                    \n\t" //
-        "pmaxub %%mm1, %%mm0                    \n\t" //
-        "pminub %%mm0, %%mm2                    \n\t"
-        "movq %%mm2, (%%"REG_d", %1, 2)         \n\t"
-
-
-        : : "r" (src), "r" ((x86_reg)stride)
-        : "%"REG_a, "%"REG_d
-    );
-
-#else // MMX without MMX2
-    __asm__ volatile(
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a", %1, 4), %%"REG_d"      \n\t"
-//      0       1       2       3       4       5       6       7       8       9
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
-        "pxor %%mm7, %%mm7                      \n\t"
-
-#define REAL_MEDIAN(a,b,c)\
-        "movq " #a ", %%mm0                     \n\t"\
-        "movq " #b ", %%mm2                     \n\t"\
-        "movq " #c ", %%mm1                     \n\t"\
-        "movq %%mm0, %%mm3                      \n\t"\
-        "movq %%mm1, %%mm4                      \n\t"\
-        "movq %%mm2, %%mm5                      \n\t"\
-        "psubusb %%mm1, %%mm3                   \n\t"\
-        "psubusb %%mm2, %%mm4                   \n\t"\
-        "psubusb %%mm0, %%mm5                   \n\t"\
-        "pcmpeqb %%mm7, %%mm3                   \n\t"\
-        "pcmpeqb %%mm7, %%mm4                   \n\t"\
-        "pcmpeqb %%mm7, %%mm5                   \n\t"\
-        "movq %%mm3, %%mm6                      \n\t"\
-        "pxor %%mm4, %%mm3                      \n\t"\
-        "pxor %%mm5, %%mm4                      \n\t"\
-        "pxor %%mm6, %%mm5                      \n\t"\
-        "por %%mm3, %%mm1                       \n\t"\
-        "por %%mm4, %%mm2                       \n\t"\
-        "por %%mm5, %%mm0                       \n\t"\
-        "pand %%mm2, %%mm0                      \n\t"\
-        "pand %%mm1, %%mm0                      \n\t"\
-        "movq %%mm0, " #b "                     \n\t"
-#define MEDIAN(a,b,c)  REAL_MEDIAN(a,b,c)
-
-MEDIAN((%0)        , (%%REGa)       , (%%REGa, %1))
-MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
-MEDIAN((%0, %1, 4) , (%%REGd)       , (%%REGd, %1))
-MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
-
-        : : "r" (src), "r" ((x86_reg)stride)
-        : "%"REG_a, "%"REG_d
-    );
-#endif //HAVE_MMX2
-#else //HAVE_MMX
-    int x, y;
-    src+= 4*stride;
-    // FIXME - there should be a way to do a few columns in parallel like w/mmx
-    for(x=0; x<8; x++){
-        uint8_t *colsrc = src;
-        for (y=0; y<4; y++){
-            int a, b, c, d, e, f;
-            a = colsrc[0       ];
-            b = colsrc[stride  ];
-            c = colsrc[stride*2];
-            d = (a-b)>>31;
-            e = (b-c)>>31;
-            f = (c-a)>>31;
-            colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
-            colsrc += stride*2;
-        }
-        src++;
-    }
-#endif //HAVE_MMX
-}
-
-#if HAVE_MMX
-/**
- * Transpose and shift the given 8x8 Block into dst1 and dst2.
- */
-static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
-{
-    __asm__(
-        "lea (%0, %1), %%"REG_a"                \n\t"
-//      0       1       2       3       4       5       6       7       8       9
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
-        "movq (%0), %%mm0                       \n\t" // 12345678
-        "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
-        "movq %%mm0, %%mm2                      \n\t" // 12345678
-        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
-        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
-
-        "movq (%%"REG_a", %1), %%mm1            \n\t"
-        "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
-        "movq %%mm1, %%mm4                      \n\t"
-        "punpcklbw %%mm3, %%mm1                 \n\t"
-        "punpckhbw %%mm3, %%mm4                 \n\t"
-
-        "movq %%mm0, %%mm3                      \n\t"
-        "punpcklwd %%mm1, %%mm0                 \n\t"
-        "punpckhwd %%mm1, %%mm3                 \n\t"
-        "movq %%mm2, %%mm1                      \n\t"
-        "punpcklwd %%mm4, %%mm2                 \n\t"
-        "punpckhwd %%mm4, %%mm1                 \n\t"
-
-        "movd %%mm0, 128(%2)                    \n\t"
-        "psrlq $32, %%mm0                       \n\t"
-        "movd %%mm0, 144(%2)                    \n\t"
-        "movd %%mm3, 160(%2)                    \n\t"
-        "psrlq $32, %%mm3                       \n\t"
-        "movd %%mm3, 176(%2)                    \n\t"
-        "movd %%mm3, 48(%3)                     \n\t"
-        "movd %%mm2, 192(%2)                    \n\t"
-        "movd %%mm2, 64(%3)                     \n\t"
-        "psrlq $32, %%mm2                       \n\t"
-        "movd %%mm2, 80(%3)                     \n\t"
-        "movd %%mm1, 96(%3)                     \n\t"
-        "psrlq $32, %%mm1                       \n\t"
-        "movd %%mm1, 112(%3)                    \n\t"
-
-        "lea (%%"REG_a", %1, 4), %%"REG_a"      \n\t"
-
-        "movq (%0, %1, 4), %%mm0                \n\t" // 12345678
-        "movq (%%"REG_a"), %%mm1                \n\t" // abcdefgh
-        "movq %%mm0, %%mm2                      \n\t" // 12345678
-        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
-        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
-
-        "movq (%%"REG_a", %1), %%mm1            \n\t"
-        "movq (%%"REG_a", %1, 2), %%mm3         \n\t"
-        "movq %%mm1, %%mm4                      \n\t"
-        "punpcklbw %%mm3, %%mm1                 \n\t"
-        "punpckhbw %%mm3, %%mm4                 \n\t"
-
-        "movq %%mm0, %%mm3                      \n\t"
-        "punpcklwd %%mm1, %%mm0                 \n\t"
-        "punpckhwd %%mm1, %%mm3                 \n\t"
-        "movq %%mm2, %%mm1                      \n\t"
-        "punpcklwd %%mm4, %%mm2                 \n\t"
-        "punpckhwd %%mm4, %%mm1                 \n\t"
-
-        "movd %%mm0, 132(%2)                    \n\t"
-        "psrlq $32, %%mm0                       \n\t"
-        "movd %%mm0, 148(%2)                    \n\t"
-        "movd %%mm3, 164(%2)                    \n\t"
-        "psrlq $32, %%mm3                       \n\t"
-        "movd %%mm3, 180(%2)                    \n\t"
-        "movd %%mm3, 52(%3)                     \n\t"
-        "movd %%mm2, 196(%2)                    \n\t"
-        "movd %%mm2, 68(%3)                     \n\t"
-        "psrlq $32, %%mm2                       \n\t"
-        "movd %%mm2, 84(%3)                     \n\t"
-        "movd %%mm1, 100(%3)                    \n\t"
-        "psrlq $32, %%mm1                       \n\t"
-        "movd %%mm1, 116(%3)                    \n\t"
-
-
-        :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
-        : "%"REG_a
-    );
-}
-
-/**
- * Transpose the given 8x8 block.
- */
-static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
-{
-    __asm__(
-        "lea (%0, %1), %%"REG_a"                \n\t"
-        "lea (%%"REG_a",%1,4), %%"REG_d"        \n\t"
-//      0       1       2       3       4       5       6       7       8       9
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1
-        "movq (%2), %%mm0                       \n\t" // 12345678
-        "movq 16(%2), %%mm1                     \n\t" // abcdefgh
-        "movq %%mm0, %%mm2                      \n\t" // 12345678
-        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
-        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
-
-        "movq 32(%2), %%mm1                     \n\t"
-        "movq 48(%2), %%mm3                     \n\t"
-        "movq %%mm1, %%mm4                      \n\t"
-        "punpcklbw %%mm3, %%mm1                 \n\t"
-        "punpckhbw %%mm3, %%mm4                 \n\t"
-
-        "movq %%mm0, %%mm3                      \n\t"
-        "punpcklwd %%mm1, %%mm0                 \n\t"
-        "punpckhwd %%mm1, %%mm3                 \n\t"
-        "movq %%mm2, %%mm1                      \n\t"
-        "punpcklwd %%mm4, %%mm2                 \n\t"
-        "punpckhwd %%mm4, %%mm1                 \n\t"
-
-        "movd %%mm0, (%0)                       \n\t"
-        "psrlq $32, %%mm0                       \n\t"
-        "movd %%mm0, (%%"REG_a")                \n\t"
-        "movd %%mm3, (%%"REG_a", %1)            \n\t"
-        "psrlq $32, %%mm3                       \n\t"
-        "movd %%mm3, (%%"REG_a", %1, 2)         \n\t"
-        "movd %%mm2, (%0, %1, 4)                \n\t"
-        "psrlq $32, %%mm2                       \n\t"
-        "movd %%mm2, (%%"REG_d")                \n\t"
-        "movd %%mm1, (%%"REG_d", %1)            \n\t"
-        "psrlq $32, %%mm1                       \n\t"
-        "movd %%mm1, (%%"REG_d", %1, 2)         \n\t"
-
-
-        "movq 64(%2), %%mm0                     \n\t" // 12345678
-        "movq 80(%2), %%mm1                     \n\t" // abcdefgh
-        "movq %%mm0, %%mm2                      \n\t" // 12345678
-        "punpcklbw %%mm1, %%mm0                 \n\t" // 1a2b3c4d
-        "punpckhbw %%mm1, %%mm2                 \n\t" // 5e6f7g8h
-
-        "movq 96(%2), %%mm1                     \n\t"
-        "movq 112(%2), %%mm3                    \n\t"
-        "movq %%mm1, %%mm4                      \n\t"
-        "punpcklbw %%mm3, %%mm1                 \n\t"
-        "punpckhbw %%mm3, %%mm4                 \n\t"
-
-        "movq %%mm0, %%mm3                      \n\t"
-        "punpcklwd %%mm1, %%mm0                 \n\t"
-        "punpckhwd %%mm1, %%mm3                 \n\t"
-        "movq %%mm2, %%mm1                      \n\t"
-        "punpcklwd %%mm4, %%mm2                 \n\t"
-        "punpckhwd %%mm4, %%mm1                 \n\t"
-
-        "movd %%mm0, 4(%0)                      \n\t"
-        "psrlq $32, %%mm0                       \n\t"
-        "movd %%mm0, 4(%%"REG_a")               \n\t"
-        "movd %%mm3, 4(%%"REG_a", %1)           \n\t"
-        "psrlq $32, %%mm3                       \n\t"
-        "movd %%mm3, 4(%%"REG_a", %1, 2)        \n\t"
-        "movd %%mm2, 4(%0, %1, 4)               \n\t"
-        "psrlq $32, %%mm2                       \n\t"
-        "movd %%mm2, 4(%%"REG_d")               \n\t"
-        "movd %%mm1, 4(%%"REG_d", %1)           \n\t"
-        "psrlq $32, %%mm1                       \n\t"
-        "movd %%mm1, 4(%%"REG_d", %1, 2)        \n\t"
-
-        :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
-        : "%"REG_a, "%"REG_d
-    );
-}
-#endif //HAVE_MMX
-//static long test=0;
-
-#if !HAVE_ALTIVEC
-static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
-                                    uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
-{
-    // to save a register (FIXME do this outside of the loops)
-    tempBlurredPast[127]= maxNoise[0];
-    tempBlurredPast[128]= maxNoise[1];
-    tempBlurredPast[129]= maxNoise[2];
-
-#define FAST_L2_DIFF
-//#define L1_DIFF //u should change the thresholds too if u try that one
-#if HAVE_MMX2 || HAVE_AMD3DNOW
-    __asm__ volatile(
-        "lea (%2, %2, 2), %%"REG_a"             \n\t" // 3*stride
-        "lea (%2, %2, 4), %%"REG_d"             \n\t" // 5*stride
-        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
-//      0       1       2       3       4       5       6       7       8       9
-//      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+edx  %x+2eax %x+ecx  %x+8%2
-//FIXME reorder?
-#ifdef L1_DIFF //needs mmx2
-        "movq (%0), %%mm0                       \n\t" // L0
-        "psadbw (%1), %%mm0                     \n\t" // |L0-R0|
-        "movq (%0, %2), %%mm1                   \n\t" // L1
-        "psadbw (%1, %2), %%mm1                 \n\t" // |L1-R1|
-        "movq (%0, %2, 2), %%mm2                \n\t" // L2
-        "psadbw (%1, %2, 2), %%mm2              \n\t" // |L2-R2|
-        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
-        "psadbw (%1, %%"REG_a"), %%mm3          \n\t" // |L3-R3|
-
-        "movq (%0, %2, 4), %%mm4                \n\t" // L4
-        "paddw %%mm1, %%mm0                     \n\t"
-        "psadbw (%1, %2, 4), %%mm4              \n\t" // |L4-R4|
-        "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
-        "paddw %%mm2, %%mm0                     \n\t"
-        "psadbw (%1, %%"REG_d"), %%mm5          \n\t" // |L5-R5|
-        "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
-        "paddw %%mm3, %%mm0                     \n\t"
-        "psadbw (%1, %%"REG_a", 2), %%mm6       \n\t" // |L6-R6|
-        "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
-        "paddw %%mm4, %%mm0                     \n\t"
-        "psadbw (%1, %%"REG_c"), %%mm7          \n\t" // |L7-R7|
-        "paddw %%mm5, %%mm6                     \n\t"
-        "paddw %%mm7, %%mm6                     \n\t"
-        "paddw %%mm6, %%mm0                     \n\t"
-#else //L1_DIFF
-#if defined (FAST_L2_DIFF)
-        "pcmpeqb %%mm7, %%mm7                   \n\t"
-        "movq "MANGLE(b80)", %%mm6              \n\t"
-        "pxor %%mm0, %%mm0                      \n\t"
-#define REAL_L2_DIFF_CORE(a, b)\
-        "movq " #a ", %%mm5                     \n\t"\
-        "movq " #b ", %%mm2                     \n\t"\
-        "pxor %%mm7, %%mm2                      \n\t"\
-        PAVGB(%%mm2, %%mm5)\
-        "paddb %%mm6, %%mm5                     \n\t"\
-        "movq %%mm5, %%mm2                      \n\t"\
-        "psllw $8, %%mm5                        \n\t"\
-        "pmaddwd %%mm5, %%mm5                   \n\t"\
-        "pmaddwd %%mm2, %%mm2                   \n\t"\
-        "paddd %%mm2, %%mm5                     \n\t"\
-        "psrld $14, %%mm5                       \n\t"\
-        "paddd %%mm5, %%mm0                     \n\t"
-
-#else //defined (FAST_L2_DIFF)
-        "pxor %%mm7, %%mm7                      \n\t"
-        "pxor %%mm0, %%mm0                      \n\t"
-#define REAL_L2_DIFF_CORE(a, b)\
-        "movq " #a ", %%mm5                     \n\t"\
-        "movq " #b ", %%mm2                     \n\t"\
-        "movq %%mm5, %%mm1                      \n\t"\
-        "movq %%mm2, %%mm3                      \n\t"\
-        "punpcklbw %%mm7, %%mm5                 \n\t"\
-        "punpckhbw %%mm7, %%mm1                 \n\t"\
-        "punpcklbw %%mm7, %%mm2                 \n\t"\
-        "punpckhbw %%mm7, %%mm3                 \n\t"\
-        "psubw %%mm2, %%mm5                     \n\t"\
-        "psubw %%mm3, %%mm1                     \n\t"\
-        "pmaddwd %%mm5, %%mm5                   \n\t"\
-        "pmaddwd %%mm1, %%mm1                   \n\t"\
-        "paddd %%mm1, %%mm5                     \n\t"\
-        "paddd %%mm5, %%mm0                     \n\t"
-
-#endif //defined (FAST_L2_DIFF)
-
-#define L2_DIFF_CORE(a, b)  REAL_L2_DIFF_CORE(a, b)
-
-L2_DIFF_CORE((%0)          , (%1))
-L2_DIFF_CORE((%0, %2)      , (%1, %2))
-L2_DIFF_CORE((%0, %2, 2)   , (%1, %2, 2))
-L2_DIFF_CORE((%0, %%REGa)  , (%1, %%REGa))
-L2_DIFF_CORE((%0, %2, 4)   , (%1, %2, 4))
-L2_DIFF_CORE((%0, %%REGd)  , (%1, %%REGd))
-L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
-L2_DIFF_CORE((%0, %%REGc)  , (%1, %%REGc))
-
-#endif //L1_DIFF
-
-        "movq %%mm0, %%mm4                      \n\t"
-        "psrlq $32, %%mm0                       \n\t"
-        "paddd %%mm0, %%mm4                     \n\t"
-        "movd %%mm4, %%ecx                      \n\t"
-        "shll $2, %%ecx                         \n\t"
-        "mov %3, %%"REG_d"                      \n\t"
-        "addl -4(%%"REG_d"), %%ecx              \n\t"
-        "addl 4(%%"REG_d"), %%ecx               \n\t"
-        "addl -1024(%%"REG_d"), %%ecx           \n\t"
-        "addl $4, %%ecx                         \n\t"
-        "addl 1024(%%"REG_d"), %%ecx            \n\t"
-        "shrl $3, %%ecx                         \n\t"
-        "movl %%ecx, (%%"REG_d")                \n\t"
-
-//        "mov %3, %%"REG_c"                      \n\t"
-//        "mov %%"REG_c", test                    \n\t"
-//        "jmp 4f                                 \n\t"
-        "cmpl 512(%%"REG_d"), %%ecx             \n\t"
-        " jb 2f                                 \n\t"
-        "cmpl 516(%%"REG_d"), %%ecx             \n\t"
-        " jb 1f                                 \n\t"
-
-        "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
-        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
-        "movq (%0), %%mm0                       \n\t" // L0
-        "movq (%0, %2), %%mm1                   \n\t" // L1
-        "movq (%0, %2, 2), %%mm2                \n\t" // L2
-        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
-        "movq (%0, %2, 4), %%mm4                \n\t" // L4
-        "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
-        "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
-        "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
-        "movq %%mm0, (%1)                       \n\t" // L0
-        "movq %%mm1, (%1, %2)                   \n\t" // L1
-        "movq %%mm2, (%1, %2, 2)                \n\t" // L2
-        "movq %%mm3, (%1, %%"REG_a")            \n\t" // L3
-        "movq %%mm4, (%1, %2, 4)                \n\t" // L4
-        "movq %%mm5, (%1, %%"REG_d")            \n\t" // L5
-        "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // L6
-        "movq %%mm7, (%1, %%"REG_c")            \n\t" // L7
-        "jmp 4f                                 \n\t"
-
-        "1:                                     \n\t"
-        "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
-        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
-        "movq (%0), %%mm0                       \n\t" // L0
-        PAVGB((%1), %%mm0)                            // L0
-        "movq (%0, %2), %%mm1                   \n\t" // L1
-        PAVGB((%1, %2), %%mm1)                        // L1
-        "movq (%0, %2, 2), %%mm2                \n\t" // L2
-        PAVGB((%1, %2, 2), %%mm2)                     // L2
-        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
-        PAVGB((%1, %%REGa), %%mm3)                    // L3
-        "movq (%0, %2, 4), %%mm4                \n\t" // L4
-        PAVGB((%1, %2, 4), %%mm4)                     // L4
-        "movq (%0, %%"REG_d"), %%mm5            \n\t" // L5
-        PAVGB((%1, %%REGd), %%mm5)                    // L5
-        "movq (%0, %%"REG_a", 2), %%mm6         \n\t" // L6
-        PAVGB((%1, %%REGa, 2), %%mm6)                 // L6
-        "movq (%0, %%"REG_c"), %%mm7            \n\t" // L7
-        PAVGB((%1, %%REGc), %%mm7)                    // L7
-        "movq %%mm0, (%1)                       \n\t" // R0
-        "movq %%mm1, (%1, %2)                   \n\t" // R1
-        "movq %%mm2, (%1, %2, 2)                \n\t" // R2
-        "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
-        "movq %%mm4, (%1, %2, 4)                \n\t" // R4
-        "movq %%mm5, (%1, %%"REG_d")            \n\t" // R5
-        "movq %%mm6, (%1, %%"REG_a", 2)         \n\t" // R6
-        "movq %%mm7, (%1, %%"REG_c")            \n\t" // R7
-        "movq %%mm0, (%0)                       \n\t" // L0
-        "movq %%mm1, (%0, %2)                   \n\t" // L1
-        "movq %%mm2, (%0, %2, 2)                \n\t" // L2
-        "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
-        "movq %%mm4, (%0, %2, 4)                \n\t" // L4
-        "movq %%mm5, (%0, %%"REG_d")            \n\t" // L5
-        "movq %%mm6, (%0, %%"REG_a", 2)         \n\t" // L6
-        "movq %%mm7, (%0, %%"REG_c")            \n\t" // L7
-        "jmp 4f                                 \n\t"
-
-        "2:                                     \n\t"
-        "cmpl 508(%%"REG_d"), %%ecx             \n\t"
-        " jb 3f                                 \n\t"
-
-        "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
-        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
-        "movq (%0), %%mm0                       \n\t" // L0
-        "movq (%0, %2), %%mm1                   \n\t" // L1
-        "movq (%0, %2, 2), %%mm2                \n\t" // L2
-        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
-        "movq (%1), %%mm4                       \n\t" // R0
-        "movq (%1, %2), %%mm5                   \n\t" // R1
-        "movq (%1, %2, 2), %%mm6                \n\t" // R2
-        "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        "movq %%mm0, (%1)                       \n\t" // R0
-        "movq %%mm1, (%1, %2)                   \n\t" // R1
-        "movq %%mm2, (%1, %2, 2)                \n\t" // R2
-        "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
-        "movq %%mm0, (%0)                       \n\t" // L0
-        "movq %%mm1, (%0, %2)                   \n\t" // L1
-        "movq %%mm2, (%0, %2, 2)                \n\t" // L2
-        "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
-
-        "movq (%0, %2, 4), %%mm0                \n\t" // L4
-        "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
-        "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
-        "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
-        "movq (%1, %2, 4), %%mm4                \n\t" // R4
-        "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
-        "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
-        "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        "movq %%mm0, (%1, %2, 4)                \n\t" // R4
-        "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
-        "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
-        "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
-        "movq %%mm0, (%0, %2, 4)                \n\t" // L4
-        "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
-        "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
-        "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
-        "jmp 4f                                 \n\t"
-
-        "3:                                     \n\t"
-        "lea (%%"REG_a", %2, 2), %%"REG_d"      \n\t" // 5*stride
-        "lea (%%"REG_d", %2, 2), %%"REG_c"      \n\t" // 7*stride
-        "movq (%0), %%mm0                       \n\t" // L0
-        "movq (%0, %2), %%mm1                   \n\t" // L1
-        "movq (%0, %2, 2), %%mm2                \n\t" // L2
-        "movq (%0, %%"REG_a"), %%mm3            \n\t" // L3
-        "movq (%1), %%mm4                       \n\t" // R0
-        "movq (%1, %2), %%mm5                   \n\t" // R1
-        "movq (%1, %2, 2), %%mm6                \n\t" // R2
-        "movq (%1, %%"REG_a"), %%mm7            \n\t" // R3
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        "movq %%mm0, (%1)                       \n\t" // R0
-        "movq %%mm1, (%1, %2)                   \n\t" // R1
-        "movq %%mm2, (%1, %2, 2)                \n\t" // R2
-        "movq %%mm3, (%1, %%"REG_a")            \n\t" // R3
-        "movq %%mm0, (%0)                       \n\t" // L0
-        "movq %%mm1, (%0, %2)                   \n\t" // L1
-        "movq %%mm2, (%0, %2, 2)                \n\t" // L2
-        "movq %%mm3, (%0, %%"REG_a")            \n\t" // L3
-
-        "movq (%0, %2, 4), %%mm0                \n\t" // L4
-        "movq (%0, %%"REG_d"), %%mm1            \n\t" // L5
-        "movq (%0, %%"REG_a", 2), %%mm2         \n\t" // L6
-        "movq (%0, %%"REG_c"), %%mm3            \n\t" // L7
-        "movq (%1, %2, 4), %%mm4                \n\t" // R4
-        "movq (%1, %%"REG_d"), %%mm5            \n\t" // R5
-        "movq (%1, %%"REG_a", 2), %%mm6         \n\t" // R6
-        "movq (%1, %%"REG_c"), %%mm7            \n\t" // R7
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        PAVGB(%%mm4, %%mm0)
-        PAVGB(%%mm5, %%mm1)
-        PAVGB(%%mm6, %%mm2)
-        PAVGB(%%mm7, %%mm3)
-        "movq %%mm0, (%1, %2, 4)                \n\t" // R4
-        "movq %%mm1, (%1, %%"REG_d")            \n\t" // R5
-        "movq %%mm2, (%1, %%"REG_a", 2)         \n\t" // R6
-        "movq %%mm3, (%1, %%"REG_c")            \n\t" // R7
-        "movq %%mm0, (%0, %2, 4)                \n\t" // L4
-        "movq %%mm1, (%0, %%"REG_d")            \n\t" // L5
-        "movq %%mm2, (%0, %%"REG_a", 2)         \n\t" // L6
-        "movq %%mm3, (%0, %%"REG_c")            \n\t" // L7
-
-        "4:                                     \n\t"
-
-        :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
-        : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
-    );
-#else //HAVE_MMX2 || HAVE_AMD3DNOW
-{
-    int y;
-    int d=0;
-//    int sysd=0;
-    int i;
-
-    for(y=0; y<8; y++){
-        int x;
-        for(x=0; x<8; x++){
-            int ref= tempBlurred[ x + y*stride ];
-            int cur= src[ x + y*stride ];
-            int d1=ref - cur;
-//            if(x==0 || x==7) d1+= d1>>1;
-//            if(y==0 || y==7) d1+= d1>>1;
-//            d+= FFABS(d1);
-            d+= d1*d1;
-//            sysd+= d1;
-        }
-    }
-    i=d;
-    d=  (
-        4*d
-        +(*(tempBlurredPast-256))
-        +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
-        +(*(tempBlurredPast+256))
-        +4)>>3;
-    *tempBlurredPast=i;
-//    ((*tempBlurredPast)*3 + d + 2)>>2;
-
-/*
-Switch between
- 1  0  0  0  0  0  0  (0)
-64 32 16  8  4  2  1  (1)
-64 48 36 27 20 15 11 (33) (approx)
-64 56 49 43 37 33 29 (200) (approx)
-*/
-    if(d > maxNoise[1]){
-        if(d < maxNoise[2]){
-            for(y=0; y<8; y++){
-                int x;
-                for(x=0; x<8; x++){
-                    int ref= tempBlurred[ x + y*stride ];
-                    int cur= src[ x + y*stride ];
-                    tempBlurred[ x + y*stride ]=
-                    src[ x + y*stride ]=
-                        (ref + cur + 1)>>1;
-                }
-            }
-        }else{
-            for(y=0; y<8; y++){
-                int x;
-                for(x=0; x<8; x++){
-                    tempBlurred[ x + y*stride ]= src[ x + y*stride ];
-                }
-            }
-        }
-    }else{
-        if(d < maxNoise[0]){
-            for(y=0; y<8; y++){
-                int x;
-                for(x=0; x<8; x++){
-                    int ref= tempBlurred[ x + y*stride ];
-                    int cur= src[ x + y*stride ];
-                    tempBlurred[ x + y*stride ]=
-                    src[ x + y*stride ]=
-                        (ref*7 + cur + 4)>>3;
-                }
-            }
-        }else{
-            for(y=0; y<8; y++){
-                int x;
-                for(x=0; x<8; x++){
-                    int ref= tempBlurred[ x + y*stride ];
-                    int cur= src[ x + y*stride ];
-                    tempBlurred[ x + y*stride ]=
-                    src[ x + y*stride ]=
-                        (ref*3 + cur + 2)>>2;
-                }
-            }
-        }
-    }
-}
-#endif //HAVE_MMX2 || HAVE_AMD3DNOW
-}
-#endif //HAVE_ALTIVEC
-
-#if HAVE_MMX
-/**
- * accurate deblock filter
- */
-static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
-    int64_t dc_mask, eq_mask, both_masks;
-    int64_t sums[10*8*2];
-    src+= step*3; // src points to begin of the 8x8 Block
-    //{ START_TIMER
-    __asm__ volatile(
-        "movq %0, %%mm7                         \n\t"
-        "movq %1, %%mm6                         \n\t"
-        : : "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
-        );
-
-    __asm__ volatile(
-        "lea (%2, %3), %%"REG_a"                \n\t"
-//      0       1       2       3       4       5       6       7       8       9
-//      %1      eax     eax+%2  eax+2%2 %1+4%2  ecx     ecx+%2  ecx+2%2 %1+8%2  ecx+4%2
-
-        "movq (%2), %%mm0                       \n\t"
-        "movq (%%"REG_a"), %%mm1                \n\t"
-        "movq %%mm1, %%mm3                      \n\t"
-        "movq %%mm1, %%mm4                      \n\t"
-        "psubb %%mm1, %%mm0                     \n\t" // mm0 = differnece
-        "paddb %%mm7, %%mm0                     \n\t"
-        "pcmpgtb %%mm6, %%mm0                   \n\t"
-
-        "movq (%%"REG_a",%3), %%mm2             \n\t"
-        PMAXUB(%%mm2, %%mm4)
-        PMINUB(%%mm2, %%mm3, %%mm5)
-        "psubb %%mm2, %%mm1                     \n\t"
-        "paddb %%mm7, %%mm1                     \n\t"
-        "pcmpgtb %%mm6, %%mm1                   \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-
-        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
-        PMAXUB(%%mm1, %%mm4)
-        PMINUB(%%mm1, %%mm3, %%mm5)
-        "psubb %%mm1, %%mm2                     \n\t"
-        "paddb %%mm7, %%mm2                     \n\t"
-        "pcmpgtb %%mm6, %%mm2                   \n\t"
-        "paddb %%mm2, %%mm0                     \n\t"
-
-        "lea (%%"REG_a", %3, 4), %%"REG_a"      \n\t"
-
-        "movq (%2, %3, 4), %%mm2                \n\t"
-        PMAXUB(%%mm2, %%mm4)
-        PMINUB(%%mm2, %%mm3, %%mm5)
-        "psubb %%mm2, %%mm1                     \n\t"
-        "paddb %%mm7, %%mm1                     \n\t"
-        "pcmpgtb %%mm6, %%mm1                   \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-
-        "movq (%%"REG_a"), %%mm1                \n\t"
-        PMAXUB(%%mm1, %%mm4)
-        PMINUB(%%mm1, %%mm3, %%mm5)
-        "psubb %%mm1, %%mm2                     \n\t"
-        "paddb %%mm7, %%mm2                     \n\t"
-        "pcmpgtb %%mm6, %%mm2                   \n\t"
-        "paddb %%mm2, %%mm0                     \n\t"
-
-        "movq (%%"REG_a", %3), %%mm2            \n\t"
-        PMAXUB(%%mm2, %%mm4)
-        PMINUB(%%mm2, %%mm3, %%mm5)
-        "psubb %%mm2, %%mm1                     \n\t"
-        "paddb %%mm7, %%mm1                     \n\t"
-        "pcmpgtb %%mm6, %%mm1                   \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-
-        "movq (%%"REG_a", %3, 2), %%mm1         \n\t"
-        PMAXUB(%%mm1, %%mm4)
-        PMINUB(%%mm1, %%mm3, %%mm5)
-        "psubb %%mm1, %%mm2                     \n\t"
-        "paddb %%mm7, %%mm2                     \n\t"
-        "pcmpgtb %%mm6, %%mm2                   \n\t"
-        "paddb %%mm2, %%mm0                     \n\t"
-
-        "movq (%2, %3, 8), %%mm2                \n\t"
-        PMAXUB(%%mm2, %%mm4)
-        PMINUB(%%mm2, %%mm3, %%mm5)
-        "psubb %%mm2, %%mm1                     \n\t"
-        "paddb %%mm7, %%mm1                     \n\t"
-        "pcmpgtb %%mm6, %%mm1                   \n\t"
-        "paddb %%mm1, %%mm0                     \n\t"
-
-        "movq (%%"REG_a", %3, 4), %%mm1         \n\t"
-        "psubb %%mm1, %%mm2                     \n\t"
-        "paddb %%mm7, %%mm2                     \n\t"
-        "pcmpgtb %%mm6, %%mm2                   \n\t"
-        "paddb %%mm2, %%mm0                     \n\t"
-        "psubusb %%mm3, %%mm4                   \n\t"
-
-        "pxor %%mm6, %%mm6                      \n\t"
-        "movq %4, %%mm7                         \n\t" // QP,..., QP
-        "paddusb %%mm7, %%mm7                   \n\t" // 2QP ... 2QP
-        "psubusb %%mm4, %%mm7                   \n\t" // Diff >=2QP -> 0
-        "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> 0
-        "pcmpeqb %%mm6, %%mm7                   \n\t" // Diff < 2QP -> 0
-        "movq %%mm7, %1                         \n\t"
-
-        "movq %5, %%mm7                         \n\t"
-        "punpcklbw %%mm7, %%mm7                 \n\t"
-        "punpcklbw %%mm7, %%mm7                 \n\t"
-        "punpcklbw %%mm7, %%mm7                 \n\t"
-        "psubb %%mm0, %%mm6                     \n\t"
-        "pcmpgtb %%mm7, %%mm6                   \n\t"
-        "movq %%mm6, %0                         \n\t"
-
-        : "=m" (eq_mask), "=m" (dc_mask)
-        : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
-        : "%"REG_a
-    );
-
-    both_masks = dc_mask & eq_mask;
-
-    if(both_masks){
-        x86_reg offset= -8*step;
-        int64_t *temp_sums= sums;
-
-        __asm__ volatile(
-            "movq %2, %%mm0                         \n\t"  // QP,..., QP
-            "pxor %%mm4, %%mm4                      \n\t"
-
-            "movq (%0), %%mm6                       \n\t"
-            "movq (%0, %1), %%mm5                   \n\t"
-            "movq %%mm5, %%mm1                      \n\t"
-            "movq %%mm6, %%mm2                      \n\t"
-            "psubusb %%mm6, %%mm5                   \n\t"
-            "psubusb %%mm1, %%mm2                   \n\t"
-            "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
-            "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
-            "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
-
-            "pxor %%mm6, %%mm1                      \n\t"
-            "pand %%mm0, %%mm1                      \n\t"
-            "pxor %%mm1, %%mm6                      \n\t"
-            // 0:QP  6:First
-
-            "movq (%0, %1, 8), %%mm5                \n\t"
-            "add %1, %0                             \n\t" // %0 points to line 1 not 0
-            "movq (%0, %1, 8), %%mm7                \n\t"
-            "movq %%mm5, %%mm1                      \n\t"
-            "movq %%mm7, %%mm2                      \n\t"
-            "psubusb %%mm7, %%mm5                   \n\t"
-            "psubusb %%mm1, %%mm2                   \n\t"
-            "por %%mm5, %%mm2                       \n\t" // ABS Diff of lines
-            "movq %2, %%mm0                         \n\t"  // QP,..., QP
-            "psubusb %%mm2, %%mm0                   \n\t" // diff >= QP -> 0
-            "pcmpeqb %%mm4, %%mm0                   \n\t" // diff >= QP -> FF
-
-            "pxor %%mm7, %%mm1                      \n\t"
-            "pand %%mm0, %%mm1                      \n\t"
-            "pxor %%mm1, %%mm7                      \n\t"
-
-            "movq %%mm6, %%mm5                      \n\t"
-            "punpckhbw %%mm4, %%mm6                 \n\t"
-            "punpcklbw %%mm4, %%mm5                 \n\t"
-            // 4:0 5/6:First 7:Last
-
-            "movq %%mm5, %%mm0                      \n\t"
-            "movq %%mm6, %%mm1                      \n\t"
-            "psllw $2, %%mm0                        \n\t"
-            "psllw $2, %%mm1                        \n\t"
-            "paddw "MANGLE(w04)", %%mm0             \n\t"
-            "paddw "MANGLE(w04)", %%mm1             \n\t"
-
-#define NEXT\
-            "movq (%0), %%mm2                       \n\t"\
-            "movq (%0), %%mm3                       \n\t"\
-            "add %1, %0                             \n\t"\
-            "punpcklbw %%mm4, %%mm2                 \n\t"\
-            "punpckhbw %%mm4, %%mm3                 \n\t"\
-            "paddw %%mm2, %%mm0                     \n\t"\
-            "paddw %%mm3, %%mm1                     \n\t"
-
-#define PREV\
-            "movq (%0), %%mm2                       \n\t"\
-            "movq (%0), %%mm3                       \n\t"\
-            "add %1, %0                             \n\t"\
-            "punpcklbw %%mm4, %%mm2                 \n\t"\
-            "punpckhbw %%mm4, %%mm3                 \n\t"\
-            "psubw %%mm2, %%mm0                     \n\t"\
-            "psubw %%mm3, %%mm1                     \n\t"
-
-
-            NEXT //0
-            NEXT //1
-            NEXT //2
-            "movq %%mm0, (%3)                       \n\t"
-            "movq %%mm1, 8(%3)                      \n\t"
-
-            NEXT //3
-            "psubw %%mm5, %%mm0                     \n\t"
-            "psubw %%mm6, %%mm1                     \n\t"
-            "movq %%mm0, 16(%3)                     \n\t"
-            "movq %%mm1, 24(%3)                     \n\t"
-
-            NEXT //4
-            "psubw %%mm5, %%mm0                     \n\t"
-            "psubw %%mm6, %%mm1                     \n\t"
-            "movq %%mm0, 32(%3)                     \n\t"
-            "movq %%mm1, 40(%3)                     \n\t"
-
-            NEXT //5
-            "psubw %%mm5, %%mm0                     \n\t"
-            "psubw %%mm6, %%mm1                     \n\t"
-            "movq %%mm0, 48(%3)                     \n\t"
-            "movq %%mm1, 56(%3)                     \n\t"
-
-            NEXT //6
-            "psubw %%mm5, %%mm0                     \n\t"
-            "psubw %%mm6, %%mm1                     \n\t"
-            "movq %%mm0, 64(%3)                     \n\t"
-            "movq %%mm1, 72(%3)                     \n\t"
-
-            "movq %%mm7, %%mm6                      \n\t"
-            "punpckhbw %%mm4, %%mm7                 \n\t"
-            "punpcklbw %%mm4, %%mm6                 \n\t"
-
-            NEXT //7
-            "mov %4, %0                             \n\t"
-            "add %1, %0                             \n\t"
-            PREV //0
-            "movq %%mm0, 80(%3)                     \n\t"
-            "movq %%mm1, 88(%3)                     \n\t"
-
-            PREV //1
-            "paddw %%mm6, %%mm0                     \n\t"
-            "paddw %%mm7, %%mm1                     \n\t"
-            "movq %%mm0, 96(%3)                     \n\t"
-            "movq %%mm1, 104(%3)                    \n\t"
-
-            PREV //2
-            "paddw %%mm6, %%mm0                     \n\t"
-            "paddw %%mm7, %%mm1                     \n\t"
-            "movq %%mm0, 112(%3)                    \n\t"
-            "movq %%mm1, 120(%3)                    \n\t"
-
-            PREV //3
-            "paddw %%mm6, %%mm0                     \n\t"
-            "paddw %%mm7, %%mm1                     \n\t"
-            "movq %%mm0, 128(%3)                    \n\t"
-            "movq %%mm1, 136(%3)                    \n\t"
-
-            PREV //4
-            "paddw %%mm6, %%mm0                     \n\t"
-            "paddw %%mm7, %%mm1                     \n\t"
-            "movq %%mm0, 144(%3)                    \n\t"
-            "movq %%mm1, 152(%3)                    \n\t"
-
-            "mov %4, %0                             \n\t" //FIXME
-
-            : "+&r"(src)
-            : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
-        );
-
-        src+= step; // src points to begin of the 8x8 Block
-
-        __asm__ volatile(
-            "movq %4, %%mm6                         \n\t"
-            "pcmpeqb %%mm5, %%mm5                   \n\t"
-            "pxor %%mm6, %%mm5                      \n\t"
-            "pxor %%mm7, %%mm7                      \n\t"
-
-            "1:                                     \n\t"
-            "movq (%1), %%mm0                       \n\t"
-            "movq 8(%1), %%mm1                      \n\t"
-            "paddw 32(%1), %%mm0                    \n\t"
-            "paddw 40(%1), %%mm1                    \n\t"
-            "movq (%0, %3), %%mm2                   \n\t"
-            "movq %%mm2, %%mm3                      \n\t"
-            "movq %%mm2, %%mm4                      \n\t"
-            "punpcklbw %%mm7, %%mm2                 \n\t"
-            "punpckhbw %%mm7, %%mm3                 \n\t"
-            "paddw %%mm2, %%mm0                     \n\t"
-            "paddw %%mm3, %%mm1                     \n\t"
-            "paddw %%mm2, %%mm0                     \n\t"
-            "paddw %%mm3, %%mm1                     \n\t"
-            "psrlw $4, %%mm0                        \n\t"
-            "psrlw $4, %%mm1                        \n\t"
-            "packuswb %%mm1, %%mm0                  \n\t"
-            "pand %%mm6, %%mm0                      \n\t"
-            "pand %%mm5, %%mm4                      \n\t"
-            "por %%mm4, %%mm0                       \n\t"
-            "movq %%mm0, (%0, %3)                   \n\t"
-            "add $16, %1                            \n\t"
-            "add %2, %0                             \n\t"
-            " js 1b                                 \n\t"
-
-            : "+r"(offset), "+r"(temp_sums)
-            : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
-        );
-    }else
-        src+= step; // src points to begin of the 8x8 Block
-
-    if(eq_mask != -1LL){
-        uint8_t *temp_src= src;
-        __asm__ volatile(
-            "pxor %%mm7, %%mm7                      \n\t"
-            "lea -40(%%"REG_SP"), %%"REG_c"         \n\t" // make space for 4 8-byte vars
-            "and "ALIGN_MASK", %%"REG_c"            \n\t" // align
-//      0       1       2       3       4       5       6       7       8       9
-//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %1+8%1  ecx+4%1
-
-            "movq (%0), %%mm0                       \n\t"
-            "movq %%mm0, %%mm1                      \n\t"
-            "punpcklbw %%mm7, %%mm0                 \n\t" // low part of line 0
-            "punpckhbw %%mm7, %%mm1                 \n\t" // high part of line 0
-
-            "movq (%0, %1), %%mm2                   \n\t"
-            "lea (%0, %1, 2), %%"REG_a"             \n\t"
-            "movq %%mm2, %%mm3                      \n\t"
-            "punpcklbw %%mm7, %%mm2                 \n\t" // low part of line 1
-            "punpckhbw %%mm7, %%mm3                 \n\t" // high part of line 1
-
-            "movq (%%"REG_a"), %%mm4                \n\t"
-            "movq %%mm4, %%mm5                      \n\t"
-            "punpcklbw %%mm7, %%mm4                 \n\t" // low part of line 2
-            "punpckhbw %%mm7, %%mm5                 \n\t" // high part of line 2
-
-            "paddw %%mm0, %%mm0                     \n\t" // 2L0
-            "paddw %%mm1, %%mm1                     \n\t" // 2H0
-            "psubw %%mm4, %%mm2                     \n\t" // L1 - L2
-            "psubw %%mm5, %%mm3                     \n\t" // H1 - H2
-            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - L1 + L2
-            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - H1 + H2
-
-            "psllw $2, %%mm2                        \n\t" // 4L1 - 4L2
-            "psllw $2, %%mm3                        \n\t" // 4H1 - 4H2
-            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2
-            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2
-
-            "movq (%%"REG_a", %1), %%mm2            \n\t"
-            "movq %%mm2, %%mm3                      \n\t"
-            "punpcklbw %%mm7, %%mm2                 \n\t" // L3
-            "punpckhbw %%mm7, %%mm3                 \n\t" // H3
-
-            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - L3
-            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - H3
-            "psubw %%mm2, %%mm0                     \n\t" // 2L0 - 5L1 + 5L2 - 2L3
-            "psubw %%mm3, %%mm1                     \n\t" // 2H0 - 5H1 + 5H2 - 2H3
-            "movq %%mm0, (%%"REG_c")                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
-            "movq %%mm1, 8(%%"REG_c")               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
-
-            "movq (%%"REG_a", %1, 2), %%mm0         \n\t"
-            "movq %%mm0, %%mm1                      \n\t"
-            "punpcklbw %%mm7, %%mm0                 \n\t" // L4
-            "punpckhbw %%mm7, %%mm1                 \n\t" // H4
-
-            "psubw %%mm0, %%mm2                     \n\t" // L3 - L4
-            "psubw %%mm1, %%mm3                     \n\t" // H3 - H4
-            "movq %%mm2, 16(%%"REG_c")              \n\t" // L3 - L4
-            "movq %%mm3, 24(%%"REG_c")              \n\t" // H3 - H4
-            "paddw %%mm4, %%mm4                     \n\t" // 2L2
-            "paddw %%mm5, %%mm5                     \n\t" // 2H2
-            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - L3 + L4
-            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - H3 + H4
-
-            "lea (%%"REG_a", %1), %0                \n\t"
-            "psllw $2, %%mm2                        \n\t" // 4L3 - 4L4
-            "psllw $2, %%mm3                        \n\t" // 4H3 - 4H4
-            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4
-            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4
-//50 opcodes so far
-            "movq (%0, %1, 2), %%mm2                \n\t"
-            "movq %%mm2, %%mm3                      \n\t"
-            "punpcklbw %%mm7, %%mm2                 \n\t" // L5
-            "punpckhbw %%mm7, %%mm3                 \n\t" // H5
-            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - L5
-            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - H5
-            "psubw %%mm2, %%mm4                     \n\t" // 2L2 - 5L3 + 5L4 - 2L5
-            "psubw %%mm3, %%mm5                     \n\t" // 2H2 - 5H3 + 5H4 - 2H5
-
-            "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
-            "punpcklbw %%mm7, %%mm6                 \n\t" // L6
-            "psubw %%mm6, %%mm2                     \n\t" // L5 - L6
-            "movq (%%"REG_a", %1, 4), %%mm6         \n\t"
-            "punpckhbw %%mm7, %%mm6                 \n\t" // H6
-            "psubw %%mm6, %%mm3                     \n\t" // H5 - H6
-
-            "paddw %%mm0, %%mm0                     \n\t" // 2L4
-            "paddw %%mm1, %%mm1                     \n\t" // 2H4
-            "psubw %%mm2, %%mm0                     \n\t" // 2L4 - L5 + L6
-            "psubw %%mm3, %%mm1                     \n\t" // 2H4 - H5 + H6
-
-            "psllw $2, %%mm2                        \n\t" // 4L5 - 4L6
-            "psllw $2, %%mm3                        \n\t" // 4H5 - 4H6
-            "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6
-            "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6
-
-            "movq (%0, %1, 4), %%mm2                \n\t"
-            "movq %%mm2, %%mm3                      \n\t"
-            "punpcklbw %%mm7, %%mm2                 \n\t" // L7
-            "punpckhbw %%mm7, %%mm3                 \n\t" // H7
-
-            "paddw %%mm2, %%mm2                     \n\t" // 2L7
-            "paddw %%mm3, %%mm3                     \n\t" // 2H7
-            "psubw %%mm2, %%mm0                     \n\t" // 2L4 - 5L5 + 5L6 - 2L7
-            "psubw %%mm3, %%mm1                     \n\t" // 2H4 - 5H5 + 5H6 - 2H7
-
-            "movq (%%"REG_c"), %%mm2                \n\t" // 2L0 - 5L1 + 5L2 - 2L3
-            "movq 8(%%"REG_c"), %%mm3               \n\t" // 2H0 - 5H1 + 5H2 - 2H3
-
-#if HAVE_MMX2
-            "movq %%mm7, %%mm6                      \n\t" // 0
-            "psubw %%mm0, %%mm6                     \n\t"
-            "pmaxsw %%mm6, %%mm0                    \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
-            "movq %%mm7, %%mm6                      \n\t" // 0
-            "psubw %%mm1, %%mm6                     \n\t"
-            "pmaxsw %%mm6, %%mm1                    \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
-            "movq %%mm7, %%mm6                      \n\t" // 0
-            "psubw %%mm2, %%mm6                     \n\t"
-            "pmaxsw %%mm6, %%mm2                    \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
-            "movq %%mm7, %%mm6                      \n\t" // 0
-            "psubw %%mm3, %%mm6                     \n\t"
-            "pmaxsw %%mm6, %%mm3                    \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
-#else
-            "movq %%mm7, %%mm6                      \n\t" // 0
-            "pcmpgtw %%mm0, %%mm6                   \n\t"
-            "pxor %%mm6, %%mm0                      \n\t"
-            "psubw %%mm6, %%mm0                     \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
-            "movq %%mm7, %%mm6                      \n\t" // 0
-            "pcmpgtw %%mm1, %%mm6                   \n\t"
-            "pxor %%mm6, %%mm1                      \n\t"
-            "psubw %%mm6, %%mm1                     \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
-            "movq %%mm7, %%mm6                      \n\t" // 0
-            "pcmpgtw %%mm2, %%mm6                   \n\t"
-            "pxor %%mm6, %%mm2                      \n\t"
-            "psubw %%mm6, %%mm2                     \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
-            "movq %%mm7, %%mm6                      \n\t" // 0
-            "pcmpgtw %%mm3, %%mm6                   \n\t"
-            "pxor %%mm6, %%mm3                      \n\t"
-            "psubw %%mm6, %%mm3                     \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
-#endif
-
-#if HAVE_MMX2
-            "pminsw %%mm2, %%mm0                    \n\t"
-            "pminsw %%mm3, %%mm1                    \n\t"
-#else
-            "movq %%mm0, %%mm6                      \n\t"
-            "psubusw %%mm2, %%mm6                   \n\t"
-            "psubw %%mm6, %%mm0                     \n\t"
-            "movq %%mm1, %%mm6                      \n\t"
-            "psubusw %%mm3, %%mm6                   \n\t"
-            "psubw %%mm6, %%mm1                     \n\t"
-#endif
-
-            "movd %2, %%mm2                         \n\t" // QP
-            "punpcklbw %%mm7, %%mm2                 \n\t"
-
-            "movq %%mm7, %%mm6                      \n\t" // 0
-            "pcmpgtw %%mm4, %%mm6                   \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
-            "pxor %%mm6, %%mm4                      \n\t"
-            "psubw %%mm6, %%mm4                     \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
-            "pcmpgtw %%mm5, %%mm7                   \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
-            "pxor %%mm7, %%mm5                      \n\t"
-            "psubw %%mm7, %%mm5                     \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
-// 100 opcodes
-            "psllw $3, %%mm2                        \n\t" // 8QP
-            "movq %%mm2, %%mm3                      \n\t" // 8QP
-            "pcmpgtw %%mm4, %%mm2                   \n\t"
-            "pcmpgtw %%mm5, %%mm3                   \n\t"
-            "pand %%mm2, %%mm4                      \n\t"
-            "pand %%mm3, %%mm5                      \n\t"
-
-
-            "psubusw %%mm0, %%mm4                   \n\t" // hd
-            "psubusw %%mm1, %%mm5                   \n\t" // ld
-
-
-            "movq "MANGLE(w05)", %%mm2              \n\t" // 5
-            "pmullw %%mm2, %%mm4                    \n\t"
-            "pmullw %%mm2, %%mm5                    \n\t"
-            "movq "MANGLE(w20)", %%mm2              \n\t" // 32
-            "paddw %%mm2, %%mm4                     \n\t"
-            "paddw %%mm2, %%mm5                     \n\t"
-            "psrlw $6, %%mm4                        \n\t"
-            "psrlw $6, %%mm5                        \n\t"
-
-            "movq 16(%%"REG_c"), %%mm0              \n\t" // L3 - L4
-            "movq 24(%%"REG_c"), %%mm1              \n\t" // H3 - H4
-
-            "pxor %%mm2, %%mm2                      \n\t"
-            "pxor %%mm3, %%mm3                      \n\t"
-
-            "pcmpgtw %%mm0, %%mm2                   \n\t" // sign (L3-L4)
-            "pcmpgtw %%mm1, %%mm3                   \n\t" // sign (H3-H4)
-            "pxor %%mm2, %%mm0                      \n\t"
-            "pxor %%mm3, %%mm1                      \n\t"
-            "psubw %%mm2, %%mm0                     \n\t" // |L3-L4|
-            "psubw %%mm3, %%mm1                     \n\t" // |H3-H4|
-            "psrlw $1, %%mm0                        \n\t" // |L3 - L4|/2
-            "psrlw $1, %%mm1                        \n\t" // |H3 - H4|/2
-
-            "pxor %%mm6, %%mm2                      \n\t"
-            "pxor %%mm7, %%mm3                      \n\t"
-            "pand %%mm2, %%mm4                      \n\t"
-            "pand %%mm3, %%mm5                      \n\t"
-
-#if HAVE_MMX2
-            "pminsw %%mm0, %%mm4                    \n\t"
-            "pminsw %%mm1, %%mm5                    \n\t"
-#else
-            "movq %%mm4, %%mm2                      \n\t"
-            "psubusw %%mm0, %%mm2                   \n\t"
-            "psubw %%mm2, %%mm4                     \n\t"
-            "movq %%mm5, %%mm2                      \n\t"
-            "psubusw %%mm1, %%mm2                   \n\t"
-            "psubw %%mm2, %%mm5                     \n\t"
-#endif
-            "pxor %%mm6, %%mm4                      \n\t"
-            "pxor %%mm7, %%mm5                      \n\t"
-            "psubw %%mm6, %%mm4                     \n\t"
-            "psubw %%mm7, %%mm5                     \n\t"
-            "packsswb %%mm5, %%mm4                  \n\t"
-            "movq %3, %%mm1                         \n\t"
-            "pandn %%mm4, %%mm1                     \n\t"
-            "movq (%0), %%mm0                       \n\t"
-            "paddb   %%mm1, %%mm0                   \n\t"
-            "movq %%mm0, (%0)                       \n\t"
-            "movq (%0, %1), %%mm0                   \n\t"
-            "psubb %%mm1, %%mm0                     \n\t"
-            "movq %%mm0, (%0, %1)                   \n\t"
-
-            : "+r" (temp_src)
-            : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask)
-            : "%"REG_a, "%"REG_c
-        );
-    }
-/*if(step==16){
-    STOP_TIMER("step16")
-}else{
-    STOP_TIMER("stepX")
-}
-    } */
-}
-#endif //HAVE_MMX
-
-static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
-                                const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
-
-/**
- * Copy a block from src to dst and fixes the blacklevel.
- * levelFix == 0 -> do not touch the brighness & contrast
- */
-#undef REAL_SCALED_CPY
-#undef SCALED_CPY
-
-static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
-                                     int levelFix, int64_t *packedOffsetAndScale)
-{
-#if !HAVE_MMX
-    int i;
-#endif
-    if(levelFix){
-#if HAVE_MMX
-    __asm__ volatile(
-        "movq (%%"REG_a"), %%mm2        \n\t" // packedYOffset
-        "movq 8(%%"REG_a"), %%mm3       \n\t" // packedYScale
-        "lea (%2,%4), %%"REG_a"         \n\t"
-        "lea (%3,%5), %%"REG_d"         \n\t"
-        "pxor %%mm4, %%mm4              \n\t"
-#if HAVE_MMX2
-#define REAL_SCALED_CPY(src1, src2, dst1, dst2)                                                \
-        "movq " #src1 ", %%mm0          \n\t"\
-        "movq " #src1 ", %%mm5          \n\t"\
-        "movq " #src2 ", %%mm1          \n\t"\
-        "movq " #src2 ", %%mm6          \n\t"\
-        "punpcklbw %%mm0, %%mm0         \n\t"\
-        "punpckhbw %%mm5, %%mm5         \n\t"\
-        "punpcklbw %%mm1, %%mm1         \n\t"\
-        "punpckhbw %%mm6, %%mm6         \n\t"\
-        "pmulhuw %%mm3, %%mm0           \n\t"\
-        "pmulhuw %%mm3, %%mm5           \n\t"\
-        "pmulhuw %%mm3, %%mm1           \n\t"\
-        "pmulhuw %%mm3, %%mm6           \n\t"\
-        "psubw %%mm2, %%mm0             \n\t"\
-        "psubw %%mm2, %%mm5             \n\t"\
-        "psubw %%mm2, %%mm1             \n\t"\
-        "psubw %%mm2, %%mm6             \n\t"\
-        "packuswb %%mm5, %%mm0          \n\t"\
-        "packuswb %%mm6, %%mm1          \n\t"\
-        "movq %%mm0, " #dst1 "          \n\t"\
-        "movq %%mm1, " #dst2 "          \n\t"\
-
-#else //HAVE_MMX2
-#define REAL_SCALED_CPY(src1, src2, dst1, dst2)                                        \
-        "movq " #src1 ", %%mm0          \n\t"\
-        "movq " #src1 ", %%mm5          \n\t"\
-        "punpcklbw %%mm4, %%mm0         \n\t"\
-        "punpckhbw %%mm4, %%mm5         \n\t"\
-        "psubw %%mm2, %%mm0             \n\t"\
-        "psubw %%mm2, %%mm5             \n\t"\
-        "movq " #src2 ", %%mm1          \n\t"\
-        "psllw $6, %%mm0                \n\t"\
-        "psllw $6, %%mm5                \n\t"\
-        "pmulhw %%mm3, %%mm0            \n\t"\
-        "movq " #src2 ", %%mm6          \n\t"\
-        "pmulhw %%mm3, %%mm5            \n\t"\
-        "punpcklbw %%mm4, %%mm1         \n\t"\
-        "punpckhbw %%mm4, %%mm6         \n\t"\
-        "psubw %%mm2, %%mm1             \n\t"\
-        "psubw %%mm2, %%mm6             \n\t"\
-        "psllw $6, %%mm1                \n\t"\
-        "psllw $6, %%mm6                \n\t"\
-        "pmulhw %%mm3, %%mm1            \n\t"\
-        "pmulhw %%mm3, %%mm6            \n\t"\
-        "packuswb %%mm5, %%mm0          \n\t"\
-        "packuswb %%mm6, %%mm1          \n\t"\
-        "movq %%mm0, " #dst1 "          \n\t"\
-        "movq %%mm1, " #dst2 "          \n\t"\
-
-#endif //HAVE_MMX2
-#define SCALED_CPY(src1, src2, dst1, dst2)\
-   REAL_SCALED_CPY(src1, src2, dst1, dst2)
-
-SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
-SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
-SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
-        "lea (%%"REG_a",%4,4), %%"REG_a"        \n\t"
-        "lea (%%"REG_d",%5,4), %%"REG_d"        \n\t"
-SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
-
-
-        : "=&a" (packedOffsetAndScale)
-        : "0" (packedOffsetAndScale),
-        "r"(src),
-        "r"(dst),
-        "r" ((x86_reg)srcStride),
-        "r" ((x86_reg)dstStride)
-        : "%"REG_d
-    );
-#else //HAVE_MMX
-    for(i=0; i<8; i++)
-        memcpy( &(dst[dstStride*i]),
-                &(src[srcStride*i]), BLOCK_SIZE);
-#endif //HAVE_MMX
-    }else{
-#if HAVE_MMX
-    __asm__ volatile(
-        "lea (%0,%2), %%"REG_a"                 \n\t"
-        "lea (%1,%3), %%"REG_d"                 \n\t"
-
-#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2)                              \
-        "movq " #src1 ", %%mm0          \n\t"\
-        "movq " #src2 ", %%mm1          \n\t"\
-        "movq %%mm0, " #dst1 "          \n\t"\
-        "movq %%mm1, " #dst2 "          \n\t"\
-
-#define SIMPLE_CPY(src1, src2, dst1, dst2)\
-   REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
-
-SIMPLE_CPY((%0)       , (%0, %2)       , (%1)       , (%1, %3))
-SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
-SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
-        "lea (%%"REG_a",%2,4), %%"REG_a"        \n\t"
-        "lea (%%"REG_d",%3,4), %%"REG_d"        \n\t"
-SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
-
-        : : "r" (src),
-        "r" (dst),
-        "r" ((x86_reg)srcStride),
-        "r" ((x86_reg)dstStride)
-        : "%"REG_a, "%"REG_d
-    );
-#else //HAVE_MMX
-    for(i=0; i<8; i++)
-        memcpy( &(dst[dstStride*i]),
-                &(src[srcStride*i]), BLOCK_SIZE);
-#endif //HAVE_MMX
-    }
-}
-
-/**
- * Duplicate the given 8 src pixels ? times upward
- */
-static inline void RENAME(duplicate)(uint8_t src[], int stride)
-{
-#if HAVE_MMX
-    __asm__ volatile(
-        "movq (%0), %%mm0               \n\t"
-        "add %1, %0                     \n\t"
-        "movq %%mm0, (%0)               \n\t"
-        "movq %%mm0, (%0, %1)           \n\t"
-        "movq %%mm0, (%0, %1, 2)        \n\t"
-        : "+r" (src)
-        : "r" ((x86_reg)-stride)
-    );
-#else
-    int i;
-    uint8_t *p=src;
-    for(i=0; i<3; i++){
-        p-= stride;
-        memcpy(p, src, 8);
-    }
-#endif
-}
-
-/**
- * Filter array of bytes (Y or U or V values)
- */
-static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
-                                const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
-{
-    DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
-    int x,y;
-#ifdef COMPILE_TIME_MODE
-    const int mode= COMPILE_TIME_MODE;
-#else
-    const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
-#endif
-    int black=0, white=255; // blackest black and whitest white in the picture
-    int QPCorrecture= 256*256;
-
-    int copyAhead;
-#if HAVE_MMX
-    int i;
-#endif
-
-    const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
-    const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
-
-    //FIXME remove
-    uint64_t * const yHistogram= c.yHistogram;
-    uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
-    uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
-    //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
-
-#if HAVE_MMX
-    for(i=0; i<57; i++){
-        int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
-        int threshold= offset*2 + 1;
-        c.mmxDcOffset[i]= 0x7F - offset;
-        c.mmxDcThreshold[i]= 0x7F - threshold;
-        c.mmxDcOffset[i]*= 0x0101010101010101LL;
-        c.mmxDcThreshold[i]*= 0x0101010101010101LL;
-    }
-#endif
-
-    if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
-    else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
-            || (mode & FFMPEG_DEINT_FILTER)
-            || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
-    else if(   (mode & V_DEBLOCK)
-            || (mode & LINEAR_IPOL_DEINT_FILTER)
-            || (mode & MEDIAN_DEINT_FILTER)
-            || (mode & V_A_DEBLOCK)) copyAhead=13;
-    else if(mode & V_X1_FILTER) copyAhead=11;
-//    else if(mode & V_RK1_FILTER) copyAhead=10;
-    else if(mode & DERING) copyAhead=9;
-    else copyAhead=8;
-
-    copyAhead-= 8;
-
-    if(!isColor){
-        uint64_t sum= 0;
-        int i;
-        uint64_t maxClipped;
-        uint64_t clipped;
-        double scale;
-
-        c.frameNum++;
-        // first frame is fscked so we ignore it
-        if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
-
-        for(i=0; i<256; i++){
-            sum+= yHistogram[i];
-        }
-
-        /* We always get a completely black picture first. */
-        maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
-
-        clipped= sum;
-        for(black=255; black>0; black--){
-            if(clipped < maxClipped) break;
-            clipped-= yHistogram[black];
-        }
-
-        clipped= sum;
-        for(white=0; white<256; white++){
-            if(clipped < maxClipped) break;
-            clipped-= yHistogram[white];
-        }
-
-        scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
-
-#if HAVE_MMX2
-        c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
-        c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
-#else
-        c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
-        c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
-#endif
-
-        c.packedYOffset|= c.packedYOffset<<32;
-        c.packedYOffset|= c.packedYOffset<<16;
-
-        c.packedYScale|= c.packedYScale<<32;
-        c.packedYScale|= c.packedYScale<<16;
-
-        if(mode & LEVEL_FIX)        QPCorrecture= (int)(scale*256*256 + 0.5);
-        else                        QPCorrecture= 256*256;
-    }else{
-        c.packedYScale= 0x0100010001000100LL;
-        c.packedYOffset= 0;
-        QPCorrecture= 256*256;
-    }
-
-    /* copy & deinterlace first row of blocks */
-    y=-BLOCK_SIZE;
-    {
-        const uint8_t *srcBlock= &(src[y*srcStride]);
-        uint8_t *dstBlock= tempDst + dstStride;
-
-        // From this point on it is guaranteed that we can read and write 16 lines downward
-        // finish 1 block before the next otherwise we might have a problem
-        // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
-        for(x=0; x<width; x+=BLOCK_SIZE){
-
-#if HAVE_MMX2
-/*
-            prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
-            prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
-            prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
-            prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
-*/
-
-            __asm__(
-                "mov %4, %%"REG_a"              \n\t"
-                "shr $2, %%"REG_a"              \n\t"
-                "and $6, %%"REG_a"              \n\t"
-                "add %5, %%"REG_a"              \n\t"
-                "mov %%"REG_a", %%"REG_d"       \n\t"
-                "imul %1, %%"REG_a"             \n\t"
-                "imul %3, %%"REG_d"             \n\t"
-                "prefetchnta 32(%%"REG_a", %0)  \n\t"
-                "prefetcht0 32(%%"REG_d", %2)   \n\t"
-                "add %1, %%"REG_a"              \n\t"
-                "add %3, %%"REG_d"              \n\t"
-                "prefetchnta 32(%%"REG_a", %0)  \n\t"
-                "prefetcht0 32(%%"REG_d", %2)   \n\t"
-                :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
-                "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
-                : "%"REG_a, "%"REG_d
-            );
-
-#elif HAVE_AMD3DNOW
-//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
-/*          prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
-            prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
-            prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
-            prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
-*/
-#endif
-
-            RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
-                              srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
-
-            RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
-
-            if(mode & LINEAR_IPOL_DEINT_FILTER)
-                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
-            else if(mode & LINEAR_BLEND_DEINT_FILTER)
-                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
-            else if(mode & MEDIAN_DEINT_FILTER)
-                RENAME(deInterlaceMedian)(dstBlock, dstStride);
-            else if(mode & CUBIC_IPOL_DEINT_FILTER)
-                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
-            else if(mode & FFMPEG_DEINT_FILTER)
-                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
-            else if(mode & LOWPASS5_DEINT_FILTER)
-                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
-/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
-                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
-*/
-            dstBlock+=8;
-            srcBlock+=8;
-        }
-        if(width==FFABS(dstStride))
-            linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
-        else{
-            int i;
-            for(i=0; i<copyAhead; i++){
-                memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
-            }
-        }
-    }
-
-    for(y=0; y<height; y+=BLOCK_SIZE){
-        //1% speedup if these are here instead of the inner loop
-        const uint8_t *srcBlock= &(src[y*srcStride]);
-        uint8_t *dstBlock= &(dst[y*dstStride]);
-#if HAVE_MMX
-        uint8_t *tempBlock1= c.tempBlocks;
-        uint8_t *tempBlock2= c.tempBlocks + 8;
-#endif
-        const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
-        int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
-        int QP=0;
-        /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
-           if not than use a temporary buffer */
-        if(y+15 >= height){
-            int i;
-            /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
-               blockcopy to dst later */
-            linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
-                    FFMAX(height-y-copyAhead, 0), srcStride);
-
-            /* duplicate last line of src to fill the void up to line (copyAhead+7) */
-            for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
-                    memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
-
-            /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
-            linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
-
-            /* duplicate last line of dst to fill the void up to line (copyAhead) */
-            for(i=height-y+1; i<=copyAhead; i++)
-                    memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
-
-            dstBlock= tempDst + dstStride;
-            srcBlock= tempSrc;
-        }
-
-        // From this point on it is guaranteed that we can read and write 16 lines downward
-        // finish 1 block before the next otherwise we might have a problem
-        // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing
-        for(x=0; x<width; x+=BLOCK_SIZE){
-            const int stride= dstStride;
-#if HAVE_MMX
-            uint8_t *tmpXchg;
-#endif
-            if(isColor){
-                QP= QPptr[x>>qpHShift];
-                c.nonBQP= nonBQPptr[x>>qpHShift];
-            }else{
-                QP= QPptr[x>>4];
-                QP= (QP* QPCorrecture + 256*128)>>16;
-                c.nonBQP= nonBQPptr[x>>4];
-                c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
-                yHistogram[ srcBlock[srcStride*12 + 4] ]++;
-            }
-            c.QP= QP;
-#if HAVE_MMX
-            __asm__ volatile(
-                "movd %1, %%mm7         \n\t"
-                "packuswb %%mm7, %%mm7  \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
-                "packuswb %%mm7, %%mm7  \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
-                "packuswb %%mm7, %%mm7  \n\t" // QP,..., QP
-                "movq %%mm7, %0         \n\t"
-                : "=m" (c.pQPb)
-                : "r" (QP)
-            );
-#endif
-
-
-#if HAVE_MMX2
-/*
-            prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
-            prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
-            prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
-            prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
-*/
-
-            __asm__(
-                "mov %4, %%"REG_a"              \n\t"
-                "shr $2, %%"REG_a"              \n\t"
-                "and $6, %%"REG_a"              \n\t"
-                "add %5, %%"REG_a"              \n\t"
-                "mov %%"REG_a", %%"REG_d"       \n\t"
-                "imul %1, %%"REG_a"             \n\t"
-                "imul %3, %%"REG_d"             \n\t"
-                "prefetchnta 32(%%"REG_a", %0)  \n\t"
-                "prefetcht0 32(%%"REG_d", %2)   \n\t"
-                "add %1, %%"REG_a"              \n\t"
-                "add %3, %%"REG_d"              \n\t"
-                "prefetchnta 32(%%"REG_a", %0)  \n\t"
-                "prefetcht0 32(%%"REG_d", %2)   \n\t"
-                :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
-                "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
-                : "%"REG_a, "%"REG_d
-            );
-
-#elif HAVE_AMD3DNOW
-//FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ...
-/*          prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
-            prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
-            prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
-            prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
-*/
-#endif
-
-            RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
-                              srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
-
-            if(mode & LINEAR_IPOL_DEINT_FILTER)
-                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
-            else if(mode & LINEAR_BLEND_DEINT_FILTER)
-                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
-            else if(mode & MEDIAN_DEINT_FILTER)
-                RENAME(deInterlaceMedian)(dstBlock, dstStride);
-            else if(mode & CUBIC_IPOL_DEINT_FILTER)
-                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
-            else if(mode & FFMPEG_DEINT_FILTER)
-                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
-            else if(mode & LOWPASS5_DEINT_FILTER)
-                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
-/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
-                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
-*/
-
-            /* only deblock if we have 2 blocks */
-            if(y + 8 < height){
-                if(mode & V_X1_FILTER)
-                    RENAME(vertX1Filter)(dstBlock, stride, &c);
-                else if(mode & V_DEBLOCK){
-                    const int t= RENAME(vertClassify)(dstBlock, stride, &c);
-
-                    if(t==1)
-                        RENAME(doVertLowPass)(dstBlock, stride, &c);
-                    else if(t==2)
-                        RENAME(doVertDefFilter)(dstBlock, stride, &c);
-                }else if(mode & V_A_DEBLOCK){
-                    RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
-                }
-            }
-
-#if HAVE_MMX
-            RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
-#endif
-            /* check if we have a previous block to deblock it with dstBlock */
-            if(x - 8 >= 0){
-#if HAVE_MMX
-                if(mode & H_X1_FILTER)
-                        RENAME(vertX1Filter)(tempBlock1, 16, &c);
-                else if(mode & H_DEBLOCK){
-//START_TIMER
-                    const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
-//STOP_TIMER("dc & minmax")
-                    if(t==1)
-                        RENAME(doVertLowPass)(tempBlock1, 16, &c);
-                    else if(t==2)
-                        RENAME(doVertDefFilter)(tempBlock1, 16, &c);
-                }else if(mode & H_A_DEBLOCK){
-                        RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
-                }
-
-                RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
-
-#else
-                if(mode & H_X1_FILTER)
-                    horizX1Filter(dstBlock-4, stride, QP);
-                else if(mode & H_DEBLOCK){
-#if HAVE_ALTIVEC
-                    DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
-                    int t;
-                    transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
-
-                    t = vertClassify_altivec(tempBlock-48, 16, &c);
-                    if(t==1) {
-                        doVertLowPass_altivec(tempBlock-48, 16, &c);
-                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
-                    }
-                    else if(t==2) {
-                        doVertDefFilter_altivec(tempBlock-48, 16, &c);
-                        transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
-                    }
-#else
-                    const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
-
-                    if(t==1)
-                        RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
-                    else if(t==2)
-                        RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
-#endif
-                }else if(mode & H_A_DEBLOCK){
-                    RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
-                }
-#endif //HAVE_MMX
-                if(mode & DERING){
-                //FIXME filter first line
-                    if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
-                }
-
-                if(mode & TEMP_NOISE_FILTER)
-                {
-                    RENAME(tempNoiseReducer)(dstBlock-8, stride,
-                            c.tempBlurred[isColor] + y*dstStride + x,
-                            c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
-                            c.ppMode.maxTmpNoise);
-                }
-            }
-
-            dstBlock+=8;
-            srcBlock+=8;
-
-#if HAVE_MMX
-            tmpXchg= tempBlock1;
-            tempBlock1= tempBlock2;
-            tempBlock2 = tmpXchg;
-#endif
-        }
-
-        if(mode & DERING){
-            if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
-        }
-
-        if((mode & TEMP_NOISE_FILTER)){
-            RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
-                    c.tempBlurred[isColor] + y*dstStride + x,
-                    c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
-                    c.ppMode.maxTmpNoise);
-        }
-
-        /* did we use a tmp buffer for the last lines*/
-        if(y+15 >= height){
-            uint8_t *dstBlock= &(dst[y*dstStride]);
-            if(width==FFABS(dstStride))
-                linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
-            else{
-                int i;
-                for(i=0; i<height-y; i++){
-                    memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
-                }
-            }
-        }
-/*
-        for(x=0; x<width; x+=32){
-            volatile int i;
-            i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
-                + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
-                + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
-                + dstBlock[x +13*dstStride]
-                + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
-        }*/
-    }
-#if   HAVE_AMD3DNOW
-    __asm__ volatile("femms");
-#elif HAVE_MMX
-    __asm__ volatile("emms");
-#endif
-
-#ifdef DEBUG_BRIGHTNESS
-    if(!isColor){
-        int max=1;
-        int i;
-        for(i=0; i<256; i++)
-            if(yHistogram[i] > max) max=yHistogram[i];
-
-        for(i=1; i<256; i++){
-            int x;
-            int start=yHistogram[i-1]/(max/256+1);
-            int end=yHistogram[i]/(max/256+1);
-            int inc= end > start ? 1 : -1;
-            for(x=start; x!=end+inc; x+=inc)
-                dst[ i*dstStride + x]+=128;
-        }
-
-        for(i=0; i<100; i+=2){
-            dst[ (white)*dstStride + i]+=128;
-            dst[ (black)*dstStride + i]+=128;
-        }
-    }
-#endif
-
-    *c2= c; //copy local context back
-
-}

From 0ecf54f9dc0bbebb064f40f9a00d9e72e1793a2c Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Mon, 20 Feb 2012 13:21:58 +0100
Subject: [PATCH 24/40] flashsvenc: switch to encode2().

---
 libavcodec/flashsvenc.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/libavcodec/flashsvenc.c b/libavcodec/flashsvenc.c
index 669badc6ce..75daf36202 100644
--- a/libavcodec/flashsvenc.c
+++ b/libavcodec/flashsvenc.c
@@ -49,6 +49,7 @@
 #include <zlib.h>
 
 #include "avcodec.h"
+#include "internal.h"
 #include "put_bits.h"
 #include "bytestream.h"
 
@@ -194,11 +195,10 @@ static int encode_bitstream(FlashSVContext *s, AVFrame *p, uint8_t *buf,
 }
 
 
-static int flashsv_encode_frame(AVCodecContext *avctx, uint8_t *buf,
-                                int buf_size, void *data)
+static int flashsv_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                const AVFrame *pict, int *got_packet)
 {
     FlashSVContext * const s = avctx->priv_data;
-    AVFrame *pict = data;
     AVFrame * const p = &s->frame;
     uint8_t *pfptr;
     int res;
@@ -228,15 +228,15 @@ static int flashsv_encode_frame(AVCodecContext *avctx, uint8_t *buf,
         I_frame = 1;
     }
 
-    if (buf_size < s->image_width * s->image_height * 3) {
+    if ((res = ff_alloc_packet(pkt, s->image_width * s->image_height * 3)) < 0) {
         //Conservative upper bound check for compressed data
-        av_log(avctx, AV_LOG_ERROR, "buf_size %d <  %d\n",
-               buf_size, s->image_width * s->image_height * 3);
-        return -1;
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n",
+               s->image_width * s->image_height * 3);
+        return res;
     }
 
-    res = encode_bitstream(s, p, buf, buf_size, opt_w * 16, opt_h * 16,
-                           pfptr, &I_frame);
+    pkt->size = encode_bitstream(s, p, pkt->data, pkt->size, opt_w * 16, opt_h * 16,
+                                 pfptr, &I_frame);
 
     //save the current frame
     if (p->linesize[0] > 0)
@@ -259,7 +259,11 @@ static int flashsv_encode_frame(AVCodecContext *avctx, uint8_t *buf,
 
     avctx->coded_frame = p;
 
-    return res;
+    if (p->key_frame)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
 }
 
 static av_cold int flashsv_encode_end(AVCodecContext *avctx)
@@ -281,7 +285,7 @@ AVCodec ff_flashsv_encoder = {
     .id             = CODEC_ID_FLASHSV,
     .priv_data_size = sizeof(FlashSVContext),
     .init           = flashsv_encode_init,
-    .encode         = flashsv_encode_frame,
+    .encode2        = flashsv_encode_frame,
     .close          = flashsv_encode_end,
     .pix_fmts       = (const enum PixelFormat[]){PIX_FMT_BGR24, PIX_FMT_NONE},
     .long_name      = NULL_IF_CONFIG_SMALL("Flash Screen Video"),

From d8f3365fb072d22b9aa023ababb343daf8190d51 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Wed, 22 Feb 2012 07:38:13 +0100
Subject: [PATCH 25/40] libvpxenc: switch to encode2().

---
 libavcodec/libvpxenc.c | 53 +++++++++++++++++++++---------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/libavcodec/libvpxenc.c b/libavcodec/libvpxenc.c
index bba79733ed..4a240e5150 100644
--- a/libavcodec/libvpxenc.c
+++ b/libavcodec/libvpxenc.c
@@ -362,33 +362,33 @@ static inline void cx_pktcpy(struct FrameListData *dst,
 }
 
 /**
- * Store coded frame information in format suitable for return from encode().
+ * Store coded frame information in format suitable for return from encode2().
  *
- * Write buffer information from @a cx_frame to @a buf & @a buf_size.
- * Timing/frame details to @a coded_frame.
- * @return Frame size written to @a buf on success
- * @return AVERROR(EINVAL) on error
+ * Write information from @a cx_frame to @a pkt
+ * @return packet data size on success
+ * @return a negative AVERROR on error
  */
 static int storeframe(AVCodecContext *avctx, struct FrameListData *cx_frame,
-                      uint8_t *buf, int buf_size, AVFrame *coded_frame)
+                      AVPacket *pkt, AVFrame *coded_frame)
 {
-    if ((int) cx_frame->sz <= buf_size) {
-        buf_size = cx_frame->sz;
-        memcpy(buf, cx_frame->buf, buf_size);
+    int ret = ff_alloc_packet(pkt, cx_frame->sz);
+    if (ret >= 0) {
+        memcpy(pkt->data, cx_frame->buf, pkt->size);
+        pkt->pts = pkt->dts    = cx_frame->pts;
         coded_frame->pts       = cx_frame->pts;
         coded_frame->key_frame = !!(cx_frame->flags & VPX_FRAME_IS_KEY);
 
-        if (coded_frame->key_frame)
+        if (coded_frame->key_frame) {
             coded_frame->pict_type = AV_PICTURE_TYPE_I;
-        else
+            pkt->flags            |= AV_PKT_FLAG_KEY;
+        } else
             coded_frame->pict_type = AV_PICTURE_TYPE_P;
     } else {
         av_log(avctx, AV_LOG_ERROR,
-               "Compressed frame larger than storage provided! (%zu/%d)\n",
-               cx_frame->sz, buf_size);
-        return AVERROR(EINVAL);
+               "Error getting output packet of size %zu.\n", cx_frame->sz);
+        return ret;
     }
-    return buf_size;
+    return pkt->size;
 }
 
 /**
@@ -399,7 +399,7 @@ static int storeframe(AVCodecContext *avctx, struct FrameListData *cx_frame,
  * @return AVERROR(EINVAL) on output size error
  * @return AVERROR(ENOMEM) on coded frame queue data allocation error
  */
-static int queue_frames(AVCodecContext *avctx, uint8_t *buf, int buf_size,
+static int queue_frames(AVCodecContext *avctx, AVPacket *pkt_out,
                         AVFrame *coded_frame)
 {
     VP8Context *ctx = avctx->priv_data;
@@ -410,9 +410,9 @@ static int queue_frames(AVCodecContext *avctx, uint8_t *buf, int buf_size,
     if (ctx->coded_frame_list) {
         struct FrameListData *cx_frame = ctx->coded_frame_list;
         /* return the leading frame if we've already begun queueing */
-        size = storeframe(avctx, cx_frame, buf, buf_size, coded_frame);
+        size = storeframe(avctx, cx_frame, pkt_out, coded_frame);
         if (size < 0)
-            return AVERROR(EINVAL);
+            return size;
         ctx->coded_frame_list = cx_frame->next;
         free_coded_frame(cx_frame);
     }
@@ -429,9 +429,9 @@ static int queue_frames(AVCodecContext *avctx, uint8_t *buf, int buf_size,
                    provided a frame for output */
                 assert(!ctx->coded_frame_list);
                 cx_pktcpy(&cx_frame, pkt);
-                size = storeframe(avctx, &cx_frame, buf, buf_size, coded_frame);
+                size = storeframe(avctx, &cx_frame, pkt_out, coded_frame);
                 if (size < 0)
-                    return AVERROR(EINVAL);
+                    return size;
             } else {
                 struct FrameListData *cx_frame =
                     av_malloc(sizeof(struct FrameListData));
@@ -477,11 +477,10 @@ static int queue_frames(AVCodecContext *avctx, uint8_t *buf, int buf_size,
     return size;
 }
 
-static int vp8_encode(AVCodecContext *avctx, uint8_t *buf, int buf_size,
-                      void *data)
+static int vp8_encode(AVCodecContext *avctx, AVPacket *pkt,
+                      const AVFrame *frame, int *got_packet)
 {
     VP8Context *ctx = avctx->priv_data;
-    AVFrame *frame = data;
     struct vpx_image *rawimg = NULL;
     int64_t timestamp = 0;
     int res, coded_size;
@@ -503,7 +502,7 @@ static int vp8_encode(AVCodecContext *avctx, uint8_t *buf, int buf_size,
         log_encoder_error(avctx, "Error encoding frame");
         return AVERROR_INVALIDDATA;
     }
-    coded_size = queue_frames(avctx, buf, buf_size, avctx->coded_frame);
+    coded_size = queue_frames(avctx, pkt, avctx->coded_frame);
 
     if (!frame && avctx->flags & CODEC_FLAG_PASS1) {
         unsigned int b64_size = AV_BASE64_SIZE(ctx->twopass_stats.sz);
@@ -517,7 +516,9 @@ static int vp8_encode(AVCodecContext *avctx, uint8_t *buf, int buf_size,
         av_base64_encode(avctx->stats_out, b64_size, ctx->twopass_stats.buf,
                          ctx->twopass_stats.sz);
     }
-    return coded_size;
+
+    *got_packet = !!coded_size;
+    return 0;
 }
 
 #define OFFSET(x) offsetof(VP8Context, x)
@@ -570,7 +571,7 @@ AVCodec ff_libvpx_encoder = {
     .id             = CODEC_ID_VP8,
     .priv_data_size = sizeof(VP8Context),
     .init           = vp8_init,
-    .encode         = vp8_encode,
+    .encode2        = vp8_encode,
     .close          = vp8_free,
     .capabilities   = CODEC_CAP_DELAY | CODEC_CAP_AUTO_THREADS,
     .pix_fmts = (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_NONE},

From 577fed3b7cef50d9d0f2e8a4b2f215129da49b4e Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 5 Jan 2012 16:55:33 +0100
Subject: [PATCH 26/40] gifenc: switch to encode2().

---
 libavcodec/gif.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/libavcodec/gif.c b/libavcodec/gif.c
index c7e7bcd185..6b190f1d0c 100644
--- a/libavcodec/gif.c
+++ b/libavcodec/gif.c
@@ -43,6 +43,7 @@
 
 #include "avcodec.h"
 #include "bytestream.h"
+#include "internal.h"
 #include "lzw.h"
 
 /* The GIF format uses reversed order for bitstreams... */
@@ -141,20 +142,32 @@ static av_cold int gif_encode_init(AVCodecContext *avctx)
 }
 
 /* better than nothing gif encoder */
-static int gif_encode_frame(AVCodecContext *avctx, unsigned char *outbuf, int buf_size, void *data)
+static int gif_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                            const AVFrame *pict, int *got_packet)
 {
     GIFContext *s = avctx->priv_data;
-    AVFrame *pict = data;
     AVFrame *const p = (AVFrame *)&s->picture;
-    uint8_t *outbuf_ptr = outbuf;
-    uint8_t *end = outbuf + buf_size;
+    uint8_t *outbuf_ptr, *end;
+    int ret;
+
+    if ((ret = ff_alloc_packet(pkt, avctx->width*avctx->height*7/5 + FF_MIN_BUFFER_SIZE)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+        return ret;
+    }
+    outbuf_ptr = pkt->data;
+    end        = pkt->data + pkt->size;
 
     *p = *pict;
     p->pict_type = AV_PICTURE_TYPE_I;
     p->key_frame = 1;
     gif_image_write_header(avctx, &outbuf_ptr, (uint32_t *)pict->data[1]);
     gif_image_write_image(avctx, &outbuf_ptr, end, pict->data[0], pict->linesize[0]);
-    return outbuf_ptr - outbuf;
+
+    pkt->size   = outbuf_ptr - pkt->data;
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
 }
 
 static int gif_encode_close(AVCodecContext *avctx)
@@ -172,7 +185,7 @@ AVCodec ff_gif_encoder = {
     .id             = CODEC_ID_GIF,
     .priv_data_size = sizeof(GIFContext),
     .init           = gif_encode_init,
-    .encode         = gif_encode_frame,
+    .encode2        = gif_encode_frame,
     .close          = gif_encode_close,
     .pix_fmts= (const enum PixelFormat[]){PIX_FMT_RGB8, PIX_FMT_BGR8, PIX_FMT_RGB4_BYTE, PIX_FMT_BGR4_BYTE, PIX_FMT_GRAY8, PIX_FMT_PAL8, PIX_FMT_NONE},
     .long_name= NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),

From 2257f66ec5b4628ca24410b4cebe0e0842a4298d Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Wed, 22 Feb 2012 21:19:35 +0100
Subject: [PATCH 27/40] lavc: drop libdirac encoder.

Libschroedinger is now the preferred way to encode Dirac video; it
produces better output and has a nicer API.
---
 configure                |   5 +-
 libavcodec/Makefile      |   1 -
 libavcodec/allcodecs.c   |   2 +-
 libavcodec/libdiracenc.c | 405 ---------------------------------------
 4 files changed, 3 insertions(+), 410 deletions(-)
 delete mode 100644 libavcodec/libdiracenc.c

diff --git a/configure b/configure
index cbeffbb6f6..6278b9ab8e 100755
--- a/configure
+++ b/configure
@@ -1416,7 +1416,6 @@ h264_parser_select="golomb h264chroma h264dsp h264pred"
 
 # external libraries
 libdirac_decoder_deps="libdirac !libschroedinger"
-libdirac_encoder_deps="libdirac"
 libfaac_encoder_deps="libfaac"
 libgsm_decoder_deps="libgsm"
 libgsm_encoder_deps="libgsm"
@@ -2937,8 +2936,8 @@ enabled avisynth   && require2 vfw32 "windows.h vfw.h" AVIFileInit -lavifil32
 enabled frei0r     && { check_header frei0r.h || die "ERROR: frei0r.h header not found"; }
 enabled gnutls     && require_pkg_config gnutls gnutls/gnutls.h gnutls_global_init
 enabled libdirac   && require_pkg_config dirac                          \
-    "libdirac_decoder/dirac_parser.h libdirac_encoder/dirac_encoder.h"  \
-    "dirac_decoder_init dirac_encoder_init"
+    "libdirac_decoder/dirac_parser.h"  \
+    "dirac_decoder_init"
 enabled libfaac    && require2 libfaac "stdint.h faac.h" faacEncGetVersion -lfaac
 enabled libfreetype && require_pkg_config freetype2 "ft2build.h freetype/freetype.h" FT_Init_FreeType
 enabled libgsm     && require  libgsm gsm/gsm.h gsm_create -lgsm
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index a98ff5b52a..b177450563 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -573,7 +573,6 @@ OBJS-$(CONFIG_WTV_DEMUXER)             += mpeg4audio.o mpegaudiodata.o
 
 # external codec libraries
 OBJS-$(CONFIG_LIBDIRAC_DECODER)           += libdiracdec.o
-OBJS-$(CONFIG_LIBDIRAC_ENCODER)           += libdiracenc.o libdirac_libschro.o
 OBJS-$(CONFIG_LIBFAAC_ENCODER)            += libfaac.o
 OBJS-$(CONFIG_LIBGSM_DECODER)             += libgsm.o
 OBJS-$(CONFIG_LIBGSM_ENCODER)             += libgsm.o
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index c5a96ebc30..3d3289c4ee 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -371,7 +371,7 @@ void avcodec_register_all(void)
     REGISTER_ENCDEC  (XSUB, xsub);
 
     /* external libraries */
-    REGISTER_ENCDEC  (LIBDIRAC, libdirac);
+    REGISTER_DECODER (LIBDIRAC, libdirac);
     REGISTER_ENCODER (LIBFAAC, libfaac);
     REGISTER_ENCDEC  (LIBGSM, libgsm);
     REGISTER_ENCDEC  (LIBGSM_MS, libgsm_ms);
diff --git a/libavcodec/libdiracenc.c b/libavcodec/libdiracenc.c
deleted file mode 100644
index 156ba57718..0000000000
--- a/libavcodec/libdiracenc.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Dirac encoding support via libdirac library
- * Copyright (c) 2005 BBC, Andrew Kennedy <dirac at rd dot bbc dot co dot uk>
- * Copyright (c) 2006-2008 BBC, Anuradha Suraparaju <asuraparaju at gmail dot com >
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
-* @file
-* Dirac encoding support via libdirac library; more details about the
-* Dirac project can be found at http://dirac.sourceforge.net/.
-* The libdirac_encoder library implements Dirac specification version 2.2
-* (http://dirac.sourceforge.net/specification.html).
-*/
-
-#include "libdirac_libschro.h"
-#include "libdirac.h"
-
-#undef NDEBUG
-#include <assert.h>
-
-
-#include <libdirac_encoder/dirac_encoder.h>
-
-/** Dirac encoder private data */
-typedef struct DiracEncoderParams {
-    /** Dirac encoder context */
-    dirac_encoder_context_t enc_ctx;
-
-    /** frame being encoded */
-    AVFrame picture;
-
-    /** frame size */
-    int frame_size;
-
-    /** Dirac encoder handle */
-    dirac_encoder_t* p_encoder;
-
-    /** input frame buffer */
-    unsigned char *p_in_frame_buf;
-
-    /** buffer to store encoder output before writing it to the frame queue */
-    unsigned char *enc_buf;
-
-    /** size of encoder buffer */
-    int enc_buf_size;
-
-    /** queue storing encoded frames */
-    DiracSchroQueue enc_frame_queue;
-
-    /** end of sequence signalled by user, 0 - false, 1 - true */
-    int eos_signalled;
-
-    /** end of sequence returned by encoder, 0 - false, 1 - true */
-    int eos_pulled;
-} DiracEncoderParams;
-
-/**
-* Works out Dirac-compatible chroma format.
-*/
-static dirac_chroma_t GetDiracChromaFormat(enum PixelFormat ff_pix_fmt)
-{
-    int num_formats = sizeof(dirac_pixel_format_map) /
-                      sizeof(dirac_pixel_format_map[0]);
-    int idx;
-
-    for (idx = 0; idx < num_formats; ++idx)
-        if (dirac_pixel_format_map[idx].ff_pix_fmt == ff_pix_fmt)
-            return dirac_pixel_format_map[idx].dirac_pix_fmt;
-    return formatNK;
-}
-
-/**
-* Dirac video preset table. Ensure that this tables matches up correctly
-* with the ff_dirac_schro_video_format_info table in libdirac_libschro.c.
-*/
-static const VideoFormat ff_dirac_video_formats[]={
-    VIDEO_FORMAT_CUSTOM           ,
-    VIDEO_FORMAT_QSIF525          ,
-    VIDEO_FORMAT_QCIF             ,
-    VIDEO_FORMAT_SIF525           ,
-    VIDEO_FORMAT_CIF              ,
-    VIDEO_FORMAT_4SIF525          ,
-    VIDEO_FORMAT_4CIF             ,
-    VIDEO_FORMAT_SD_480I60        ,
-    VIDEO_FORMAT_SD_576I50        ,
-    VIDEO_FORMAT_HD_720P60        ,
-    VIDEO_FORMAT_HD_720P50        ,
-    VIDEO_FORMAT_HD_1080I60       ,
-    VIDEO_FORMAT_HD_1080I50       ,
-    VIDEO_FORMAT_HD_1080P60       ,
-    VIDEO_FORMAT_HD_1080P50       ,
-    VIDEO_FORMAT_DIGI_CINEMA_2K24 ,
-    VIDEO_FORMAT_DIGI_CINEMA_4K24 ,
-};
-
-/**
-* Returns the video format preset matching the input video dimensions and
-* time base.
-*/
-static VideoFormat GetDiracVideoFormatPreset(AVCodecContext *avccontext)
-{
-    unsigned int num_formats = sizeof(ff_dirac_video_formats) /
-                               sizeof(ff_dirac_video_formats[0]);
-
-    unsigned int idx = ff_dirac_schro_get_video_format_idx(avccontext);
-
-    return (idx < num_formats) ?
-                 ff_dirac_video_formats[idx] : VIDEO_FORMAT_CUSTOM;
-}
-
-static av_cold int libdirac_encode_init(AVCodecContext *avccontext)
-{
-
-    DiracEncoderParams* p_dirac_params = avccontext->priv_data;
-    int no_local = 1;
-    int verbose  = avccontext->debug;
-    VideoFormat preset;
-
-    /* get Dirac preset */
-    preset = GetDiracVideoFormatPreset(avccontext);
-
-    /* initialize the encoder context */
-    dirac_encoder_context_init(&p_dirac_params->enc_ctx, preset);
-
-    p_dirac_params->enc_ctx.src_params.chroma = GetDiracChromaFormat(avccontext->pix_fmt);
-
-    if (p_dirac_params->enc_ctx.src_params.chroma == formatNK) {
-        av_log(avccontext, AV_LOG_ERROR,
-               "Unsupported pixel format %d. This codec supports only "
-               "Planar YUV formats (yuv420p, yuv422p, yuv444p\n",
-               avccontext->pix_fmt);
-        return -1;
-    }
-
-    p_dirac_params->enc_ctx.src_params.frame_rate.numerator   = avccontext->time_base.den;
-    p_dirac_params->enc_ctx.src_params.frame_rate.denominator = avccontext->time_base.num;
-
-    p_dirac_params->enc_ctx.src_params.width  = avccontext->width;
-    p_dirac_params->enc_ctx.src_params.height = avccontext->height;
-
-    p_dirac_params->frame_size = avpicture_get_size(avccontext->pix_fmt,
-                                                    avccontext->width,
-                                                    avccontext->height);
-
-    avccontext->coded_frame = &p_dirac_params->picture;
-
-    if (no_local) {
-        p_dirac_params->enc_ctx.decode_flag = 0;
-        p_dirac_params->enc_ctx.instr_flag  = 0;
-    } else {
-        p_dirac_params->enc_ctx.decode_flag = 1;
-        p_dirac_params->enc_ctx.instr_flag  = 1;
-    }
-
-    /* Intra-only sequence */
-    if (!avccontext->gop_size) {
-        p_dirac_params->enc_ctx.enc_params.num_L1 = 0;
-        if (avccontext->coder_type == FF_CODER_TYPE_VLC)
-            p_dirac_params->enc_ctx.enc_params.using_ac = 0;
-    } else
-        avccontext->has_b_frames = 1;
-
-    if (avccontext->flags & CODEC_FLAG_QSCALE) {
-        if (avccontext->global_quality) {
-            p_dirac_params->enc_ctx.enc_params.qf = avccontext->global_quality
-                                                    / (FF_QP2LAMBDA * 10.0);
-            /* if it is not default bitrate then send target rate. */
-            if (avccontext->bit_rate >= 1000 &&
-                avccontext->bit_rate != 200000)
-                p_dirac_params->enc_ctx.enc_params.trate = avccontext->bit_rate
-                                                           / 1000;
-        } else
-            p_dirac_params->enc_ctx.enc_params.lossless = 1;
-    } else if (avccontext->bit_rate >= 1000)
-        p_dirac_params->enc_ctx.enc_params.trate = avccontext->bit_rate / 1000;
-
-    if ((preset > VIDEO_FORMAT_QCIF || preset < VIDEO_FORMAT_QSIF525) &&
-         avccontext->bit_rate == 200000)
-        p_dirac_params->enc_ctx.enc_params.trate = 0;
-
-    if (avccontext->flags & CODEC_FLAG_INTERLACED_ME)
-        /* all material can be coded as interlaced or progressive
-         * irrespective of the type of source material */
-        p_dirac_params->enc_ctx.enc_params.picture_coding_mode = 1;
-
-    p_dirac_params->p_encoder = dirac_encoder_init(&p_dirac_params->enc_ctx,
-                                                   verbose);
-
-    if (!p_dirac_params->p_encoder) {
-        av_log(avccontext, AV_LOG_ERROR,
-               "Unrecoverable Error: dirac_encoder_init failed. ");
-        return EXIT_FAILURE;
-    }
-
-    /* allocate enough memory for the incoming data */
-    p_dirac_params->p_in_frame_buf = av_malloc(p_dirac_params->frame_size);
-
-    /* initialize the encoded frame queue */
-    ff_dirac_schro_queue_init(&p_dirac_params->enc_frame_queue);
-
-    return 0;
-}
-
-static void DiracFreeFrame(void *data)
-{
-    DiracSchroEncodedFrame *enc_frame = data;
-
-    av_freep(&enc_frame->p_encbuf);
-    av_free(enc_frame);
-}
-
-static int libdirac_encode_frame(AVCodecContext *avccontext,
-                                 unsigned char *frame,
-                                 int buf_size, void *data)
-{
-    int enc_size = 0;
-    dirac_encoder_state_t state;
-    DiracEncoderParams     *p_dirac_params      = avccontext->priv_data;
-    DiracSchroEncodedFrame *p_frame_output      = NULL;
-    DiracSchroEncodedFrame *p_next_output_frame = NULL;
-    int go = 1;
-    int last_frame_in_sequence = 0;
-
-    if (!data) {
-        /* push end of sequence if not already signalled */
-        if (!p_dirac_params->eos_signalled) {
-            dirac_encoder_end_sequence(p_dirac_params->p_encoder);
-            p_dirac_params->eos_signalled = 1;
-        }
-    } else {
-
-        /* Allocate frame data to Dirac input buffer.
-         * Input line size may differ from what the codec supports,
-         * especially when transcoding from one format to another.
-         * So use avpicture_layout to copy the frame. */
-        avpicture_layout((AVPicture *)data, avccontext->pix_fmt,
-                         avccontext->width, avccontext->height,
-                         p_dirac_params->p_in_frame_buf,
-                         p_dirac_params->frame_size);
-
-        /* load next frame */
-        if (dirac_encoder_load(p_dirac_params->p_encoder,
-                               p_dirac_params->p_in_frame_buf,
-                               p_dirac_params->frame_size) < 0) {
-            av_log(avccontext, AV_LOG_ERROR, "Unrecoverable Encoder Error."
-                   " dirac_encoder_load failed...\n");
-            return -1;
-        }
-    }
-
-    if (p_dirac_params->eos_pulled)
-        go = 0;
-
-    while (go) {
-        p_dirac_params->p_encoder->enc_buf.buffer = frame;
-        p_dirac_params->p_encoder->enc_buf.size   = buf_size;
-        /* process frame */
-        state = dirac_encoder_output(p_dirac_params->p_encoder);
-
-        switch (state) {
-        case ENC_STATE_AVAIL:
-        case ENC_STATE_EOS:
-            assert(p_dirac_params->p_encoder->enc_buf.size > 0);
-
-            /* All non-frame data is prepended to actual frame data to
-             * be able to set the pts correctly. So we don't write data
-             * to the frame output queue until we actually have a frame
-             */
-
-            p_dirac_params->enc_buf = av_realloc(p_dirac_params->enc_buf,
-                                                 p_dirac_params->enc_buf_size +
-                                                 p_dirac_params->p_encoder->enc_buf.size);
-            memcpy(p_dirac_params->enc_buf + p_dirac_params->enc_buf_size,
-                   p_dirac_params->p_encoder->enc_buf.buffer,
-                   p_dirac_params->p_encoder->enc_buf.size);
-
-            p_dirac_params->enc_buf_size += p_dirac_params->p_encoder->enc_buf.size;
-
-            if (state == ENC_STATE_EOS) {
-                p_dirac_params->eos_pulled = 1;
-                go = 0;
-            }
-
-            /* If non-frame data, don't output it until it we get an
-             * encoded frame back from the encoder. */
-            if (p_dirac_params->p_encoder->enc_pparams.pnum == -1)
-                break;
-
-            /* create output frame */
-            p_frame_output = av_mallocz(sizeof(DiracSchroEncodedFrame));
-            /* set output data */
-            p_frame_output->size      = p_dirac_params->enc_buf_size;
-            p_frame_output->p_encbuf  = p_dirac_params->enc_buf;
-            p_frame_output->frame_num = p_dirac_params->p_encoder->enc_pparams.pnum;
-
-            if (p_dirac_params->p_encoder->enc_pparams.ptype == INTRA_PICTURE &&
-                p_dirac_params->p_encoder->enc_pparams.rtype == REFERENCE_PICTURE)
-                p_frame_output->key_frame = 1;
-
-            ff_dirac_schro_queue_push_back(&p_dirac_params->enc_frame_queue,
-                                           p_frame_output);
-
-            p_dirac_params->enc_buf_size = 0;
-            p_dirac_params->enc_buf      = NULL;
-            break;
-
-        case ENC_STATE_BUFFER:
-            go = 0;
-            break;
-
-        case ENC_STATE_INVALID:
-            av_log(avccontext, AV_LOG_ERROR,
-                   "Unrecoverable Dirac Encoder Error. Quitting...\n");
-            return -1;
-
-        default:
-            av_log(avccontext, AV_LOG_ERROR, "Unknown Dirac Encoder state\n");
-            return -1;
-        }
-    }
-
-    /* copy 'next' frame in queue */
-
-    if (p_dirac_params->enc_frame_queue.size == 1 && p_dirac_params->eos_pulled)
-        last_frame_in_sequence = 1;
-
-    p_next_output_frame = ff_dirac_schro_queue_pop(&p_dirac_params->enc_frame_queue);
-
-    if (!p_next_output_frame)
-        return 0;
-
-    memcpy(frame, p_next_output_frame->p_encbuf, p_next_output_frame->size);
-    avccontext->coded_frame->key_frame = p_next_output_frame->key_frame;
-    /* Use the frame number of the encoded frame as the pts. It is OK to do
-     * so since Dirac is a constant framerate codec. It expects input to be
-     * of constant framerate. */
-    avccontext->coded_frame->pts = p_next_output_frame->frame_num;
-    enc_size = p_next_output_frame->size;
-
-    /* Append the end of sequence information to the last frame in the
-     * sequence. */
-    if (last_frame_in_sequence && p_dirac_params->enc_buf_size > 0) {
-        memcpy(frame + enc_size, p_dirac_params->enc_buf,
-               p_dirac_params->enc_buf_size);
-        enc_size += p_dirac_params->enc_buf_size;
-        av_freep(&p_dirac_params->enc_buf);
-        p_dirac_params->enc_buf_size = 0;
-    }
-
-    /* free frame */
-    DiracFreeFrame(p_next_output_frame);
-
-    return enc_size;
-}
-
-static av_cold int libdirac_encode_close(AVCodecContext *avccontext)
-{
-    DiracEncoderParams *p_dirac_params = avccontext->priv_data;
-
-    /* close the encoder */
-    dirac_encoder_close(p_dirac_params->p_encoder);
-
-    /* free data in the output frame queue */
-    ff_dirac_schro_queue_free(&p_dirac_params->enc_frame_queue,
-                              DiracFreeFrame);
-
-    /* free the encoder buffer */
-    if (p_dirac_params->enc_buf_size)
-        av_freep(&p_dirac_params->enc_buf);
-
-    /* free the input frame buffer */
-    av_freep(&p_dirac_params->p_in_frame_buf);
-
-    return 0;
-}
-
-
-AVCodec ff_libdirac_encoder = {
-    .name           = "libdirac",
-    .type           = AVMEDIA_TYPE_VIDEO,
-    .id             = CODEC_ID_DIRAC,
-    .priv_data_size = sizeof(DiracEncoderParams),
-    .init           = libdirac_encode_init,
-    .encode         = libdirac_encode_frame,
-    .close          = libdirac_encode_close,
-   .capabilities = CODEC_CAP_DELAY,
-   .pix_fmts = (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_YUV422P, PIX_FMT_YUV444P, PIX_FMT_NONE},
-   .long_name = NULL_IF_CONFIG_SMALL("libdirac Dirac 2.2"),
-};

From 62acb4937e06a3802735106a9d9fa48db87d0d36 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 23 Feb 2012 08:51:45 +0100
Subject: [PATCH 28/40] tiffenc: properly forward error codes in
 encode_frame().

---
 libavcodec/tiffenc.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/libavcodec/tiffenc.c b/libavcodec/tiffenc.c
index 0aa9740a99..3d87568d99 100644
--- a/libavcodec/tiffenc.c
+++ b/libavcodec/tiffenc.c
@@ -207,7 +207,6 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
     AVFrame *pict = data;
     AVFrame *const p = (AVFrame *) & s->picture;
     int i;
-    int n;
     uint8_t *ptr = buf;
     uint8_t *offset;
     uint32_t strips;
@@ -216,7 +215,7 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
     int bytes_per_row;
     uint32_t res[2] = { 72, 1 };        // image resolution (72/1)
     static const uint16_t bpp_tab[] = { 8, 8, 8, 8 };
-    int ret = -1;
+    int ret;
     int is_yuv = 0;
     uint8_t *yuv_line = NULL;
     int shift_h, shift_v;
@@ -332,13 +331,13 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
                        p->data[0] + j * p->linesize[0], bytes_per_row);
             zn += bytes_per_row;
         }
-        n = encode_strip(s, zbuf, ptr, zn, s->compr);
+        ret = encode_strip(s, zbuf, ptr, zn, s->compr);
         av_free(zbuf);
-        if (n<0) {
+        if (ret < 0) {
             av_log(s->avctx, AV_LOG_ERROR, "Encode strip failed\n");
             goto fail;
         }
-        ptr += n;
+        ptr += ret;
         strip_sizes[0] = ptr - buf - strip_offsets[0];
     } else
 #endif
@@ -355,20 +354,19 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
             }
             if (is_yuv){
                  pack_yuv(s, yuv_line, i);
-                 n = encode_strip(s, yuv_line, ptr, bytes_per_row, s->compr);
+                 ret = encode_strip(s, yuv_line, ptr, bytes_per_row, s->compr);
                  i += s->subsampling[1] - 1;
             }
             else
-                n = encode_strip(s, p->data[0] + i * p->linesize[0],
+                ret = encode_strip(s, p->data[0] + i * p->linesize[0],
                         ptr, bytes_per_row, s->compr);
-            if (n < 0) {
+            if (ret < 0) {
                 av_log(s->avctx, AV_LOG_ERROR, "Encode strip failed\n");
                 goto fail;
             }
-            strip_sizes[i / s->rps] += n;
-            ptr += n;
+            strip_sizes[i / s->rps] += ret;
+            ptr += ret;
             if(s->compr == TIFF_LZW && (i==s->height-1 || i%s->rps == s->rps-1)){
-                int ret;
                 ret = ff_lzw_encode_flush(s->lzws, flush_put_bits);
                 strip_sizes[(i / s->rps )] += ret ;
                 ptr += ret;
@@ -422,8 +420,10 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
     }
     bytestream_put_le32(&offset, ptr - buf);    // write offset to dir
 
-    if (check_size(s, 6 + s->num_entries * 12))
+    if (check_size(s, 6 + s->num_entries * 12)) {
+        ret = AVERROR(EINVAL);
         goto fail;
+    }
     bytestream_put_le16(&ptr, s->num_entries);  // write tag count
     bytestream_put_buffer(&ptr, s->entries, s->num_entries * 12);
     bytestream_put_le32(&ptr, 0);

From 760b004086265bf12091698c729bd507cb9829da Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 23 Feb 2012 09:00:34 +0100
Subject: [PATCH 29/40] tiffenc: switch to encode2().

---
 libavcodec/tiffenc.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/libavcodec/tiffenc.c b/libavcodec/tiffenc.c
index 3d87568d99..f85b1577ca 100644
--- a/libavcodec/tiffenc.c
+++ b/libavcodec/tiffenc.c
@@ -200,14 +200,13 @@ static void pack_yuv(TiffEncoderContext * s, uint8_t * dst, int lnum)
     }
 }
 
-static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
-                        int buf_size, void *data)
+static int encode_frame(AVCodecContext * avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
 {
     TiffEncoderContext *s = avctx->priv_data;
-    AVFrame *pict = data;
     AVFrame *const p = (AVFrame *) & s->picture;
     int i;
-    uint8_t *ptr = buf;
+    uint8_t *ptr;
     uint8_t *offset;
     uint32_t strips;
     uint32_t *strip_sizes = NULL;
@@ -221,9 +220,6 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
     int shift_h, shift_v;
 
     s->avctx = avctx;
-    s->buf_start = buf;
-    s->buf = &ptr;
-    s->buf_size = buf_size;
 
     *p = *pict;
     p->pict_type = AV_PICTURE_TYPE_I;
@@ -287,6 +283,17 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
 
     strips = (s->height - 1) / s->rps + 1;
 
+    if (!pkt->data &&
+        (ret = av_new_packet(pkt, avctx->width * avctx->height * s->bpp * 2 +
+                                  avctx->height * 4 + FF_MIN_BUFFER_SIZE)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+        return ret;
+    }
+    ptr          = pkt->data;
+    s->buf_start = pkt->data;
+    s->buf       = &ptr;
+    s->buf_size  = pkt->size;
+
     if (check_size(s, 8))
         goto fail;
 
@@ -318,7 +325,7 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
 
         zlen = bytes_per_row * s->rps;
         zbuf = av_malloc(zlen);
-        strip_offsets[0] = ptr - buf;
+        strip_offsets[0] = ptr - pkt->data;
         zn = 0;
         for (j = 0; j < s->rps; j++) {
             if (is_yuv){
@@ -338,7 +345,7 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
             goto fail;
         }
         ptr += ret;
-        strip_sizes[0] = ptr - buf - strip_offsets[0];
+        strip_sizes[0] = ptr - pkt->data - strip_offsets[0];
     } else
 #endif
     {
@@ -350,7 +357,7 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
                     ff_lzw_encode_init(s->lzws, ptr, s->buf_size - (*s->buf - s->buf_start),
                                        12, FF_LZW_TIFF, put_bits);
                 }
-                strip_offsets[i / s->rps] = ptr - buf;
+                strip_offsets[i / s->rps] = ptr - pkt->data;
             }
             if (is_yuv){
                  pack_yuv(s, yuv_line, i);
@@ -418,7 +425,7 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
         add_entry(s, TIFF_YCBCR_SUBSAMPLING, TIFF_SHORT,    2, s->subsampling);
         add_entry(s, TIFF_REFERENCE_BW,      TIFF_RATIONAL, 6, refbw);
     }
-    bytestream_put_le32(&offset, ptr - buf);    // write offset to dir
+    bytestream_put_le32(&offset, ptr - pkt->data);    // write offset to dir
 
     if (check_size(s, 6 + s->num_entries * 12)) {
         ret = AVERROR(EINVAL);
@@ -428,7 +435,9 @@ static int encode_frame(AVCodecContext * avctx, unsigned char *buf,
     bytestream_put_buffer(&ptr, s->entries, s->num_entries * 12);
     bytestream_put_le32(&ptr, 0);
 
-    ret = ptr - buf;
+    pkt->size   = ptr - pkt->data;
+    pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
 
 fail:
     av_free(strip_sizes);
@@ -462,7 +471,7 @@ AVCodec ff_tiff_encoder = {
     .type           = AVMEDIA_TYPE_VIDEO,
     .id             = CODEC_ID_TIFF,
     .priv_data_size = sizeof(TiffEncoderContext),
-    .encode         = encode_frame,
+    .encode2        = encode_frame,
     .pix_fmts =
         (const enum PixelFormat[]) {PIX_FMT_RGB24, PIX_FMT_PAL8, PIX_FMT_GRAY8,
                               PIX_FMT_MONOBLACK, PIX_FMT_MONOWHITE,

From 4da6d194e5b00404f4d545adcaa8e206592ae746 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 23 Feb 2012 08:19:13 +0100
Subject: [PATCH 30/40] libxvid: switch to encode2().

---
 libavcodec/libxvidff.c | 60 +++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/libavcodec/libxvidff.c b/libavcodec/libxvidff.c
index a11e4ac913..44580d1364 100644
--- a/libavcodec/libxvidff.c
+++ b/libavcodec/libxvidff.c
@@ -32,6 +32,7 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mathematics.h"
 #include "libxvid_internal.h"
+#include "mpegvideo.h"
 #if !HAVE_MKSTEMP
 #include <fcntl.h>
 #endif
@@ -73,7 +74,7 @@ struct xvid_ff_pass1 {
 };
 
 /* Prototypes - See function implementation for details */
-int xvid_strip_vol_header(AVCodecContext *avctx, unsigned char *frame, unsigned int header_len, unsigned int frame_len);
+int xvid_strip_vol_header(AVCodecContext *avctx, AVPacket *pkt, unsigned int header_len, unsigned int frame_len);
 int xvid_ff_2pass(void *ref, int opt, void *p1, void *p2);
 void xvid_correct_framerate(AVCodecContext *avctx);
 
@@ -408,17 +409,25 @@ static av_cold int xvid_encode_init(AVCodecContext *avctx)  {
  * @param data Pointer to AVFrame of unencoded frame
  * @return Returns 0 on success, -1 on failure
  */
-static int xvid_encode_frame(AVCodecContext *avctx,
-                         unsigned char *frame, int buf_size, void *data) {
-    int xerr, i;
+static int xvid_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                             const AVFrame *picture, int *got_packet)
+{
+    int xerr, i, ret, user_packet = !!pkt->data;
     char *tmp;
     struct xvid_context *x = avctx->priv_data;
-    AVFrame *picture = data;
     AVFrame *p = &x->encoded_picture;
+    int mb_width   = (avctx->width  + 15) / 16;
+    int mb_height  = (avctx->height + 15) / 16;
 
     xvid_enc_frame_t xvid_enc_frame;
     xvid_enc_stats_t xvid_enc_stats;
 
+    if (!user_packet &&
+        (ret = av_new_packet(pkt, mb_width*mb_height*MAX_MB_BYTES + FF_MIN_BUFFER_SIZE)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+        return ret;
+    }
+
     /* Start setting up the frame */
     memset(&xvid_enc_frame, 0, sizeof(xvid_enc_frame));
     xvid_enc_frame.version = XVID_VERSION;
@@ -427,8 +436,8 @@ static int xvid_encode_frame(AVCodecContext *avctx,
     *p = *picture;
 
     /* Let Xvid know where to put the frame. */
-    xvid_enc_frame.bitstream = frame;
-    xvid_enc_frame.length = buf_size;
+    xvid_enc_frame.bitstream = pkt->data;
+    xvid_enc_frame.length    = pkt->size;
 
     /* Initialize input image fields */
     if( avctx->pix_fmt != PIX_FMT_YUV420P ) {
@@ -488,7 +497,9 @@ static int xvid_encode_frame(AVCodecContext *avctx,
         }
     }
 
-    if( 0 <= xerr ) {
+    if (xerr > 0) {
+        *got_packet = 1;
+
         p->quality = xvid_enc_stats.quant * FF_QP2LAMBDA;
         if( xvid_enc_stats.type == XVID_TYPE_PVOP )
             p->pict_type = AV_PICTURE_TYPE_P;
@@ -500,14 +511,21 @@ static int xvid_encode_frame(AVCodecContext *avctx,
             p->pict_type = AV_PICTURE_TYPE_I;
         if( xvid_enc_frame.out_flags & XVID_KEYFRAME ) {
             p->key_frame = 1;
+            pkt->flags |= AV_PKT_FLAG_KEY;
             if( x->quicktime_format )
-                return xvid_strip_vol_header(avctx, frame,
+                return xvid_strip_vol_header(avctx, pkt,
                     xvid_enc_stats.hlength, xerr);
          } else
             p->key_frame = 0;
 
-        return xerr;
+        pkt->size = xerr;
+
+        return 0;
     } else {
+        if (!user_packet)
+            av_free_packet(pkt);
+        if (!xerr)
+            return 0;
         av_log(avctx, AV_LOG_ERROR, "Xvid: Encoding Error Occurred: %i\n", xerr);
         return -1;
     }
@@ -551,16 +569,16 @@ static av_cold int xvid_encode_close(AVCodecContext *avctx) {
  * @return Returns new length of frame data
  */
 int xvid_strip_vol_header(AVCodecContext *avctx,
-                  unsigned char *frame,
+                  AVPacket *pkt,
                   unsigned int header_len,
                   unsigned int frame_len) {
     int vo_len = 0, i;
 
     for( i = 0; i < header_len - 3; i++ ) {
-        if( frame[i] == 0x00 &&
-            frame[i+1] == 0x00 &&
-            frame[i+2] == 0x01 &&
-            frame[i+3] == 0xB6 ) {
+        if( pkt->data[i] == 0x00 &&
+            pkt->data[i+1] == 0x00 &&
+            pkt->data[i+2] == 0x01 &&
+            pkt->data[i+3] == 0xB6 ) {
             vo_len = i;
             break;
         }
@@ -570,15 +588,15 @@ int xvid_strip_vol_header(AVCodecContext *avctx,
         /* We need to store the header, so extract it */
         if( avctx->extradata == NULL ) {
             avctx->extradata = av_malloc(vo_len);
-            memcpy(avctx->extradata, frame, vo_len);
+            memcpy(avctx->extradata, pkt->data, vo_len);
             avctx->extradata_size = vo_len;
         }
         /* Less dangerous now, memmove properly copies the two
            chunks of overlapping data */
-        memmove(frame, &frame[vo_len], frame_len - vo_len);
-        return frame_len - vo_len;
-    } else
-        return frame_len;
+        memmove(pkt->data, &pkt->data[vo_len], frame_len - vo_len);
+        pkt->size = frame_len - vo_len;
+    }
+    return 0;
 }
 
 /**
@@ -814,7 +832,7 @@ AVCodec ff_libxvid_encoder = {
     .id             = CODEC_ID_MPEG4,
     .priv_data_size = sizeof(struct xvid_context),
     .init           = xvid_encode_init,
-    .encode         = xvid_encode_frame,
+    .encode2        = xvid_encode_frame,
     .close          = xvid_encode_close,
     .pix_fmts= (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_NONE},
     .long_name= NULL_IF_CONFIG_SMALL("libxvidcore MPEG-4 part 2"),

From cfc6ab3199e7ac4197744e17e82144d857886096 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 23 Feb 2012 10:04:57 +0100
Subject: [PATCH 31/40] a64multienc: don't write into output buffer when
 there's no output.

---
 libavcodec/a64multienc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libavcodec/a64multienc.c b/libavcodec/a64multienc.c
index 532f2a2824..d50586e84b 100644
--- a/libavcodec/a64multienc.c
+++ b/libavcodec/a64multienc.c
@@ -317,8 +317,6 @@ static int a64multi_encode_frame(AVCodecContext *avctx, unsigned char *buf,
             charset  += charset_size;
             req_size += charset_size;
         }
-        /* no charset so clean buf */
-        else memset(buf, 0, charset_size);
 
         /* write x frames to buf */
         for (frame = 0; frame < c->mc_lifetime; frame++) {

From 7340008f18dc7d1557efbf5a331c9452913f7f61 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 23 Feb 2012 10:21:07 +0100
Subject: [PATCH 32/40] a64multienc: switch to encode2().

We have no decoder, so cannot test if the output is decodable.
---
 libavcodec/a64enc.h      |  3 +++
 libavcodec/a64multienc.c | 40 ++++++++++++++++++++++++++--------------
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/libavcodec/a64enc.h b/libavcodec/a64enc.h
index b64b952fe4..bf5eb02cd1 100644
--- a/libavcodec/a64enc.h
+++ b/libavcodec/a64enc.h
@@ -50,6 +50,9 @@ typedef struct A64Context {
     uint8_t *mc_colram;
     uint8_t *mc_palette;
     int mc_pal_size;
+
+    /* pts of the next packet that will be output */
+    int64_t next_pts;
 } A64Context;
 
 #endif /* AVCODEC_A64ENC_H */
diff --git a/libavcodec/a64multienc.c b/libavcodec/a64multienc.c
index d50586e84b..cb2425fdf9 100644
--- a/libavcodec/a64multienc.c
+++ b/libavcodec/a64multienc.c
@@ -28,6 +28,7 @@
 #include "a64colors.h"
 #include "a64tables.h"
 #include "elbg.h"
+#include "internal.h"
 #include "libavutil/intreadwrite.h"
 
 #define DITHERSTEPS   8
@@ -221,6 +222,8 @@ static av_cold int a64multi_init_encoder(AVCodecContext *avctx)
     if (!avctx->codec_tag)
          avctx->codec_tag = AV_RL32("a64m");
 
+    c->next_pts = AV_NOPTS_VALUE;
+
     return 0;
 }
 
@@ -239,11 +242,10 @@ static void a64_compress_colram(unsigned char *buf, int *charmap, uint8_t *colra
     }
 }
 
-static int a64multi_encode_frame(AVCodecContext *avctx, unsigned char *buf,
-                                 int buf_size, void *data)
+static int a64multi_encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                                 const AVFrame *pict, int *got_packet)
 {
     A64Context *c = avctx->priv_data;
-    AVFrame *pict = data;
     AVFrame *const p = (AVFrame *) & c->picture;
 
     int frame;
@@ -251,7 +253,8 @@ static int a64multi_encode_frame(AVCodecContext *avctx, unsigned char *buf,
     int b_height;
     int b_width;
 
-    int req_size;
+    int req_size, ret;
+    uint8_t *buf;
 
     int *charmap     = c->mc_charmap;
     uint8_t *colram  = c->mc_colram;
@@ -274,7 +277,7 @@ static int a64multi_encode_frame(AVCodecContext *avctx, unsigned char *buf,
     }
 
     /* no data, means end encoding asap */
-    if (!data) {
+    if (!pict) {
         /* all done, end encoding */
         if (!c->mc_lifetime) return 0;
         /* no more frames in queue, prepare to flush remaining frames */
@@ -292,6 +295,8 @@ static int a64multi_encode_frame(AVCodecContext *avctx, unsigned char *buf,
             p->key_frame = 1;
             to_meta_with_crop(avctx, p, meta + 32000 * c->mc_frame_counter);
             c->mc_frame_counter++;
+            if (c->next_pts == AV_NOPTS_VALUE)
+                c->next_pts = pict->pts;
             /* lifetime is not reached so wait for next frame first */
             return 0;
         }
@@ -302,6 +307,13 @@ static int a64multi_encode_frame(AVCodecContext *avctx, unsigned char *buf,
         req_size = 0;
         /* any frames to encode? */
         if (c->mc_lifetime) {
+            req_size = charset_size + c->mc_lifetime*(screen_size + colram_size);
+            if ((ret = ff_alloc_packet(pkt, req_size)) < 0) {
+                av_log(avctx, AV_LOG_ERROR, "Error getting output packet of size %d.\n", req_size);
+                return ret;
+            }
+            buf = pkt->data;
+
             /* calc optimal new charset + charmaps */
             ff_init_elbg(meta, 32, 1000 * c->mc_lifetime, best_cb, CHARSET_CHARS, 50, charmap, &c->randctx);
             ff_do_elbg  (meta, 32, 1000 * c->mc_lifetime, best_cb, CHARSET_CHARS, 50, charmap, &c->randctx);
@@ -310,12 +322,11 @@ static int a64multi_encode_frame(AVCodecContext *avctx, unsigned char *buf,
             render_charset(avctx, charset, colram);
 
             /* copy charset to buf */
-            memcpy(buf,charset, charset_size);
+            memcpy(buf, charset, charset_size);
 
             /* advance pointers */
             buf      += charset_size;
             charset  += charset_size;
-            req_size += charset_size;
         }
 
         /* write x frames to buf */
@@ -349,11 +360,12 @@ static int a64multi_encode_frame(AVCodecContext *avctx, unsigned char *buf,
         /* reset counter */
         c->mc_frame_counter = 0;
 
-        if (req_size > buf_size) {
-            av_log(avctx, AV_LOG_ERROR, "buf size too small (need %d, got %d)\n", req_size, buf_size);
-            return -1;
-        }
-        return req_size;
+        pkt->pts = pkt->dts = c->next_pts;
+        c->next_pts         = AV_NOPTS_VALUE;
+
+        pkt->size   = req_size;
+        pkt->flags |= AV_PKT_FLAG_KEY;
+        *got_packet = !!req_size;
     }
     return 0;
 }
@@ -364,7 +376,7 @@ AVCodec ff_a64multi_encoder = {
     .id             = CODEC_ID_A64_MULTI,
     .priv_data_size = sizeof(A64Context),
     .init           = a64multi_init_encoder,
-    .encode         = a64multi_encode_frame,
+    .encode2        = a64multi_encode_frame,
     .close          = a64multi_close_encoder,
     .pix_fmts       = (const enum PixelFormat[]) {PIX_FMT_GRAY8, PIX_FMT_NONE},
     .long_name      = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64"),
@@ -377,7 +389,7 @@ AVCodec ff_a64multi5_encoder = {
     .id             = CODEC_ID_A64_MULTI5,
     .priv_data_size = sizeof(A64Context),
     .init           = a64multi_init_encoder,
-    .encode         = a64multi_encode_frame,
+    .encode2        = a64multi_encode_frame,
     .close          = a64multi_close_encoder,
     .pix_fmts       = (const enum PixelFormat[]) {PIX_FMT_GRAY8, PIX_FMT_NONE},
     .long_name      = NULL_IF_CONFIG_SMALL("Multicolor charset for Commodore 64, extended with 5th color (colram)"),

From 8d3348c2c4c3373bf6e1410659790d985c14b057 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 23 Feb 2012 10:47:30 +0100
Subject: [PATCH 33/40] snowenc: don't abuse input picture for storing
 information.

---
 libavcodec/snowenc.c | 45 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/libavcodec/snowenc.c b/libavcodec/snowenc.c
index 270af12cb8..77687fdee6 100644
--- a/libavcodec/snowenc.c
+++ b/libavcodec/snowenc.c
@@ -1604,6 +1604,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     SnowContext *s = avctx->priv_data;
     RangeCoder * const c= &s->c;
     AVFrame *pict = data;
+    AVFrame *pic = &s->new_picture;
     const int width= s->avctx->width;
     const int height= s->avctx->height;
     int level, orientation, plane_index, i, y;
@@ -1624,27 +1625,25 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
 
     s->m.picture_number= avctx->frame_number;
     if(avctx->flags&CODEC_FLAG_PASS2){
-        s->m.pict_type =
-        pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
-        s->keyframe= pict->pict_type==AV_PICTURE_TYPE_I;
+        s->m.pict_type = pic->pict_type = s->m.rc_context.entry[avctx->frame_number].new_pict_type;
+        s->keyframe = pic->pict_type == AV_PICTURE_TYPE_I;
         if(!(avctx->flags&CODEC_FLAG_QSCALE)) {
-            pict->quality= ff_rate_estimate_qscale(&s->m, 0);
-            if (pict->quality < 0)
+            pic->quality = ff_rate_estimate_qscale(&s->m, 0);
+            if (pic->quality < 0)
                 return -1;
         }
     }else{
         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
-        s->m.pict_type=
-        pict->pict_type= s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
+        s->m.pict_type = pic->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
     }
 
     if(s->pass1_rc && avctx->frame_number == 0)
-        pict->quality= 2*FF_QP2LAMBDA;
-    if(pict->quality){
-        s->qlog= qscale2qlog(pict->quality);
-        s->lambda = pict->quality * 3/2;
+        pic->quality = 2*FF_QP2LAMBDA;
+    if (pic->quality) {
+        s->qlog   = qscale2qlog(pic->quality);
+        s->lambda = pic->quality * 3/2;
     }
-    if(s->qlog < 0 || (!pict->quality && (avctx->flags & CODEC_FLAG_QSCALE))){
+    if (s->qlog < 0 || (!pic->quality && (avctx->flags & CODEC_FLAG_QSCALE))) {
         s->qlog= LOSSLESS_QLOG;
         s->lambda = 0;
     }//else keep previous frame's qlog until after motion estimation
@@ -1654,7 +1653,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
     s->m.current_picture_ptr= &s->m.current_picture;
     s->m.last_picture.f.pts = s->m.current_picture.f.pts;
     s->m.current_picture.f.pts = pict->pts;
-    if(pict->pict_type == AV_PICTURE_TYPE_P){
+    if(pic->pict_type == AV_PICTURE_TYPE_P){
         int block_width = (width +15)>>4;
         int block_height= (height+15)>>4;
         int stride= s->current_picture.linesize[0];
@@ -1679,7 +1678,7 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
         s->m.mb_stride=   s->m.mb_width+1;
         s->m.b8_stride= 2*s->m.mb_width+1;
         s->m.f_code=1;
-        s->m.pict_type= pict->pict_type;
+        s->m.pict_type = pic->pict_type;
         s->m.me_method= s->avctx->me_method;
         s->m.me.scene_change_score=0;
         s->m.flags= s->avctx->flags;
@@ -1703,13 +1702,13 @@ static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size,
 
 redo_frame:
 
-    if(pict->pict_type == AV_PICTURE_TYPE_I)
+    if (pic->pict_type == AV_PICTURE_TYPE_I)
         s->spatial_decomposition_count= 5;
     else
         s->spatial_decomposition_count= 5;
 
-    s->m.pict_type = pict->pict_type;
-    s->qbias= pict->pict_type == AV_PICTURE_TYPE_P ? 2 : 0;
+    s->m.pict_type = pic->pict_type;
+    s->qbias = pic->pict_type == AV_PICTURE_TYPE_P ? 2 : 0;
 
     ff_snow_common_init_after_header(avctx);
 
@@ -1742,12 +1741,12 @@ redo_frame:
             predict_plane(s, s->spatial_idwt_buffer, plane_index, 0);
 
             if(   plane_index==0
-               && pict->pict_type == AV_PICTURE_TYPE_P
+               && pic->pict_type == AV_PICTURE_TYPE_P
                && !(avctx->flags&CODEC_FLAG_PASS2)
                && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
                 ff_init_range_encoder(c, buf, buf_size);
                 ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
-                pict->pict_type= AV_PICTURE_TYPE_I;
+                pic->pict_type= AV_PICTURE_TYPE_I;
                 s->keyframe=1;
                 s->current_picture.key_frame=1;
                 goto redo_frame;
@@ -1773,7 +1772,7 @@ redo_frame:
                 ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
 
             if(s->pass1_rc && plane_index==0){
-                int delta_qlog = ratecontrol_1pass(s, pict);
+                int delta_qlog = ratecontrol_1pass(s, pic);
                 if (delta_qlog <= INT_MIN)
                     return -1;
                 if(delta_qlog){
@@ -1793,7 +1792,7 @@ redo_frame:
                     if(!QUANTIZE2)
                         quantize(s, b, b->ibuf, b->buf, b->stride, s->qbias);
                     if(orientation==0)
-                        decorrelate(s, b, b->ibuf, b->stride, pict->pict_type == AV_PICTURE_TYPE_P, 0);
+                        decorrelate(s, b, b->ibuf, b->stride, pic->pict_type == AV_PICTURE_TYPE_P, 0);
                     encode_subband(s, b, b->ibuf, b->parent ? b->parent->ibuf : NULL, b->stride, orientation);
                     assert(b->parent==NULL || b->parent->stride == b->stride*2);
                     if(orientation==0)
@@ -1820,7 +1819,7 @@ redo_frame:
             predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
         }else{
             //ME/MC only
-            if(pict->pict_type == AV_PICTURE_TYPE_I){
+            if(pic->pict_type == AV_PICTURE_TYPE_I){
                 for(y=0; y<h; y++){
                     for(x=0; x<w; x++){
                         s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x]=
@@ -1859,7 +1858,7 @@ redo_frame:
     s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
     s->m.current_picture.f.display_picture_number =
     s->m.current_picture.f.coded_picture_number   = avctx->frame_number;
-    s->m.current_picture.f.quality                = pict->quality;
+    s->m.current_picture.f.quality                = pic->quality;
     s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
     if(s->pass1_rc)
         if (ff_rate_estimate_qscale(&s->m, 0) < 0)

From 171273fec82d465e0c0d94e462f1073d5ff26cf3 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 23 Feb 2012 10:59:22 +0100
Subject: [PATCH 34/40] snowenc: switch to encode2().

---
 libavcodec/snowenc.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/libavcodec/snowenc.c b/libavcodec/snowenc.c
index 77687fdee6..6f0d43eb3b 100644
--- a/libavcodec/snowenc.c
+++ b/libavcodec/snowenc.c
@@ -1600,18 +1600,25 @@ static void calculate_visual_weight(SnowContext *s, Plane *p){
     }
 }
 
-static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
+static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
+                        const AVFrame *pict, int *got_packet)
+{
     SnowContext *s = avctx->priv_data;
     RangeCoder * const c= &s->c;
-    AVFrame *pict = data;
     AVFrame *pic = &s->new_picture;
     const int width= s->avctx->width;
     const int height= s->avctx->height;
-    int level, orientation, plane_index, i, y;
+    int level, orientation, plane_index, i, y, ret;
     uint8_t rc_header_bak[sizeof(s->header_state)];
     uint8_t rc_block_bak[sizeof(s->block_state)];
 
-    ff_init_range_encoder(c, buf, buf_size);
+    if (!pkt->data &&
+        (ret = av_new_packet(pkt, s->b_width*s->b_height*MB_SIZE*MB_SIZE*3 + FF_MIN_BUFFER_SIZE)) < 0) {
+        av_log(avctx, AV_LOG_ERROR, "Error getting output packet.\n");
+        return ret;
+    }
+
+    ff_init_range_encoder(c, pkt->data, pkt->size);
     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
 
     for(i=0; i<3; i++){
@@ -1744,7 +1751,7 @@ redo_frame:
                && pic->pict_type == AV_PICTURE_TYPE_P
                && !(avctx->flags&CODEC_FLAG_PASS2)
                && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
-                ff_init_range_encoder(c, buf, buf_size);
+                ff_init_range_encoder(c, pkt->data, pkt->size);
                 ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
                 pic->pict_type= AV_PICTURE_TYPE_I;
                 s->keyframe=1;
@@ -1777,7 +1784,7 @@ redo_frame:
                     return -1;
                 if(delta_qlog){
                     //reordering qlog in the bitstream would eliminate this reset
-                    ff_init_range_encoder(c, buf, buf_size);
+                    ff_init_range_encoder(c, pkt->data, pkt->size);
                     memcpy(s->header_state, rc_header_bak, sizeof(s->header_state));
                     memcpy(s->block_state, rc_block_bak, sizeof(s->block_state));
                     encode_header(s);
@@ -1873,7 +1880,12 @@ redo_frame:
 
     emms_c();
 
-    return ff_rac_terminate(c);
+    pkt->size = ff_rac_terminate(c);
+    if (avctx->coded_frame->key_frame)
+        pkt->flags |= AV_PKT_FLAG_KEY;
+    *got_packet = 1;
+
+    return 0;
 }
 
 static av_cold int encode_end(AVCodecContext *avctx)
@@ -1908,7 +1920,7 @@ AVCodec ff_snow_encoder = {
     .id             = CODEC_ID_SNOW,
     .priv_data_size = sizeof(SnowContext),
     .init           = encode_init,
-    .encode         = encode_frame,
+    .encode2        = encode_frame,
     .close          = encode_end,
     .long_name = NULL_IF_CONFIG_SMALL("Snow"),
     .priv_class     = &snowenc_class,

From ff311c091854a2cf2d49b5ecfa6759084fe643a7 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Thu, 23 Feb 2012 11:02:11 +0100
Subject: [PATCH 35/40] lavc: drop encode() support for video.

All video encoders have been converted to encode2(), all new encoders
should also use only encode2().
---
 libavcodec/utils.c | 48 ++++++++--------------------------------------
 1 file changed, 8 insertions(+), 40 deletions(-)

diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index c88f379570..67c9e36bba 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -1082,9 +1082,6 @@ int attribute_align_arg avcodec_encode_video(AVCodecContext *avctx, uint8_t *buf
 }
 #endif
 
-#define MAX_CODED_FRAME_SIZE(width, height)\
-    (8*(width)*(height) + FF_MIN_BUFFER_SIZE)
-
 int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx,
                                               AVPacket *avpkt,
                                               const AVFrame *frame,
@@ -1103,44 +1100,15 @@ int attribute_align_arg avcodec_encode_video2(AVCodecContext *avctx,
     if (av_image_check_size(avctx->width, avctx->height, 0, avctx))
         return AVERROR(EINVAL);
 
-    if (avctx->codec->encode2) {
-        *got_packet_ptr = 0;
-        ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
-        if (!ret) {
-            if (!*got_packet_ptr)
-                avpkt->size = 0;
-            else if (!(avctx->codec->capabilities & CODEC_CAP_DELAY))
-                avpkt->pts = avpkt->dts = frame->pts;
-        }
-    } else {
-        /* for compatibility with encoders not supporting encode2(), we need to
-           allocate a packet buffer if the user has not provided one or check
-           the size otherwise */
-        int buf_size = avpkt->size;
+    av_assert0(avctx->codec->encode2);
 
-        if (!user_packet)
-            buf_size = MAX_CODED_FRAME_SIZE(avctx->width, avctx->height);
-
-        if ((ret = ff_alloc_packet(avpkt, buf_size)))
-            return ret;
-
-        /* encode the frame */
-        ret = avctx->codec->encode(avctx, avpkt->data, avpkt->size, frame);
-        if (ret >= 0) {
-            if (!ret) {
-                /* no output. if the packet data was allocated by libavcodec,
-                   free it */
-                if (!user_packet)
-                    av_freep(&avpkt->data);
-            } else if (avctx->coded_frame) {
-                avpkt->pts    = avctx->coded_frame->pts;
-                avpkt->flags |= AV_PKT_FLAG_KEY*avctx->coded_frame->key_frame;
-            }
-
-            avpkt->size     = ret;
-            *got_packet_ptr = (ret > 0);
-            ret             = 0;
-        }
+    *got_packet_ptr = 0;
+    ret = avctx->codec->encode2(avctx, avpkt, frame, got_packet_ptr);
+    if (!ret) {
+        if (!*got_packet_ptr)
+            avpkt->size = 0;
+        else if (!(avctx->codec->capabilities & CODEC_CAP_DELAY))
+            avpkt->pts = avpkt->dts = frame->pts;
     }
 
     if (!ret)

From 7337484ed21d068184eff74fcdb14b886553159f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 9 Feb 2012 23:28:15 +0200
Subject: [PATCH 36/40] rtpenc: Add an option for not sending RTCP packets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavformat/rtpenc.c | 5 +++--
 libavformat/rtpenc.h | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
index 604c4a0552..66ea4ced92 100644
--- a/libavformat/rtpenc.c
+++ b/libavformat/rtpenc.c
@@ -388,8 +388,9 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt)
 
     rtcp_bytes = ((s->octet_count - s->last_octet_count) * RTCP_TX_RATIO_NUM) /
         RTCP_TX_RATIO_DEN;
-    if (s->first_packet || ((rtcp_bytes >= RTCP_SR_SIZE) &&
-                           (ff_ntp_time() - s->last_rtcp_ntp_time > 5000000))) {
+    if ((s->first_packet || ((rtcp_bytes >= RTCP_SR_SIZE) &&
+                            (ff_ntp_time() - s->last_rtcp_ntp_time > 5000000))) &&
+        !(s->flags & FF_RTP_FLAG_SKIP_RTCP)) {
         rtcp_send_sr(s1, ff_ntp_time());
         s->last_octet_count = s->octet_count;
         s->first_packet = 0;
diff --git a/libavformat/rtpenc.h b/libavformat/rtpenc.h
index ff423a55d1..9ccc8e71fa 100644
--- a/libavformat/rtpenc.h
+++ b/libavformat/rtpenc.h
@@ -65,11 +65,13 @@ typedef struct RTPMuxContext RTPMuxContext;
 
 #define FF_RTP_FLAG_MP4A_LATM 1
 #define FF_RTP_FLAG_RFC2190   2
+#define FF_RTP_FLAG_SKIP_RTCP 4
 
 #define FF_RTP_FLAG_OPTS(ctx, fieldname) \
     { "rtpflags", "RTP muxer flags", offsetof(ctx, fieldname), AV_OPT_TYPE_FLAGS, {.dbl = 0}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" }, \
     { "latm", "Use MP4A-LATM packetization instead of MPEG4-GENERIC for AAC", 0, AV_OPT_TYPE_CONST, {.dbl = FF_RTP_FLAG_MP4A_LATM}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" }, \
     { "rfc2190", "Use RFC 2190 packetization instead of RFC 4629 for H.263", 0, AV_OPT_TYPE_CONST, {.dbl = FF_RTP_FLAG_RFC2190}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" }, \
+    { "skip_rtcp", "Don't send RTCP sender reports", 0, AV_OPT_TYPE_CONST, {.dbl = FF_RTP_FLAG_SKIP_RTCP}, INT_MIN, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM, "rtpflags" }, \
 
 void ff_rtp_send_data(AVFormatContext *s1, const uint8_t *buf1, int len, int m);
 

From f553462041096d5d2afd9a8841a7af50df5c2540 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 23 Feb 2012 11:54:13 +0200
Subject: [PATCH 37/40] rtpenc: Move max_packet_size to a context variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is in preparation for exposing this via an avoption.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavformat/rtpenc.c | 12 ++++++------
 libavformat/rtpenc.h |  1 +
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
index 66ea4ced92..bdbe411192 100644
--- a/libavformat/rtpenc.c
+++ b/libavformat/rtpenc.c
@@ -82,7 +82,7 @@ static int is_supported(enum CodecID id)
 static int rtp_write_header(AVFormatContext *s1)
 {
     RTPMuxContext *s = s1->priv_data;
-    int max_packet_size, n;
+    int n;
     AVStream *st;
 
     if (s1->nb_streams != 1) {
@@ -109,16 +109,16 @@ static int rtp_write_header(AVFormatContext *s1)
         s->first_rtcp_ntp_time = (s1->start_time_realtime / 1000) * 1000 +
                                  NTP_OFFSET_US;
 
-    max_packet_size = s1->pb->max_packet_size;
-    if (max_packet_size <= 12) {
-        av_log(s1, AV_LOG_ERROR, "Max packet size %d too low\n", max_packet_size);
+    s->max_packet_size = s1->pb->max_packet_size;
+    if (s->max_packet_size <= 12) {
+        av_log(s1, AV_LOG_ERROR, "Max packet size %d too low\n", s->max_packet_size);
         return AVERROR(EIO);
     }
-    s->buf = av_malloc(max_packet_size);
+    s->buf = av_malloc(s->max_packet_size);
     if (s->buf == NULL) {
         return AVERROR(ENOMEM);
     }
-    s->max_payload_size = max_packet_size - 12;
+    s->max_payload_size = s->max_packet_size - 12;
 
     s->max_frames_per_packet = 0;
     if (s1->max_delay) {
diff --git a/libavformat/rtpenc.h b/libavformat/rtpenc.h
index 9ccc8e71fa..2bb2b815c3 100644
--- a/libavformat/rtpenc.h
+++ b/libavformat/rtpenc.h
@@ -34,6 +34,7 @@ struct RTPMuxContext {
     uint32_t timestamp;
     uint32_t base_timestamp;
     uint32_t cur_timestamp;
+    int max_packet_size;
     int max_payload_size;
     int num_frames;
 

From ba605cef7961ee699c893d1a3b5c9730f0a37b6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 23 Feb 2012 11:56:15 +0200
Subject: [PATCH 38/40] rtpenc: Expose the max packet size via an avoption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows opting for a lower MTU than what the AVIOContext
indicated, and allows writing into outputs that don't indicate
an MTU at all (such as plain files, which is useful for testing).

This also allows querying for the MTU via the avoption.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavformat/rtpenc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
index bdbe411192..e4ef0fc92b 100644
--- a/libavformat/rtpenc.c
+++ b/libavformat/rtpenc.c
@@ -33,6 +33,7 @@
 static const AVOption options[] = {
     FF_RTP_FLAG_OPTS(RTPMuxContext, flags)
     { "payload_type", "Specify RTP payload type", offsetof(RTPMuxContext, payload_type), AV_OPT_TYPE_INT, {.dbl = -1 }, -1, 127, AV_OPT_FLAG_ENCODING_PARAM },
+    { "max_packet_size", "Max packet size", offsetof(RTPMuxContext, max_packet_size), AV_OPT_TYPE_INT, {.dbl = 0 }, 0, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
     { NULL },
 };
 
@@ -109,7 +110,12 @@ static int rtp_write_header(AVFormatContext *s1)
         s->first_rtcp_ntp_time = (s1->start_time_realtime / 1000) * 1000 +
                                  NTP_OFFSET_US;
 
-    s->max_packet_size = s1->pb->max_packet_size;
+    /* Clamp a user-set max_packet_size to the AVIOContext limit, if any. */
+    if (s->max_packet_size && s1->pb->max_packet_size)
+        s->max_packet_size = FFMIN(s->max_packet_size,
+                                   s1->pb->max_packet_size);
+    else if (!s->max_packet_size)
+        s->max_packet_size = s1->pb->max_packet_size;
     if (s->max_packet_size <= 12) {
         av_log(s1, AV_LOG_ERROR, "Max packet size %d too low\n", s->max_packet_size);
         return AVERROR(EIO);

From 480b133e6f79c470aff0f84d9ed3648d37c32b03 Mon Sep 17 00:00:00 2001
From: Derek Buitenhuis <derek.buitenhuis@gmail.com>
Date: Thu, 23 Feb 2012 10:55:35 -0500
Subject: [PATCH 39/40] wavpack: Don't shift minclip/maxclip

Since we are clipping before we shift the values to
16 or 32 bits, we should not shift the min/max clip
values to compensate.

Fixes 8 and 24 bit lossy decoding.

Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/wavpack.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/wavpack.c b/libavcodec/wavpack.c
index fb0e0b2fa7..6eb913fa64 100644
--- a/libavcodec/wavpack.c
+++ b/libavcodec/wavpack.c
@@ -813,8 +813,8 @@ static int wavpack_decode_block(AVCodecContext *avctx, int block_no,
     s->hybrid         =   s->frame_flags & WV_HYBRID_MODE;
     s->hybrid_bitrate =   s->frame_flags & WV_HYBRID_BITRATE;
     s->post_shift     = bpp * 8 - orig_bpp + ((s->frame_flags >> 13) & 0x1f);
-    s->hybrid_maxclip = (( 1LL << (orig_bpp - 1)) - 1) >> s->post_shift;
-    s->hybrid_minclip = ((-1LL << (orig_bpp - 1)))     >> s->post_shift;
+    s->hybrid_maxclip =  (1LL << (orig_bpp - 1)) - 1;
+    s->hybrid_minclip = -(1LL << (orig_bpp - 1)); /* negate after shifting: << on a negative value is UB */
     s->CRC            = AV_RL32(buf); buf += 4;
     if (wc->mkv_mode)
         buf += 4; //skip block size;

From 31632e73f47d25e2077fce729571259ee6354854 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Thu, 23 Feb 2012 11:53:27 -0800
Subject: [PATCH 40/40] swf: check return values for av_get/new_packet().

Prevents crashers when using the packet if allocation failed.

Found-by: Mateusz "j00ru" Jurczyk and Gynvael Coldwind
CC: libav-stable@libav.org
---
 libavformat/swfdec.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/libavformat/swfdec.c b/libavformat/swfdec.c
index 1bcb24919d..c194310a37 100644
--- a/libavformat/swfdec.c
+++ b/libavformat/swfdec.c
@@ -84,7 +84,7 @@ static int swf_read_packet(AVFormatContext *s, AVPacket *pkt)
     SWFContext *swf = s->priv_data;
     AVIOContext *pb = s->pb;
     AVStream *vst = NULL, *ast = NULL, *st = 0;
-    int tag, len, i, frame, v;
+    int tag, len, i, frame, v, res;
 
     for(;;) {
         uint64_t pos = avio_tell(pb);
@@ -147,7 +147,8 @@ static int swf_read_packet(AVFormatContext *s, AVPacket *pkt)
                 st = s->streams[i];
                 if (st->codec->codec_type == AVMEDIA_TYPE_VIDEO && st->id == ch_id) {
                     frame = avio_rl16(pb);
-                    av_get_packet(pb, pkt, len-2);
+                    if ((res = av_get_packet(pb, pkt, len-2)) < 0)
+                        return res;
                     pkt->pos = pos;
                     pkt->pts = frame;
                     pkt->stream_index = st->index;
@@ -160,9 +161,11 @@ static int swf_read_packet(AVFormatContext *s, AVPacket *pkt)
                 if (st->codec->codec_type == AVMEDIA_TYPE_AUDIO && st->id == -1) {
             if (st->codec->codec_id == CODEC_ID_MP3) {
                 avio_skip(pb, 4);
-                av_get_packet(pb, pkt, len-4);
+                if ((res = av_get_packet(pb, pkt, len-4)) < 0)
+                    return res;
             } else { // ADPCM, PCM
-                av_get_packet(pb, pkt, len);
+                if ((res = av_get_packet(pb, pkt, len)) < 0)
+                    return res;
             }
             pkt->pos = pos;
             pkt->stream_index = st->index;
@@ -187,7 +190,8 @@ static int swf_read_packet(AVFormatContext *s, AVPacket *pkt)
                 st = vst;
             }
             avio_rl16(pb); /* BITMAP_ID */
-            av_new_packet(pkt, len-2);
+            if ((res = av_new_packet(pkt, len-2)) < 0)
+                return res;
             avio_read(pb, pkt->data, 4);
             if (AV_RB32(pkt->data) == 0xffd8ffd9 ||
                 AV_RB32(pkt->data) == 0xffd9ffd8) {