;
; This code implements the basic inverse DCT (IDCT) on an 8x8 block of
; 16-bit coefficients.  Basically, it's the same as in the JPEG engine,
; with the sole difference that it's inlined and register-wise a little
; bit more optimized there.
;
; This is a complete rewrite in assembler. Heavy stuff. Lotsa work.
;
; Michael Rausch  14-4-94  1:14:00
;
;
; The whole code handles D-Frames not very well, but I'll fix it one day.
;
; Structure
; ---------
;   pass 1 (idct1)  works on the 8 rows,    results scaled up by PASS1_BITS
;   pass 2 (idct2)  works on the 8 columns, results descaled to final values
;
; Both passes share the "odd part" code.  That code is specialised on which
; of the odd coefficients 7, 5, 3 and 1 are non-zero; the four digits in
; the labels odd1_7531 give that pattern (1 = non-zero, 0 = zero).  Every
; odd-part variant ends with "jmp (a5)", where a5 points to compose1 in
; pass 1 and to compose2 in pass 2.
;
; Registers
; ---------
;   in:       a0 = pointer to the block (64 words, row after row)
;   scratch:  d0-d7, a1, a3, a5
;   The entry point @j_rev_dct saves "ri_regs" (defined outside this file)
;   and jrevdct restores them again before returning.
;

DCTSIZE                 EQU     8

PASS1_BITS              EQU     2
CONST_BITS              EQU     13

FIX_0_298631336         EQU     2446            ;1  +  $98e   100110001110
_FIX_0_390180644        EQU     -3196           ;2  -  $c7c
FIX_0_541196100         EQU     4433            ;3  +  $1151  u
FIX_0_765366865         EQU     6270            ;4  +  $187e  u
_FIX_0_899976223        EQU     -7373           ;5  -  $1ccd
FIX_1_175875602         EQU     9633            ;6  +  $25a1
FIX_1_501321110         EQU     12299           ;7  +  $300b
_FIX_1_847759065        EQU     -15137          ;8  -  $3b21  u
_FIX_1_961570560        EQU     -16069          ;9  -  $3ec5
FIX_2_053119869         EQU     16819           ;10 +  $41b3
_FIX_2_562915447        EQU     -20995          ;11 -  $5203
FIX_3_072711026         EQU     25172           ;12 +  $6254

; FIX_1_847759065-FIX_0_765366865 = 2* FIX_0_541196100  (up to rounding)

; Stack scratch used by pass 1 for the even-part results tmp10..tmp13.
;
; Stack picture while a row is being processed:
;     0(sp)  saved loop counter (d7)
;     4(sp)  saved block pointer (a0)
;     8(sp)  tmp10, tmp11, tmp12, tmp13      <- JRD_TMPOFS
;    24(sp)  registers pushed by @j_rev_dct
;
; The scratch offset used to be 12, which made the tmp13 access hit the
; first longword above the reserved area (the lowest register saved by
; @j_rev_dct).  It has to be 8: just above the two pushed longwords.
JRD_TMPSIZE             EQU     4*4             ; four longwords of scratch
JRD_TMPOFS              EQU     2*4             ; skip saved d7 and saved a0


; **************************************************************************
;
; jrevdct -- in-place IDCT of the block at a0.
;
; Must be entered from @j_rev_dct (which pushes ri_regs); the routine pops
; ri_regs itself before the rts.
;

jrevdct:
        sub.w   #JRD_TMPSIZE,sp                 ; room for tmp10..tmp13
        move.l  a0,-(sp)                        ; block start, reloaded for pass 2

        lea     compose1(pc),a5                 ; odd part returns to compose1
        moveq   #DCTSIZE-1,d7                   ; 8 rows

idct1:
        move.l  d7,-(sp)                        ; d7 is scratch inside the loop

; Fetch coefficients 1..7 of this row and test whether all of them are zero.
        lea     2(a0),a1
        move.l  (a1)+,d2                        ; d2.w = coefficient 2
        move.l  d2,d0
        move.l  (a1)+,d4                        ; d4.w = coefficient 4
        move.l  (a1)+,d3                        ; d3.w = coefficient 6
        or.l    d4,d0
        or.w    (a1)+,d0                        ; coefficient 7
        or.l    d3,d0
        bne.s   idct1_no_ac0

; DC only: all 8 outputs are the DC value scaled up by PASS1_BITS.
        move.w  (a0),d0
        lsl.w   #PASS1_BITS,d0
        move.w  d0,d1
        swap    d0
        move.w  d1,d0                           ; same word in both halves
        REPT    4
        move.l  d0,(a0)+
        ENDR
        bra     idct1_next

idct1_no_ac0:
; Even part, built from coefficients 0, 2, 4 and 6.
        move.w  d2,d1                           ; 2
        add.w   d3,d1                           ; 6
        muls    #FIX_0_541196100,d1
        muls    #_FIX_1_847759065,d3
        add.l   d1,d3
        muls    #FIX_0_765366865,d2
        add.l   d1,d2

        move.w  (a0),d0
        ext.l   d0                              ; 0
        ext.l   d4                              ; 4
        move.l  d0,d5
        sub.l   d4,d5
        add.l   d0,d4

; Shift left by CONST_BITS (5+8) and add the rounding constant for the
; later shift right by CONST_BITS-PASS1_BITS on the way (4<<8 = 1<<10).
        lsl.l   #5,d4
        lsl.l   #5,d5
        addq.l  #1<<2,d4
        addq.l  #1<<2,d5
        lsl.l   #8,d4
        lsl.l   #8,d5

        lea     JRD_TMPOFS(sp),a1               ; top + 2 longs -> the 16 bytes of space on the stack

        move.l  d4,d0
        add.l   d2,d4
        move.l  d4,(a1)+                        ; tmp10
        sub.l   d2,d0

        move.l  d5,d1
        add.l   d3,d5
        move.l  d5,(a1)+                        ; tmp11
        sub.l   d3,d1
        move.l  d1,(a1)+                        ; tmp12
        move.l  d0,(a1)+                        ; tmp13

; --------------------------------------------------------------------------
; Odd part.  Dispatch on which of the coefficients 7, 5, 3, 1 are non-zero.
; The o....  labels are the pass-1 (row) dispatchers, the o2.... labels are
; the pass-2 (column) dispatchers; both end up in the same odd1_.... code.
;
; On "jmp (a5)" the odd-part terms are left in d1, d2, d5 and d6; the
; compose code combines them with the even part like this:
;       d6 -> outputs 0 and 7           d5 -> outputs 1 and 6
;       d2 -> outputs 2 and 5           d1 -> outputs 3 and 4
; --------------------------------------------------------------------------

odd_part1:
        move.w  7*2(a0),d1                      ;7
        beq     o0xxx

o1xxx:
        move.w  5*2(a0),d2                      ;5
        beq     o10xx

o11xx:
        move.w  3*2(a0),d3                      ;3
        beq     o110x

o111x:
        move.w  1*2(a0),d4                      ;1
        bne.s   odd1_1111

                                                ; 7531
odd1_1110:
        move.w  d2,d6
        move.w  d1,d0
        moveq   #0,d4
        bra.s   abk_2

                                                ; 7531
odd1_1111:
        move.w  d2,d6
        add.w   d4,d6
        move.w  d1,d0
        add.w   d4,d0
        muls    #FIX_1_501321110,d4

abk_2:
        move.w  d1,d5
        add.w   d3,d5
        move.w  d5,d7
        add.w   d6,d7
        muls    #FIX_1_175875602,d7
        muls    #_FIX_1_961570560,d5
        muls    #_FIX_0_390180644,d6
        add.l   d7,d5
        add.l   d7,d6

        move.w  d2,d7
        add.w   d3,d7
        muls    #FIX_0_298631336,d1
        muls    #FIX_2_053119869,d2
        muls    #FIX_3_072711026,d3
        muls    #_FIX_0_899976223,d0
        muls    #_FIX_2_562915447,d7

        add.l   d0,d1
        add.l   d7,d2
        add.l   d5,d1
        add.l   d6,d2
        add.l   d3,d5
        add.l   d4,d6
        add.l   d7,d5
        add.l   d0,d6
        jmp     (a5)


o2110x:
        move.w  1*DCTSIZE*2(a0),d4              ;1
        bne.s   odd1_1101

                                                ; 7531
odd1_1100:
        move.w  d2,d6
        move.w  d1,d3
        moveq   #0,d4
        bra.s   abk_3

o110x:
        move.w  1*2(a0),d4                      ;1
        beq.s   odd1_1100

                                                ; 7531
odd1_1101:
        move.w  d2,d6
        move.w  d1,d3
        add.w   d4,d6
        add.w   d4,d3
        muls    #FIX_1_501321110,d4

abk_3:
        move.w  d1,d5
        move.w  d5,d7
        add.w   d6,d7
        muls    #FIX_1_175875602,d7
        muls    #_FIX_1_961570560,d5
        muls    #_FIX_0_390180644,d6
        add.l   d7,d5
        add.l   d7,d6

        move.w  d2,d0
        muls    #FIX_0_298631336,d1
        muls    #FIX_2_053119869,d2
        muls    #_FIX_0_899976223,d3
        muls    #_FIX_2_562915447,d0

        add.l   d3,d1
        add.l   d0,d2
        add.l   d5,d1
        add.l   d6,d2
        add.l   d4,d6
        add.l   d0,d5
        add.l   d3,d6
        jmp     (a5)


o10xx:
        move.w  3*2(a0),d3                      ;3
        beq     o100x

o101x:
        move.w  1*2(a0),d4                      ;1
        beq.s   odd1_1010

                                                ; 7531
odd1_1011:
        move.w  d1,d5
        add.w   d3,d5
        move.w  d1,d0
        move.w  d4,d6
        add.w   d4,d0
        muls    #FIX_1_501321110,d4
        move.w  d5,d7
        add.w   d6,d7
        muls    #_FIX_0_390180644,d6

abk_4:
        muls    #FIX_1_175875602,d7
        muls    #_FIX_1_961570560,d5
        add.l   d7,d6
        add.l   d7,d5

        move.w  d3,d7
        muls    #FIX_0_298631336,d1
        muls    #FIX_3_072711026,d3
        muls    #_FIX_0_899976223,d0
        muls    #_FIX_2_562915447,d7

        add.l   d0,d1
        move.l  d6,d2
        add.l   d5,d1
        add.l   d7,d2
        add.l   d3,d5
        add.l   d4,d6
        add.l   d7,d5
        add.l   d0,d6
        jmp     (a5)


o210xx:
        move.w  3*DCTSIZE*2(a0),d3              ;3
        beq     o2100x

o2101x:
        move.w  1*DCTSIZE*2(a0),d4              ;1
        bne.s   odd1_1011

                                                ; 7531
odd1_1010:
        move.w  d1,d5
        add.w   d3,d5
        move.w  d1,d0
        moveq   #0,d4
        move.w  d5,d7
        moveq   #0,d6
        bra.s   abk_4


o100x:
        move.w  1*2(a0),d4                      ;1
        beq.s   odd1_1000

                                                ; 7531
odd1_1001:
        move.w  d1,d0
        add.w   d4,d0
        move.w  d1,d5
        move.w  d4,d6
        move.w  d0,d7
        muls    #FIX_1_175875602,d7
        muls    #_FIX_1_961570560,d5
        muls    #_FIX_0_390180644,d6
        add.l   d7,d5
        add.l   d7,d6

        muls    #FIX_0_298631336,d1
        muls    #FIX_1_501321110,d4
        muls    #_FIX_0_899976223,d0

        add.l   d0,d1
        add.l   d5,d1
        move.l  d6,d2
        add.l   d4,d6
        add.l   d0,d6
        jmp     (a5)


o2100x:
        move.w  1*DCTSIZE*2(a0),d4              ;1
        bne.s   odd1_1001

                                                ; 7531
odd1_1000:
; Only coefficient 7 is set: each term is one multiply by a folded constant.
        move.w  d1,d2
        move.w  d1,d5
        move.w  d1,d6
        muls    #FIX_1_175875602,d2
        muls    #FIX_1_175875602+_FIX_0_899976223,d6
        muls    #FIX_1_175875602+_FIX_1_961570560,d5
        muls    #FIX_1_175875602+_FIX_0_899976223+_FIX_1_961570560+FIX_0_298631336,d1
        jmp     (a5)


o0xxx:
        move.w  5*2(a0),d2                      ;5
        beq     o00xx

o01xx:
        move.w  3*2(a0),d3                      ;3
        beq     o010x

o011x:
        move.w  1*2(a0),d4                      ;1
        beq.s   odd1_0110

                                                ; 7531
odd1_0111:                                      ; opt8
        move.w  d2,d6
        add.w   d4,d6
        move.w  d4,d1
        muls    #FIX_1_501321110,d4
        muls    #_FIX_0_899976223,d1

abk_1:
        move.w  d2,d0
        add.w   d3,d0
        move.w  d3,d5
        move.w  d5,d7
        add.w   d6,d7
        muls    #FIX_1_175875602,d7
        muls    #_FIX_1_961570560,d5
        muls    #_FIX_0_390180644,d6            ; ???? 2
        add.l   d7,d5
        add.l   d7,d6

        muls    #FIX_2_053119869,d2
        muls    #FIX_3_072711026,d3
        muls    #_FIX_2_562915447,d0

        add.l   d0,d2
        add.l   d6,d2
        add.l   d4,d6
        add.l   d1,d6
        add.l   d5,d1
        add.l   d3,d5
        add.l   d0,d5
        jmp     (a5)


o20xxx:
        move.w  5*DCTSIZE*2(a0),d2              ;5
        beq     o200xx

o201xx:
        move.w  3*DCTSIZE*2(a0),d3              ;3
        beq.s   o2010x

o2011x:
        move.w  1*DCTSIZE*2(a0),d4              ;1
        bne.s   odd1_0111

                                                ; 7531
odd1_0110:
        move.w  d2,d6
        moveq.l #0,d1
        moveq.l #0,d4
        bra.s   abk_1


o010x:
        move.w  1*2(a0),d4                      ;1
        beq.s   odd1_0100

                                                ; 7531
odd1_0101:
        move.w  d2,d6
        move.w  d2,d7
        add.w   d4,d6
        move.w  d4,d1
        move.w  d6,d5
        muls    #FIX_1_175875602,d5
        muls    #_FIX_0_390180644+FIX_1_175875602,d6

        muls    #FIX_2_053119869+_FIX_2_562915447,d2
        muls    #FIX_1_501321110,d4
        muls    #_FIX_0_899976223,d1
        muls    #_FIX_2_562915447,d7

        add.l   d6,d2
        add.l   d1,d6
        add.l   d5,d1
        add.l   d7,d5
        add.l   d4,d6
        jmp     (a5)


o2010x:
        move.w  1*DCTSIZE*2(a0),d4              ;1
        bne.s   odd1_0101

                                                ; 7531
odd1_0100:
; Only coefficient 5 is set.
        move.w  d2,d6
        move.w  d2,d1
        move.w  d2,d5
        muls    #FIX_1_175875602,d1
        muls    #FIX_1_175875602+_FIX_2_562915447,d5
        muls    #FIX_1_175875602+_FIX_0_390180644,d6
        muls    #FIX_1_175875602+_FIX_2_562915447+_FIX_0_390180644+FIX_2_053119869,d2
        jmp     (a5)


o00xx:
        move.w  3*2(a0),d5                      ;3
        beq.s   o000x

o001x:
        move.w  1*2(a0),d4                      ;1
        beq.s   odd1_0010

                                                ; 7531
odd1_0011:                                      ; opt12
        move.w  d5,d2
        move.w  d5,d3
        move.w  d4,d1
        move.w  d4,d6
        move.w  d3,d7
        add.w   d4,d7
        muls    #FIX_1_175875602,d7
        muls    #_FIX_1_961570560,d5
        muls    #_FIX_0_390180644,d6
        add.l   d7,d5
        add.l   d7,d6

        muls    #_FIX_2_562915447+FIX_3_072711026,d3
        muls    #_FIX_0_899976223+FIX_1_501321110,d4
        muls    #_FIX_0_899976223,d1
        muls    #_FIX_2_562915447,d2

        add.l   d5,d1
        add.l   d6,d2
        add.l   d3,d5
        add.l   d4,d6
        jmp     (a5)


o200xx:
        move.w  3*DCTSIZE*2(a0),d5              ;3
        beq     o2000x

o2001x:
        move.w  1*DCTSIZE*2(a0),d4              ;1
        bne.s   odd1_0011

                                                ; 7531
odd1_0010:
; Only coefficient 3 is set.
        move.w  d5,d6
        move.w  d5,d2
        move.w  d5,d1
        muls    #FIX_1_175875602,d6
        muls    #FIX_1_175875602+_FIX_2_562915447,d2
        muls    #FIX_1_175875602+_FIX_1_961570560,d1
        muls    #FIX_1_175875602+_FIX_2_562915447+_FIX_1_961570560+FIX_3_072711026,d5
        jmp     (a5)


o000x:
        move.w  1*2(a0),d6                      ;1
        beq.s   odd1_0000

                                                ; 7531
odd1_0001:                                      ; opt 14
; Only coefficient 1 is set.
        move.w  d6,d5
        move.w  d6,d1
        move.w  d6,d2
        muls    #FIX_1_175875602,d5
        muls    #FIX_1_175875602+_FIX_0_899976223,d1
        muls    #FIX_1_175875602+_FIX_0_390180644,d2
        muls    #FIX_1_175875602+_FIX_0_899976223+_FIX_0_390180644+FIX_1_501321110,d6
        jmp     (a5)


; priority: 14 12 8 0

                                                ; 7531
odd1_0000:
; Pass 1 only: no odd coefficient at all, so the outputs are just the even
; part, mirrored: tmp10 tmp11 tmp12 tmp13 tmp13 tmp12 tmp11 tmp10.
        moveq   #CONST_BITS-PASS1_BITS,d7       ; optimized compose !
        lea     JRD_TMPOFS(sp),a1

        move.l  (a1)+,d0                        ; tmp10
        lsl.l   #16-(CONST_BITS-PASS1_BITS),d0  ; descaled value -> high word
        move.l  (a1)+,d1                        ; tmp11
        lsr.l   d7,d1                           ; descaled value -> low word
        move.w  d1,d0                           ; d0 = tmp10:tmp11

        move.l  (a1)+,d2                        ; tmp12
        lsl.l   #16-(CONST_BITS-PASS1_BITS),d2
        move.l  (a1)+,d3                        ; tmp13
        lsr.l   d7,d3
        move.w  d3,d2                           ; d2 = tmp12:tmp13

        move.l  d0,(a0)+                        ; outputs 0,1
        swap    d0
        move.l  d2,(a0)+                        ; outputs 2,3
        swap    d2
        move.l  d2,(a0)+                        ; outputs 4,5
        move.l  d0,(a0)+                        ; outputs 6,7

        move.l  (sp)+,d7
        dbra    d7,idct1
        bra.s   idct1_ready


; keep 1 2 5 6
compose1:
; Pass 1: combine even part (tmp10..tmp13 on the stack) with the odd part
; (d6, d5, d2, d1), descale by CONST_BITS-PASS1_BITS and store the row as
; four longwords of two packed results each.
        moveq   #CONST_BITS-PASS1_BITS,d7
        lea     JRD_TMPOFS(sp),a1

        move.l  (a1)+,d4                        ; tmp10
        sub.l   d6,d4                           ; d4 = tmp10 - odd  (output 7)
        add.l   d6,d6
        add.l   d4,d6                           ; d6 = tmp10 + odd  (output 0)
        lsl.l   #16-(CONST_BITS-PASS1_BITS),d6

        move.l  (a1)+,d3                        ; tmp11
        sub.l   d5,d3                           ; output 6
        add.l   d5,d5
        add.l   d3,d5                           ; output 1
        lsr.l   d7,d5
        move.w  d5,d6
        move.l  d6,(a0)+                        ; outputs 0,1

        move.l  (a1)+,d6                        ; tmp12
        sub.l   d2,d6                           ; output 5
        add.l   d2,d2
        add.l   d6,d2                           ; output 2
        lsl.l   #16-(CONST_BITS-PASS1_BITS),d2

        move.l  (a1)+,d5                        ; tmp13
        sub.l   d1,d5                           ; output 4
        add.l   d1,d1
        add.l   d5,d1                           ; output 3
        lsr.l   d7,d1
        move.w  d1,d2
        move.l  d2,(a0)+                        ; outputs 2,3

        lsl.l   #16-(CONST_BITS-PASS1_BITS),d5
        lsr.l   d7,d6
        move.w  d6,d5
        move.l  d5,(a0)+                        ; outputs 4,5

        lsl.l   #16-(CONST_BITS-PASS1_BITS),d3
        lsr.l   d7,d4
        move.w  d4,d3
        move.l  d3,(a0)+                        ; outputs 6,7

idct1_next:
        move.l  (sp)+,d7
        dbra    d7,idct1

idct1_ready:

; *******************************************************
;
; Pass 2: columns.  a0 walks along the first row (one word per column);
; the column elements are DCTSIZE*2 bytes apart.
;

        move.l  (sp)+,a0                        ; back to the start of the block

        lea     compose2(pc),a5                 ; odd part returns to compose2
        moveq   #DCTSIZE-1,d7                   ; 8 columns

idct2:
        move.l  d7,-(sp)                        ; d7 is scratch inside the loop

odd_part2:
        move.w  7*DCTSIZE*2(a0),d1              ;7
        beq     o20xxx

o21xxx:
        move.w  5*DCTSIZE*2(a0),d2              ;5
        beq     o210xx

o211xx:
        move.w  3*DCTSIZE*2(a0),d3              ;3
        beq     o2110x

o2111x:
        move.w  1*DCTSIZE*2(a0),d4              ;1
        beq     odd1_1110
        bra     odd1_1111


o2000x:
        move.w  1*DCTSIZE*2(a0),d6              ;1
        bne     odd1_0001

odd0_0000:
; Pass 2 with no odd coefficient: even part only, each result is stored
; into the two mirrored positions of the column.  The multipliers are
; pre-divided by 4 to match the CONST_BITS-2 shift used below.
        move.w  2*DCTSIZE*2(a0),d2
        move.w  4*DCTSIZE*2(a0),d4
        move.w  6*DCTSIZE*2(a0),d3

        move.w  d2,d0
        add.w   d3,d0
        muls    #FIX_0_541196100/4,d0
        muls    #_FIX_1_847759065/4,d3
        add.l   d0,d3
        muls    #FIX_0_765366865/4,d2
        add.l   d0,d2

        move.w  (a0),d0
        add.w   #1<<(PASS1_BITS+3-1),d0         ; precalc from the descaling part below
        ext.l   d4
        ext.l   d0
        move.l  d0,d5
        sub.l   d4,d5
        add.l   d0,d4

        moveq   #CONST_BITS-2,d0
        lsl.l   d0,d4
        lsl.l   d0,d5

        move.l  d4,d0
        add.l   d2,d4
        swap    d4                              ; result is the high word
        move.w  d4,(a0)+                        ; a0 now points to the next column
        sub.l   d2,d0
        move.w  d4,7*DCTSIZE*2-2(a0)            ; -2: a0 has already been advanced
        swap    d0
        move.w  d0,3*DCTSIZE*2-2(a0)
        move.l  d5,d4
        move.w  d0,4*DCTSIZE*2-2(a0)

        add.l   d3,d5
        swap    d5
        sub.l   d3,d4
        move.w  d5,1*DCTSIZE*2-2(a0)
        swap    d4
        move.w  d5,6*DCTSIZE*2-2(a0)
        move.w  d4,2*DCTSIZE*2-2(a0)
        move.w  d4,5*DCTSIZE*2-2(a0)

        move.l  (sp)+,d7
        dbra    d7,idct2
        bra     idct2_ready


compose2:
; Pass 2: compute the even part of the column, combine it with the odd
; part (d6, d5, d2, d1) and store the high words of the sums/differences.
        move.w  2*DCTSIZE*2(a0),d3
        move.w  4*DCTSIZE*2(a0),d4
        move.w  6*DCTSIZE*2(a0),d7

        move.w  d3,d0
        add.w   d7,d0
        muls    #FIX_0_541196100,d0
        muls    #_FIX_1_847759065,d7
        add.l   d0,d7
        muls    #FIX_0_765366865,d3
        add.l   d0,d3

        asr.l   #2,d7
        asr.l   #2,d3
        move.l  d7,a3                           ; park it, d7 is needed again

        move.w  (a0),d0
        add.w   #1<<(PASS1_BITS+3-1),d0         ; precalc from the descaling part below
        ext.l   d4
        ext.l   d0
        move.l  d0,d7
        sub.l   d4,d7
        add.l   d0,d4

        moveq   #CONST_BITS-2,d0
        lsl.l   d0,d4
        lsl.l   d0,d7

; Bring the odd part to the same scale as the even part.
        asr.l   #2,d6
        asr.l   #2,d5
        asr.l   #2,d2
        asr.l   #2,d1

        move.l  d4,d0
        add.l   d3,d4                           ; tmp10
        sub.l   d3,d0                           ; tmp13
        move.l  d7,d3
        add.l   a3,d7                           ; tmp11
        sub.l   a3,d3                           ; tmp12

        sub.l   d6,d4
        add.l   d6,d6
        add.l   d4,d6
        swap    d6
;       moveq   #CONST_BITS+PASS1_BITS+3 -2 ,d6
;       asr.l   d6,d3
        move.w  d6,(a0)+                        ; output 0; a0 -> next column
        swap    d4
        move.w  d4,7*DCTSIZE*2-2(a0)            ; output 7

        sub.l   d1,d0
        add.l   d1,d1
        add.l   d0,d1
        swap    d1
        move.w  d1,3*DCTSIZE*2-2(a0)            ; output 3
        swap    d0
        move.w  d0,4*DCTSIZE*2-2(a0)            ; output 4

        sub.l   d5,d7
        add.l   d5,d5
        add.l   d7,d5
        swap    d5
        move.w  d5,1*DCTSIZE*2-2(a0)            ; output 1
        swap    d7
        move.w  d7,6*DCTSIZE*2-2(a0)            ; output 6

        sub.l   d2,d3
        add.l   d2,d2
        add.l   d3,d2
        swap    d2
        move.w  d2,2*DCTSIZE*2-2(a0)            ; output 2
        swap    d3
        move.w  d3,5*DCTSIZE*2-2(a0)            ; output 5

idct2_next:
        move.l  (sp)+,d7
        dbra    d7,idct2

idct2_ready:
        add.w   #JRD_TMPSIZE,sp                 ; drop the pass-1 scratch

;       movem.l (sp)+,JREVDCTREGS
        movem.l (sp)+,ri_regs                   ; pushed by @j_rev_dct
        rts


; **************************************************************************
; Exported entry point.
;
; In:  a0 = pointer to the block; it is transformed in place.
; Saves ri_regs (defined outside this file) and continues in jrevdct, which
; restores them and returns to the caller.
;
        XDEF    @j_rev_dct
@j_rev_dct:
        movem.l ri_regs,-(sp)
        bra     jrevdct


        ifeq    1                               ; 1 is not 0 -> everything up to endc is NOT assembled

; **************************************************************************
; Pre compute singleton coefficient IDCT values.
;
; void init_pre_idct(void)
;
; NOTE(review): disabled code, left untouched.  The indexing looks
; inconsistent: PreIDCT is 64*64 words, but the clear loop covers
; 256*16 bytes, the coefficient is addressed with a byte index
; (a2,d2.w) and the table pointer steps by 64 bytes although the last
; table is addressed at 63*64*2.  Check all of this before enabling it.
;

        XDEF    @init_pre_idct
@init_pre_idct:
        movem.l d2/a2,-(sp)

        lea     PreIDCT,a2
        move.w  #64*64/4/4-1,d2
preidctclr:
        clr.l   (a2)+
        clr.l   (a2)+
        clr.l   (a2)+
        clr.l   (a2)+
        dbra    d2,preidctclr

        lea     PreIDCT+63*64*2,a2
        moveq   #63,d2
preidctloop:
        move.w  #2048,(a2,d2.w)
        move.l  a2,a0
        bsr     @j_rev_dct
        sub.w   #64,a2
        dbra    d2,preidctloop

        movem.l (sp)+,d2/a2
        rts


; ************************************************************************************
; Perform the inverse DCT on one block of coefficients.
;
; void j_rev_dct_sparse (DCTBLOCK data, int pos)
;
; The code reads the block pointer from a0 and pos from d0.
;
; NOTE(review): disabled code, left untouched.  pos is used unscaled as a
; byte offset into the block, but scaled by 128 for the PreIDCT lookup --
; confirm the intended unit of pos before enabling it.
;

        XDEF    @j_rev_dct_sparse
@j_rev_dct_sparse:
        tst.l   d0
        bne     itsnotthedc

        ; the single element to cope with is the dc coefficient
        move.w  (a0),d1
        bpl.s   scale_dc
        subq.w  #3+4,d1                         ; "implement" the rounding error
scale_dc:
        addq.w  #4,d1
        asr.w   #3,d1

        move.w  d1,d0                           ; extend to longword
        swap    d0
        move.w  d1,d0

        moveq   #7,d1
set_dc:
        move.l  d0,(a0)+
        move.l  d0,(a0)+
        move.l  d0,(a0)+
        move.l  d0,(a0)+
        dbra    d1,set_dc

        rts                                     ; not that pretty
;       bra     exit_jrds

itsnotthedc:
        movem.l d2/d3,-(sp)

        ; Some other coefficient.
        move.w  (a0,d0.w),d1                    ; get coeff

        lea     PreIDCT,a1                      ; get precalculated DCT
        lsl.l   #7,d0
        add.l   d0,a1

        moveq   #CONST_BITS-PASS1_BITS-8,d3     ; scale down

        moveq.l #31,d0                          ; 32 iterations, two words each
set_ac:
        move.w  d1,d2
        muls    (a1)+,d2
        lsr.l   d3,d2
        move.w  d2,(a0)+
        move.w  d1,d2
        muls    (a1)+,d2
        lsr.l   d3,d2
        move.w  d2,(a0)+
        dbra    d0,set_ac

        movem.l (sp)+,d2/d3

exit_jrds:
        rts


; ************************************************************************************

        section bss,BSS

;
; Precomputed idct value arrays
;
PreIDCT:
        ds.w    64*64

        endc

;       END