*******************************************************************
*:ts=15
*:fo=helvetica,13
*
*:ts=22
*:fo=times,15
*
*:ts=10
*:fo=diamond,12
*
*:ts=15
*:fo=ruby,15
*
** or any fixed-width-font, e.g.
*:ts=11
*:fo=excel,11
*
*
*
* TxtCharCopyA()
*
* This routine was written as a replacement for the stuff
* done in ANSI-C for "BareED".
*
* This routine can copy overlapping memory regions; non-
* overlapping copies are supported, too.  It is more than 100%
* faster than its equivalent (byte-copy) in ANSI-C.
* It does not require an MC68020 or higher or other specific
* hardware; it runs on a plain MC68000.
*
* The copy mode is selected within the routine; there are a
* couple of them available.
*
* Calling convention:
*
*	TxtCharCopyA( source-addr, dest-addr, length);
* void	TxtCharCopyA( char *, char *, ULONG)
*
* IMPROVEMENTS:
*	MEMORY ACCESS DRASTICALLY REDUCED
*	MUCH FASTER ON A 68020 OR WHEN CACHES ARE DISABLED
*	DRASTICALLY SHORTENED
*	OPTIMIZED FOR 32BIT CPUs
*
*	Can now be assembled using a68k.
*
*
* Copyright 1996/97/98/99 Jörg van de Loo
*
* $VER: TxtCharCopyA 2.0 (07.02.1997)
*

* Assembler set-up.  The OPT/IDNT/MACHINE/OUTPUT directives are only
* assembled when __G2 is defined (see the comment on the IFD line).
	IFD		__G2		; HiSoft Devpac Amiga assembler?
	OPT		L+		; Create link-able code
	IDNT		'cpylib.asm'
	MACHINE		MC68000		; Processor
	OUTPUT		cpylib.lib		; Where to store object?
	ENDC

* Exported entry points: _TxtCharCopy takes its arguments from the
* stack, _TxtCharCopyA takes them in A0/A1/D0.
	XDEF	_TxtCharCopy
	XDEF	_TxtCharCopyA

* The only include in this file; the AttnFlags offset and the AFF_680x0
* masks used by the copy code are not defined here, so they are expected
* to come from it.
	include	exec/execbase.i

	SECTION TEXT,CODE
* Identification string embedded in the code section, followed by
* padding so that the first instruction starts on a 4-byte boundary.
	dc.b	'TXTCHARCOPY 2.0 © COPYRIGHT 1996-99 J.v.d.Loo'
	CNOP	0,4
*
** C-entry point
*
_TxtCharCopy
* Stack-argument entry: 4(sp) = source, 8(sp) = destination,
* 12(sp) = length.  The arguments are loaded into the registers that
* _TxtCharCopyA expects and execution falls through into it.
	movem.l	4(sp),A0-A1		; A0 = source, A1 = destination
	move.l	12(sp),D0		; D0 = number of bytes to copy
*
** Source, Destination, size - calling convention "CopyMem()" alike
*
* In:	A0 = source address
*	A1 = destination address
*	D0 = number of bytes to copy (32 bit)
* Out:	nothing.  D2-D6/A2 are saved here and restored on exit;
*	D0, D1, A0 and A1 are modified.
*
* Chooses the copy direction so that overlapping regions are copied
* correctly:
*	source above destination		-> ascending copy
*	source end <= destination start		-> ascending copy
*	otherwise (destination inside source)	-> descending copy
*
_TxtCharCopyA
	movem.l	D2-D6/A2,-(sp)		; Restored at _TxtCharCopyDone

	moveq	#24,D3		; Shift values used by the byte-merging
	moveq	#8,D4		; loops (.sode1 and .fsode1)

	cmpa.l	A1,A0		; Source above destination?
	bhi.w	_ForwardCopy	; Ascend copy

	move.l	A0,D1		; Check if we're going to
	add.l	D0,D1		; copy 'overlapped': D1 = end of source
	cmp.l	A1,D1		; Source ends at or below destination start?
	bls.w	_ForwardCopy	; Ascend copy
* Not taken: the destination starts inside the source region, so
* execution falls through into the descending copy.

* %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

_ReverseCopy			; Descend copy
*
* Descending copy.  Reached by fall-through from _TxtCharCopyA when the
* destination starts inside the source region, so bytes must be moved
* from the highest address downwards.
*
* In:	A0 = source start, A1 = destination start, D0 = byte count
*	D3 = 24, D4 = 8 (shift counts for the merge loop)
* Uses:	D1, D2, D5, D6, A2 as scratch
* Exit:	branches to _TxtCharCopyDone
*
* Up to 32 bytes are copied byte by byte.  Longer copies pick a long-
* word loop depending on the parity of both pointers; whatever is left
* over is finished by the byte loop at .rbytecpy.
*
	lea	0(A0,D0.l),A0		; A0 = one past the end of the source
	lea	0(A1,D0.l),A1		; A1 = one past the end of the destination

	cmpi.l	#32,D0		; At least 33 bytes to copy?
	bls.w	.rbytecpy

	move.l	A0,D1
	andi.b	#1,D1
	beq.s	.rsrceven		; Source even

* --------------------------------- *
.rsrcodd
	move.l	A1,D1
	andi.b	#1,D1
	bne.s	.rbothodd

.rscrOddDestEven
* ---- Source is odd, destination is even --- *
.sode
	movea.l	(4).w,A2		; 2478 0004 instead of 2479 0000 0004, Zero-Page = word addressing mode
	move.w	AttnFlags(A2),D1
	andi.w	#AFF_68020!AFF_68030!AFF_68040!AFF_68060,D1
	bne.s	.rbotheven		; Don't care about the limits of the 68000/08/10/12

* Plain 68000 path: the source is read as even-aligned longwords and
* shifted by one byte so that the destination is written as even-
* aligned longwords as well.  The loop reads 1 + 4*n source bytes but
* stores only 4*n, so n is computed from (D0 - 1): at least one byte is
* always left for the byte loop and the look-ahead never touches memory
* below the start of the source.
	move.l	D0,D2		; Amount
	subq.l	#1,D2		; Keep >= 1 byte back for the byte loop
	andi.l	#-4,D2		; Bytes moved by the long loop (multiple of 4)
	sub.l	D2,D0		; Compute rest (1..4 bytes)
	lsr.l	#2,D2		; Through 4 (number of longs)
	subq.l	#1,D2		; 'Cause of 'Carry'

	moveq	#0,D1		; Bits 8-31 must be clear, D1 is OR-ed into D5
	move.b	-(A0),D1		; Get the even byte	[ ___e ]
.sode1
	move.l	-(A0),D5		; Get the even long	[ abcd ]
	move.l	D5,D6		; Save it for later	[ abcd ]
	lsl.l	D4,D5		; <<8		[ bcd_ ]
	lsr.l	D3,D6		; >>24		[ ___a ]
	or.l	D1,D5		; Join...		[ bcde ]
	move.l	D5,-(A1)		; Store longword	[ bcde ]
	move.b	D6,D1		; D1 (byte)	[ ___a ]
	subq.l	#1,D2		; Decrease number of loops
	bcc.s	.sode1		; If there is a rest...
	addq.l	#1,A0		; One too far: byte 'a' was read but not stored
	bra.s	.rbytecpy

.rbothodd
* ---- Source is odd, destination is odd --- *
	move.b	-(A0),-(A1)
	subq.l	#1,D0		; Now both even!

* ---- Source is even, destination is even --- *
* Also used on a 68020 or better, whatever the pointer parity is.
.rbotheven
	move.l	D0,D2		; Amount
	andi.l	#-32,D2		; Bytes moved by the long loop (multiple of 32)
	sub.l	D2,D0		; Compute rest (0..31 bytes)
	lsr.l	#5,D2		; Through 32 (number of 32-byte blocks)
	subq.l	#1,D2		; 'Cause of 'Carry'
.sede
	move.l	-(A0),-(A1)
	move.l	-(A0),-(A1)
	move.l	-(A0),-(A1)
	move.l	-(A0),-(A1)

	move.l	-(A0),-(A1)
	move.l	-(A0),-(A1)
	move.l	-(A0),-(A1)
	move.l	-(A0),-(A1)

	subq.l	#1,D2		; Stored 32 bytes
	bcc.s	.sede		; A rest?
	bra.s	.rbytecpy

* --------------------------------- *
.rsrceven
	move.l	A1,D1
	andi.b	#1,D1
	beq.s	.rbotheven

.rsrcEvenDestOdd
* ---- Source is even, destination is odd --- *
.sedo
	move.b	-(A0),-(A1)
	subq.l	#1,D0
	bra.s	.sode		; Now source odd, destination even!

* ---- Check for rest of characters to copy --- *
* D0 is at most 32 here, so testing and counting its low byte is enough.
.rbytecpy
	tst.b	D0
	beq.s	.rdone

.rbytecpyLoop
	move.b	-(A0),-(A1)
	subq.b	#1,D0
	bne.s	.rbytecpyLoop
.rdone
	bra.w	_TxtCharCopyDone

* %%%%%%%%%%%%%%%%%%%%%%%%%%%

_ForwardCopy
*
* Ascending copy.  Entered from _TxtCharCopyA when the source lies
* above the destination or the source ends at or below the start of
* the destination, so copying from the lowest address upwards is safe.
*
* In:	A0 = source start, A1 = destination start, D0 = byte count
*	D3 = 24, D4 = 8 (shift counts for the merge loop)
* Uses:	D1, D2, D5, D6, A2 as scratch
* Exit:	falls through into _TxtCharCopyDone
*
* Up to 32 bytes are copied byte by byte.  Longer copies pick a long-
* word loop depending on the parity of both pointers; whatever is left
* over is finished by the byte loop at .fbytecpy.
*
	cmpi.l	#32,D0		; At least 33 bytes to copy?
	bls.w	.fbytecpy	; Word branch: target is close to the short-branch limit

	move.l	A0,D1
	andi.b	#1,D1
	beq.s	.fsrceven		; Source even

* --------------------------------- *
.fsrcodd
	move.l	A1,D1
	andi.b	#1,D1
	bne.s	.fbothodd

.fscrOddDestEven
* ---- Source is odd, destination is even --- *
.fsode
	movea.l	(4).w,A2
	move.w	AttnFlags(A2),D1
	andi.w	#AFF_68020!AFF_68030!AFF_68040!AFF_68060,D1
	bne.s	.fbotheven		; Don't care about the limit of the 68000

* Plain 68000 path: the source is read as even-aligned longwords and
* shifted by one byte so that the destination is written as even-
* aligned longwords as well.  The loop reads 1 + 4*n source bytes but
* stores only 4*n, so n is computed from (D0 - 1): at least one byte is
* always left for the byte loop and the look-ahead never touches memory
* beyond the end of the source.
	move.l	D0,D2		; Amount
	subq.l	#1,D2		; Keep >= 1 byte back for the byte loop
	andi.l	#-4,D2		; Bytes moved by the long loop (multiple of 4)
	sub.l	D2,D0		; Compute rest (1..4 bytes)
	lsr.l	#2,D2		; Through 4 (number of longs)
	subq.l	#1,D2		; 'Cause of 'Carry'

* Bits 8-31 of D1 are undefined here; the shift by 24 below pushes
* them out, so they need not be cleared.
	move.b	(A0)+,D1		; The odd byte	[ ><>a ]
.fsode1
	lsl.l	D3,D1		; <<24		[ a___ ]
	move.l	(A0)+,D5		; The even long	[ bcde ]
	move.b	D5,D6		; The odd byte	[ ><>e ]
	lsr.l	D4,D5		; >>8		[ _bcd ]
	or.l	D5,D1		; Join...		[ abcd ]
	move.l	D1,(A1)+		; Store longword	[ abcd ]
	move.b	D6,D1		; D1 (byte)	[ ><>e ]
	subq.l	#1,D2		; Decrease number of loops
	bcc.s	.fsode1		; If there is a rest...
	subq.l	#1,A0		; One too far: byte 'e' was read but not stored
	bra.s	.fbytecpy

.fbothodd
* ---- Source is odd, destination is odd --- *
	move.b	(A0)+,(A1)+
	subq.l	#1,D0		; Now both even!

* ---- Source is even, destination is even --- *
* Also used on a 68020 or better, whatever the pointer parity is.
.fbotheven
	move.l	D0,D2		; Amount
	andi.l	#-32,D2		; Bytes moved by the long loop (multiple of 32)
	sub.l	D2,D0		; Compute rest (0..31 bytes)
	lsr.l	#5,D2		; Through 32 (number of 32-byte blocks)
	subq.l	#1,D2		; 'Cause of 'Carry'
.fsede
	move.l	(A0)+,(A1)+
	move.l	(A0)+,(A1)+
	move.l	(A0)+,(A1)+
	move.l	(A0)+,(A1)+

	move.l	(A0)+,(A1)+
	move.l	(A0)+,(A1)+
	move.l	(A0)+,(A1)+
	move.l	(A0)+,(A1)+

	subq.l	#1,D2		; Stored 32 bytes?
	bcc.s	.fsede		; A rest?
	bra.s	.fbytecpy

* --------------------------------- *
.fsrceven
	move.l	A1,D1
	andi.b	#1,D1
	beq.s	.fbotheven

.fsrcEvenDestOdd
* ---- Source is even, destination is odd --- *
	move.b	(A0)+,(A1)+
	subq.l	#1,D0
	bra.s	.fsode		; Now source odd, destination even!

* ---- Check for rest of characters to copy --- *
* D0 is at most 32 here, so testing and counting its low byte is enough.
.fbytecpy
	tst.b	D0
	beq.s	.fdone
.fbytecpyLoop
	move.b	(A0)+,(A1)+
	subq.b	#1,D0
	bne.s	.fbytecpyLoop
.fdone

* --------------------------------- *
_TxtCharCopyDone
* Common exit for both copy directions: restore the registers pushed
* at _TxtCharCopyA and return to the caller.  No result is returned.
	movem.l	(sp)+,D2-D6/A2
	rts

	END