;******************************************************************************
; 96/192-bit sync pattern finder.  Computes running correlations over 4 frames.
;******************************************************************************

;Algorithm:
;  Clear correlation buffer (corbuf)
;  Clear running sum buffer (sumbuf)
;  Receive F+1 values via DMA in Y buffer
;  Repeat forever {
;    Clear running max (max, maxsq)
;    Repeat for i=1,...,B/A
;      Receive A values from DMA in Y buffer (ybuf)
;      Receive A values from DMA in correlation buffer
;      Receive A values from DMA in sum buffer
;      Repeat for ct=A-1,...,0 {
;        Compute Fx1 correlation
;        Update running sum, running max, correlation buffer
;      }
;      Send A values from correlation buffer via DMA
;      Send A values from sum buffer via DMA
;    }
;    Report max-2B to control DSP
;  }
;
;The "Update running sum, ..." line really means:
; m=s[ct]+sum(yf)
; max=(m^2>=max^2)?m:max
; s[ct]=m-c[j][ct]
; c[j][ct]=sum(yf)
;
;F=frame sync length (96), B=block length (1788*3+96), A<4096 (B/2)
;
;Internal RAM:
;          y:                   correlation:           sums:
; <----- power of 2 ------>
; <-- A+F --><-- A --><--->  <-- A --><-- A -->  <-- A --><-- A -->
;   compute     DMA           compute    DMA      compute    DMA
;
;SBSRAM:
;                               correlation:           sums:
;                          i-2 <---- B ---->       <---- B ---->
;                          i-1 <---- B ---->
;                           i  <---- B ---->
;Global RAM:
;          y:
; -- from control DSP --


;The pipelined multiply-adds of the correlation computation are arranged as
;shown in the following sketch.  Time is horizontal; computation for each pair
;of correlations begins at the top and finishes at the bottom of the sketch.
;The top and bottom edges are numbered to show when each pair of computations
;begins and ends, respectively.  Each correlation computation requires 54
;clock cycles, (numbered 2 through 55 for historical reasons).  A few of
;these key cycle numbers are shown on the left edge of the sketch.  In the
;assembly code, most lines are commented with this cycle number as
;";NN: comment".  For each pair of correlations computed, the input symbols
;are fetched in 22 sets of four (or occasionally two), numbered 0 through 21.
;About half are processed by the A-side registers, and half by the B-side
;registers, and are sorted by whether they are fetched from even or odd word
;addresses (to avoid bank hits).  For example, the 7th step of a correlation
;computation involves (among other things) reading the 5th set of four
;symbols from memory.  Those lines in the assembly language bear the comments
;";7: a5" and ";7: b5", and these comments can be used as search keys.
;
;     0   2   4   6      8  10  12      14  16  18     20  22  24      26   ...
; 2  |\   \   \   \   | |\   \   \   |  |\   \   \   | |\   \   \   |  |\
;    | \   \   \   \  | | \   \   \  |  | \   \   \  | | \   \   \  |  | \
;    |  \   \   \   \ | |  \   \   \ |  |  \   \   \ | |  \   \   \ |  |  \
;    |   \   \   \   \|_|   \   \   \|__|   \   \   \|_|   \   \   \|__|   \
;25  |\   \   \   \   | |\   \   \   |  |\   \   \   | |\   \   \   |  |\
;    | \   \   \   \  | | \   \   \  |  | \   \   \  | | \   \   \  |  | \
;    |  \   \   \   \ | |  \   \   \ |  |  \   \   \ | |  \   \   \ |  |  \
;    |   \   \   \   \|_|   \   \   \|  |   \   \   \|_|   \   \   \|  |   \
;48  |\   \   \   \   | |\   \   \   |  |\   \   \   | |\   \   \   |  |\
;55  | \   \   \   \  | | \   \   \  |\ | \   \   \  | | \   \   \  |\ | \
;       x  -2   0   2      4   6   8  10   x  12  14     16  18  20  22   x ...
;             |__________________________|    |_________________________|
;                result=max(these 12)            result=max(these 12)       ...
;
; Sketched for A=6, B=A*2=12.
;
;Note that the very first pass, the loop is repeated (A+2)/2 times; all
;subsequent passes execute the loop A/2 times.  This means the block of B
;symbols fetched are offset from the max function by 2 more than one might
;expect.

	.include "gram.asm"
;	.include "gramsync.asm"	;## for testing as a single DSP

;---- Sizing constants and buffer allocation ----
;A, B, F and LASTA are not defined in this file; presumably they come from
;the included gram.asm (TODO confirm).
;YSIZE is both the size and the alignment of yblk.  The AMR setup further
;down selects a 16K circular block for A4/B4/B6, which matches YSIZE=16384.
YSIZE	.equ	16384	;must be >= 2(2A+F+2), and a power of 2
			;It turns out 32768 is optimal for all F
			;but 16384's OK, and avoids an assembler error for now
;CONST1..CONST3 are added to maxi in the "finish max" and "dma" code below;
;one set is chosen per sync length F.
	.if	F=96
CONST1	.equ	122	;Magical constants to correct various maxi offsets
CONST2	.equ	-124
CONST3	.equ	-126
	.else	;F=192
;CONST1	.equ	34	;I thought this was right before 10/30/00.
CONST1	.equ	82	;As of 10/30/00, I now think maybe it's this. ##
CONST2	.equ	-36
CONST3	.equ	-38
	.endif

;Internal buffers.  corbuf/sumbuf point past the leading pad words of their
;blocks; the clear loops and the initial cbuf/sbuf pointers start at
;corbuf-4 and sumbuf-4 respectively.
	.bss	yblk,YSIZE,YSIZE
ybuf	.equ	yblk
ybufe	.equ	yblk+YSIZE	;just past end of ybuf
	.bss	corblk,A*4+4,8	;1 word overwritten, 2A shorts
corbuf	.equ	corblk+4	;odd word aligned
	.bss	sumblk,A*4+8,8	;1 word unused, 1 word overwritten, 2A shorts
sumbuf	.equ	sumblk+8	;even word aligned
	.bss	marktbl,0x100	;for debug and timing only

;External ("sbsram" section) copies of the correlation and sum histories.
;The ...e symbols mark one byte past the end and are used as wrap limits.
corbufx	.usect	"sbsram",(B-LASTA+A)*6,4	;might be slightly bigger than
corbufxe .equ	corbufx+(B-LASTA+A)*6		;B, since A>=LASTA.
sumbufx	.usect	"sbsram",(B-LASTA+A)*2,4	;not used when A=B/2
sumbufxe .equ	sumbufx+(B-LASTA+A)*2


;---- Memory-mapped peripheral map ----
;timer0 and DMA are base addresses.  All other symbols here are WORD indices
;used with scaled addressing, e.g. *dmas[DMA1+DMAsec] = DMA + 4*(16+2).
;A DMA register index is formed as (channel offset) + (register offset).
timer0	.set	0x01940000
timerC	.set	0
timerT	.set	1
timerD	.set	2
DMA	.set	0x01840000
;Channel offsets
DMA0	.set	0
DMA2	.set	1
DMA1	.set	16
DMA3	.set	17
;Per-channel register offsets (add to a channel offset)
DMApri	.set	0
DMAsec	.set	2
DMAsrc	.set	4
DMAdst	.set	6
DMAcnt	.set	8
;Global DMA registers (used directly as indices, without a channel offset).
;This file uses DMAadA and DMAadD only as scratch storage ("secret reg")
;for the current sumbufx and corbufx pointers.
DMActA	.set	10
DMActB	.set	11
DMAixA	.set	12
DMAixB	.set	13
DMAadA	.set	14
DMAadB	.set	15
DMAadC	.set	26
DMAadD	.set	27
VINTA	.set	0x016D8004	;DSP A's interrupt register
VINTB	.set	0x016D8008	;DSP B's interrupt register

;*** Name some registers ***
;Shuffle these assignments at will, within the following constraints:
;  Preserve a-side/b-side for all but DMA registers
;  ct,s2,dmac,cnd are condition regs
;  ada, adb, and dmady require circular addressing.
;All 32 general registers (a0-a15, b0-b15) are given a name below, so any
;scratch use outside the loop necessarily reuses one of these names.
maxsq	.equ	a0	;max squared
s0	.equ	b0	;sum of values with +1 in correlations 1 and 2
s2	.equ	a1	;sum of values with +1 in correlation 2 only
ct	.equ	b1	;loop counter
cnd	.equ	a2	;a condition register
sum	.equ	b2	;sum of all F values
dlta	.equ	a3	;second inductive correction to sum
dltb	.equ	b3	;first inductive correction to sum
ada	.equ	a4	;circular address (side a/b) (in ybuf)
adb	.equ	b4
sbuf	.equ	a5	;sum buffer pointer
cbuf	.equ	b5	;correlation buffer pointer
timer	.equ	a6
ofs	.equ	b6	;offset to actual y values in SBSRAM
rcor	.equ	a7	;running correlation
s1	.equ	b7	;sum of values with +1 in correlation 1 only
lowa	.equ	a8	;first (low half) of fetched pair
lowb	.equ	b8
higha	.equ	a9	;second (high half) of fetched pair
highb	.equ	b9
onea	.equ	a10	;const 10000h, low half is also outer loop counter
oneb	.equ	b10	;const 10000h for splitting high/low halves
s0a	.equ	a11	;running sum0 (s0=s0a+s0b)
s0b	.equ	b11
s1a	.equ	a12	;running sum1 (s1=s1a+s1b)
s1b	.equ	b12
s2a	.equ	a13	;running sum2 (s2=s2a+s2b)
s2b	.equ	b13
mark	.equ	a14	;where the final results are stored
cor	.equ	b14	;old correlation
max	.equ	a15	;max value
maxi	.equ	b15	;offset of max value

	;DANGEROUS!  The names below alias registers already named above.
	;s0, s1, s2, a6, b6, and dltb are available during spin-down code
	;NOTE(review): a6 and b6 are named timer and ofs above, and the
	;dmasy/dmady aliases that used them are commented out -- confirm
	;whether a6/b6 are still meant to be treated as scratch.
;dmad	.equ	s0	;DMA simulator circular dest address (in ybuf)
dmac	.equ	s2	;DMA simulator counter
;dmasy	.equ	a6	;DMA simulator source address
;dmady	.equ	b6	;DMA simulator non-circular dest address
dmas	.equ	dltb	;DMA simulator source address for y
dmav	.equ	s0	;DMA simulator data value

	.sect	".vectors"
;Reset vector: branch to start.  The multi-cycle nop issued in parallel
;fills the branch delay slots.
	b.s1	start	;IST entry 0 (RESET)
||	nop	5

;tstamp: debug/timing hook.  The body is entirely commented out, so the
;macro currently expands to nothing.  When enabled it would read the timer
;register at index timerD into dmav and append it at *mark++.
tstamp	.macro
;	ldw	*timer[timerD],dmav
;	nop	4
;	stw	dmav,*mark++
	.endm

	.sect	.text
;Program entry point, reached from the reset vector.
;ada, s0a and dmav are used purely as scratch in this section; they are
;given their loop values later, under "Initialize all registers".
start:

;*********** Initialization ***********

;Initialize some CPU control registers for safety
	mvk	0x00000100,ada	;Disable interrupts, little-endian.
	mvc	ada,csr
	zero	s0a		;Make all registers linear
	mvc	s0a,amr
	mvk	-1,dmav		;Clear any pending interrupts
	mvc	dmav,icr

;Timer sequence: write control with the timer held, set the register at
;index timerT to all ones, then write control again with the run bits set.
	mvk	timer0,timer	;Fire up the timer
	mvkh	timer0,timer
	mvk	0x204,dmav	;cpuclk(0x200)+enable global SRAM(0x4)
	stw	dmav,*timer[timerC]	;hold timer
	mvk	-1,dmav
	stw	dmav,*timer[timerT]	;max count=0xffffffff
	mvk	0x2c4,dmav	;cpuclk(0x200)+run(0xc0)+global SRAM (0x4)
	stw	dmav,*timer[timerC]	;start timing

;---- Zero the internal and external correlation/sum buffers ----
;All four loops share one shape: the conditional branch is issued first and
;the sub / stw / nop 3 that follow occupy its five delay slots.  The store
;therefore also executes on the final pass (ct=0, branch not taken), giving
;ct+1 word stores in total -- hence the "minus 1" in the count comments.
;ada is still linear here (AMR was cleared above).

;Clear corbuf in local memory
	mvk	corbuf-4,ada
	mvkh	corbuf-4,ada
	zero	s0a
	mvk	A,ct	;number of words to clear, minus 1
clrcor:
[ct]	b	clrcor
	sub	ct,1,ct
	stw	s0a,*ada++
	nop	3

;Clear sumbuf in local memory, including the preceding half word!
	mvk	sumbuf-4,ada
	mvkh	sumbuf-4,ada
	zero	s0a
	mvk	A,ct	;number of words to clear, minus 1
clrsum:
[ct]	b	clrsum
	sub	ct,1,ct
	stw	s0a,*ada++
	nop	3

;Clear corbuf in external memory
;(s0a is not re-zeroed here; it still holds 0 from the loop above.)
	mvk	corbufx,ada
	mvkh	corbufx,ada
	mvk	(corbufxe-corbufx)/4-1,ct ;number of words to clear, minus 1
	mvkh	(corbufxe-corbufx)/4-1,ct
clrcrx:
[ct]	b	clrcrx
	sub	ct,1,ct
	stw	s0a,*ada++
	nop	3

;Clear sumbuf in external memory (sumbufx not needed when B=2A, or B=A)
	.if B>A+LASTA
	mvk	sumbufx,ada
	mvkh	sumbufx,ada
	mvk	(sumbufxe-sumbufx)/4-1,ct ;number of words to clear, minus 1
	mvkh	(sumbufxe-sumbufx)/4-1,ct
clrsmx:
[ct]	b	clrsmx
	sub	ct,1,ct
	stw	s0a,*ada++
	nop	3
	.endif

;Make addresses in ybuf circular
;From here on, address arithmetic done through A4 (ada), B4 (adb) and
;B6 (ofs) in load/store addressing wraps inside a 16K block.
	mvk	0x1101,s0a	;Make A4, B4, B6 circular, size 16K
	mvklh	0x000d,s0a
	mvc	s0a,amr

;Initialize lots of DMA registers
;dmas (an alias of dltb) holds the DMA register base; dmav (an alias of s0)
;is the scratch value being written.  Each mvk/mvkh pair builds a full
;32-bit constant.
	mvk	DMA,dmas
	mvkh	DMA,dmas
	mvk	0x0000a000,dmav		;clear block IE for DMA 0,2
	mvkh	0x0000a000,dmav
	stw	dmav,*dmas[DMA0+DMAsec]
	stw	dmav,*dmas[DMA2+DMAsec]
	mvk	0x0000a080,dmav		;set block IE for DMA 1
	mvkh	0x0000a080,dmav
	stw	dmav,*dmas[DMA1+DMAsec]
;The two global DMA address registers below are used only as storage for
;the running external-buffer pointers; they are read back and advanced in
;the DMA exchange code after each loop pass.
	mvk	sumbufx,dmav		;store sumbufx ptr in secret reg
	mvkh	sumbufx,dmav
	stw	dmav,*dmas[DMAadA]
	mvk	corbufx,dmav		;store corbufx ptr in secret reg
	mvkh	corbufx,dmav
	stw	dmav,*dmas[DMAadD]

;Announce to Control DSP that we're alive
;(writes -1 to gsync; gsync is defined outside this file)
	mvk	gsync,s0a
	mvkh	gsync,s0a
	mvk	-1,dmav
	stw	dmav,*s0a

;Wait for B+A+F+2 values to arrive in gybuf
;Polls IFR with interrupts disabled.  extu 27,31 isolates IFR bit 4 and
;shl 4 moves it back to bit position 4, so ct is either 0 or 0x10; that
;same mask is then written to ICR and to VINTB to acknowledge.
iwait:	mvc	ifr,ct		;What interrupts are pending?
	extu	ct,27,31,ct	;mask for inter-DSP interrupt
	shl	ct,4,ct
[!ct]	b	iwait
	nop	5
	mvc	ct,icr		;Found.  Now clear it at the CPU
	mvk	VINTB,dmav	;and in the Monaco registers
	mvkh	VINTB,dmav
	stw	ct,*dmav

;Receive A+F+2 values for ybuf with DMA0
;The count is written as (A+F+2)/2, i.e. the number of 32-bit words that
;hold A+F+2 shorts.  dmas still holds the DMA base from the init code above.
	mvk	gybuf0,dmav	;source (next block is B-short aligned)
	mvkh	gybuf0,dmav
	stw	dmav,*dmas[DMA0+DMAsrc]
	mvk	ybuf,dmav		;destination
	mvkh	ybuf,dmav
	stw	dmav,*dmas[DMA0+DMAdst]
	mvk	(A+F+2)/2,dmav		;count
	stw	dmav,*dmas[DMA0+DMAcnt]
	mvk	0x01000051,dmav		;vanilla DMA, DMA priority
	mvkh	0x01000051,dmav
	stw	dmav,*dmas[DMA0+DMApri]	;start DMA
	nop	9			;how fast does the DMA respond?
;Busy-wait until bits 2-3 (mask 0xc) of DMA0's primary control read zero.
;nop 4 covers the load delay slots before cnd is tested.
dwait:	ldw	*dmas[DMA0+DMApri],cnd	;wait for it to finish
	nop	4
	and	0xc,cnd,cnd
[cnd]	b	dwait
	nop	5

;Compute sum for correlation
;sum = y[0] + ... + y[F-1], read as halfwords from ybuf.
;Schedule of one pass: the ldh result is usable 5 cycles after issue, which
;is exactly where the add sits; nop 2 / add / sub / nop fill the five branch
;delay slots.  ct is tested before it is decremented, so the body runs for
;ct = F-1 down to 0, i.e. F times.
	mvk	ybuf,ada	;Compute sum of first F elements
	mvkh	ybuf,ada
	mvk	F-1,ct		;number of elements to sum, minus 1
	zero	sum
isum:
	ldh	*ada++,lowa
	nop
[ct]	b	isum
	nop	2
	add	lowa,sum,sum
	sub	ct,1,ct
	nop

;Read A values for ybuf with DMA0
;Only the count and the control word are rewritten here; the DMA0 source
;and destination registers are not reloaded.
;NOTE(review): this relies on DMA0's source/destination having advanced
;past the first transfer -- confirm against the DMA documentation.
;The transfer is started but not waited for here.
	.if	B=A+LASTA	;If this is the last fragment,
	mvk	LASTA/2,dmav	;then count is LASTA shorts
	.else
	mvk	A/2,dmav	;else, count is A shorts.
	.endif
	stw	dmav,*dmas[DMA0+DMAcnt]
	mvk	0x01000051,dmav		;basic DMA, DMA priority
	mvkh	0x01000051,dmav
	stw	dmav,*dmas[DMA0+DMApri]	;start DMA

;Initialize all registers
;Sets up every named loop register before the pipelined loop is entered at
;clk2.  Note that zeroing dltb also discards the DMA base that dmas (the
;same register) held; dmas is reloaded at label dma after the loop.
;s0 is not cleared here: starting from clk2 it is written (clock 8) before
;its first read (clock 9).
	mvk	marktbl,mark	;table of identified ASM locations
	mvkh	marktbl,mark
	mvk	gybuf+(B-A-F-2)*2,ofs	;actual address of external y data
	mvkh	gybuf+(B-A-F-2)*2,ofs
	mvk	corbuf-4,cbuf	;first good data goes at corbuf
	mvkh	corbuf-4,cbuf
	mvk	sumbuf-4,sbuf	;first good data goes at sumbuf
	mvkh	sumbuf-4,sbuf
	mvk	(B-LASTA)/A,onea	;low half is outer loop counter
	mvklh	1,onea		;high half=1 for multiply/copy
	mvk	0,oneb		;low half is unused
	mvklh	1,oneb		;high half=1 for multiply/copy
	mvk	ybuf+4,ada
	mvkh	ybuf+4,ada
	mvk	ybuf+24,adb
	mvkh	ybuf+24,adb
	zero	max		;clear all these so running sums are correct
	zero	maxsq		;despite the loop fragments executed
	zero	cor
	zero	rcor
	zero	s1
	zero	s2
	shr	sum,1,s1a	;set s1a, s2a to sum/2, so corr's come out
	shr	sum,1,s2a	;about zero.  Not exact for first 4 results!
	zero	s1b
	zero	s2b
	zero	s0a
	zero	s0b
	zero	dlta
	zero	dltb
	zero	higha
	zero	highb
	zero	lowa
	zero	lowb
;ct is decremented by 2 once per loop pass (clock 7), so A+2 gives the
;(A+2)/2 passes described in the file header for the very first block.
	mvk	A+2,ct		;Number of correlations to compute, plus 2

;Ready to go!
	tstamp
	b	clk2		;Start loop at clock 2
	nop	5		;Fill remaining branch delay slots

;*********** The loop ***********
;Reading guide for everything up to the spin-down code:
; - Lines joined by || form one execute packet and issue in the same clock.
; - Each line's trailing ";NN: ..." comment gives the correlation-relative
;   cycle number and, for data steps, the fetch set (aK / bK) it belongs to,
;   as explained in the file header.
; - Loaded values and multiply results become visible only after their
;   delay slots, so a register read in a packet generally still holds the
;   value produced several packets earlier.  Do not reorder packets.
;
;This first section is the F=192 variant of the loop head (clocks 44'-4').
;"loop" is the target of the [ct] b.s2 issued at clock 38'; "clk2" is the
;initial entry point used by the initialization code.  "clk3" is not
;referenced anywhere in the visible part of this file.

	.if F=192

	;Clock 44' (22)
loop:	ldw.d1	*ada++[-24],lowa	;44': a42
||	ldw.d2	*adb++[-22],lowb	;44': b42
||	mpyh.m1	lowa,onea,higha	;44': a37
||	mpyh.m2	lowb,oneb,highb	;44': b37
||	add.s2	s1b,lowb,s1b	;44': b37
||	add.l1	s0a,higha,s0a	;44': a35
||	add.l2	s0b,highb,s0b	;44': b35

	;Clock 45' (23)
	ldw.d1	*ada++[-21],lowa	;45': a43
||	ldw.d2	*adb++[7],dltb	;45': b43
||	mpyh.m2	lowb,oneb,highb	;45': b38
||	add.s2	s0b,lowb,s0b	;45': b38
||	add.l1	s0a,higha,s0a	;45': a36
||	add.l2	s1b,highb,s1b	;45': b36
||	mpyh.m1	rcor,rcor,cnd	;47: square cor

	;Clock 46' (24)
	mpyh.m2	lowb,oneb,highb	;46': a39
||	add.s2	s0b,lowb,s0b	;46': a39
||	add.l1	s2a,higha,s2a	;46': a37
||	add.l2	s2b,highb,s2b	;46': b37
||	stw.d1	s2,*sbuf++	;47: store running correlation
||	stw.d2	s0,*cbuf++	;47: store new correlation

	;Clock 2' (2)
clk2:	ldw.d1	*ada++[2],s1a	;2': a0, clear s1a
||	ldw.d2	*adb++[-4],s2b	;2': b0, clear s2b
||	mpyh.m2	lowb,oneb,highb	;47': b40
||	add.s1	s1a,lowa,s1a	;47': a40
||	add.s2	s2b,lowb,s2b	;47': b40
||	add.l2	s0b,highb,s0b	;47': b38
||	mpy.m1	rcor,rcor,cnd	;48: square rcor

	;Clock 3' (3)
clk3:	ldw.d1	*ada++[2],s2a	;3': a1, clear s2a
||	ldw.d2	*adb++[8],lowb	;3': b1
||	mpyh.m1	lowa,onea,higha	;48': a41
||	mpyh.m2	lowb,oneb,highb	;48': b41
||	add.s1	s0a,lowa,s0a	;48': a41
||	add.l2	s0b,highb,s0b	;48': a39
||	cmplt.l1  maxsq,cnd,s2	;49: s2=max^2<cor^2 ? 1 : 0
				;.s2 unused

	;Clock 4' (4)
	ldw.d1	*ada++[2],lowa	;4': a2
||	mpyh.m1	lowa,onea,higha	;49': a42
||	mpyh.m2	lowb,oneb,highb	;49': b42
||	add.s1	s2a,lowa,s2a	;49': a42
||	add.s2	s2b,lowb,s2b	;49': b42
||	add.l2	s0b,highb,s0b	;49': b40
||	ldw.d2	*cbuf,cor	;27: get old correlation
||[s2]	mv.l1x	cor,max		;50: s2 ? max=cor

	.endif

	.if F=96

;F=96 variant of the loop head (clocks 21-24, then 2-4 of the next pair).
;"loop" is the target of the [ct] b.s1 issued at clock 15: the five packets
;for clocks 16-20 sit in that branch's delay slots, so control arrives here
;at clock 21.  "clk2" is the initial entry point used by the initialization
;code.  Each || group is one execute packet; do not reorder packets, since
;results are consumed a fixed number of packets after they are issued.

	;Clock 21
loop:	ldw.d1	*ada++[2],lowa	;21: a19
||	ldw.d2	*adb++[-22],lowb	;21: b19
||	mpyh.m1	lowa,onea,higha	;21: a14
||	mpyh.m2	lowb,oneb,highb	;21: b14
||	add.s1	s0a,lowa,s0a	;21: a14
||	add.s2	s0b,lowb,s0b	;21: b14
||	add.l1	s2a,higha,s2a	;21: a12
||	add.l2	s2b,highb,s2b	;21: b12

	;Clock 22
	ldw.d1	*ada++[-24],lowa	;22: a20
||	ldw.d2	*adb++[-22],lowb	;22: b20
||	mpyh.m1	lowa,onea,higha	;22: a15
||	mpyh.m2	lowb,oneb,highb	;22: b15
||	add.s1	s2a,lowa,s2a	;22: a15
||	add.s2	s0b,lowb,s0b	;22: b15
||	add.l1	s0a,higha,s0a	;22: a13
||	add.l2	s1b,highb,s1b	;22: b13

	;Clock 23
	ldw.d1	*ada++[-21],lowa	;23: a21
||	ldw.d2	*adb++[7],dltb	;23: b21
||	mpyh.m1	lowa,onea,higha	;23: a16
||	mpyh.m2	lowb,oneb,highb	;23: b16 (23: a16, 23: b16)
||	add.l1	s1a,higha,s1a	;23: a14
||	add.l2	s1b,highb,s1b	;23: b14
||	sub2.s1	rcor,cor,s2	;46: sub old term from running correlation
||	shr.s2x	rcor,16,cor	;46: unstack running correlation

	;Clock 24
	mpyh.m2	lowb,oneb,highb	;24: b17
||	add.s1	s1a,lowa,s1a	;24: a17
||	add.s2	s2b,lowb,s2b	;24: b17
||	add.l1	s0a,higha,s0a	;24: a15
||	add.l2	s0b,highb,s0b	;24: b15
||	stw.d1	s2,*sbuf++	;47: store running correlation
||	stw.d2	s0,*cbuf++	;47: store new correlation
||	mpyh.m1	rcor,rcor,cnd	;47: square cor

	;Clock 2
clk2:	ldw.d1	*ada++[2],s1a	;2: a0, clear s1a
||	ldw.d2	*adb++[-4],s2b	;2: b0, clear s2b
||	mpyh.m2	lowb,oneb,highb	;25: a18
||	add.s2	s0b,lowb,s0b	;25: a18
||	add.s1	s1a,dlta,s1a	;25: b18
||	add.l1	s2a,higha,s2a	;25: a16
||	add.l2	s2b,highb,s2b	;25: b16
||	mpy.m1	rcor,rcor,cnd	;48: square rcor

	;Clock 3
clk3:	ldw.d1	*ada++[2],s2a	;3: a1, clear s2a
||	ldw.d2	*adb++[8],lowb	;3: b1
||	mpyh.m1	lowa,onea,higha	;26: a19
||	mpyh.m2	lowb,oneb,highb	;26: b19
||	add.s1	s0a,lowa,s0a	;26: a19 (26: b19)
||	add.s2	s1b,highb,s1b	;26: b17 (26: a17)
||	cmplt.l1  maxsq,cnd,s2	;49: s2=max^2<cor^2 ? 1 : 0
				;.l2 unused

	;Clock 4
	ldw.d1	*ada++[2],lowa	;4: a2
||	mpyh.m1	lowa,onea,higha	;27: a20
||	mpyh.m2	lowb,oneb,highb	;27: b20
||	add.s1	s2a,lowa,s2a	;27: a20
||	add.s2	s2b,lowb,s2b	;27: b20
||	add.l2	s1b,highb,s1b	;27: a18 (27: b18)
||	ldw.d2	*cbuf,cor	;27: get old correlation
||[s2]	mv.l1x	cor,max		;50: s2 ? max=cor
	.endif

	;Clocks 5-15: assembled for both F=96 and F=192.
	;Besides continuing the data fetches for the current pair, these
	;packets carry the tail of the previous pair (the two-digit cycle
	;numbers in the line comments):
	;  29-32  combine the per-side partial sums into s0, s1, s2
	;  33-38  form the two correlations in s2 and s1 and walk "sum"
	;         by dlta (the dltb step follows at 42)
	;  52-57  conditional max / maxi update for both correlations
	;Clock 7 decrements ct by 2 (two correlations per pass).  Only the
	;F=96 build issues the loop branch here, at clock 15; the F=192 build
	;branches at clock 38' instead.
	;Clock 5
	ldw.d2	*adb++[2],lowb	;5: b3
||	mpyh.m1	lowa,onea,higha	;28: a21
||	mpyh.m2	dltb,oneb,highb	;28: b21
||	add.s1	s0a,lowa,s0a	;28: a21
||	add.s2	s2b,dltb,s2b	;28: b21
||	add.l2x	s1b,higha,s1b	;28: a19
||	add.l1x	s2a,highb,s2a	;28: b19
||	ldw.d1	*sbuf,rcor	;28: get old running correlation

	;Clock 6
	ldw.d1	*ada++[2],lowa	;6: a4
||	ldw.d2	*adb++[2],lowb	;6: b4
||	add.l1	s0a,higha,s0a	;29: a20
||	add.l2	s0b,highb,s0b	;29: b20
||	add.s1x	s2a,s2b,s2	;29: s2=s2a+s2b
||	add.s2x	s1b,s1a,s1	;29: s1=s1a+s1b
||[s2]	mpyhl.m2 oneb,adb,maxi	;52: s2 ? maxi=offset
||	mpylh.m1 rcor,onea,rcor	;52: clear top half of rcor

	;Clock 7
	ldw.d1	*ada++[2],lowa	;7: a5
||	ldw.d2	*adb++[2],lowb	;7: b5
||	mpyh.m1 s1a,onea,higha	;7: a0
||	mpyh.m2	s2b,oneb,s1b	;7: b0, clear s1b
||	add.s1	s0a,higha,s0a	;30: a21
||	add.s2	s0b,highb,s0b	;30: b21
||	sub.l2	ct,2,ct		;decrement loop counter
||	cmplt.l1 maxsq,cnd,cnd	;53: cnd=max^2<rcor^2 ? 1 : 0

	;Clock 8
	ldw.d1	*ada++[2],s0a	;8: a6, clear s0a
||	ldw.d2	*adb++[2],lowb	;8: b6
||	mpyh.m1	s2a,onea,higha	;8: a1
||	mpyh.m2	lowb,oneb,highb	;8: b1
||	mv.l2	lowb,s0b	;8: b1, clear s0b
||	add.s2x	s0b,s0a,s0	;31: s0=s0a+s0b
||	sub2.s1x dlta,dltb,dlta	;31: delta=yy-xx
||[cnd]	mv.l1	rcor,max	;54: cnd ? max=rcor
				;scrounge .l2 by changing s0b to s0 in clk 31

	;Clock 9
	ldw.d1	*ada++[2],lowa	;9: a7
||	ldw.d2	*adb++[2],lowb	;9: b7
||	mpyh.m1	lowa,onea,higha	;9: a2
||	add.s2x	s2b,lowa,s2b	;9: a2
||	add.s1	s2a,higha,s2a	;9: a0
||	add.l1x	s2,s0,s2	;32: s2=s0+s2
||	add.l2	s1,s0,s1	;32: s1=s0+s1
||[cnd] mpyhl.m2 oneb,adb,maxi	;55: cnd ? maxi=offset'

	;Clock 10
	ldw.d1	*ada++[2],lowa	;10: a8
||	ldw.d2	*adb++[4],lowb	;10: b8
||	mpyh.m2	lowb,oneb,highb	;10: b3
||	add.s2	s0b,lowb,s0b	;10: b3
||	add.l1	s1a,higha,s1a	;10: a1
||	add.l2	s1b,highb,s1b	;10: b1
||	add.s1	s2,s2,s2	;33: double s2
				;.m1 unused

	;Clock 11
	ldw.d1	*ada++[2],lowb	;11: a9 (other side!)
||	ldw.d2	*adb++[4],lowa	;11: b9 (other side!)
||	mpyh.m1	lowa,onea,higha	;11: a4
||	add.s1	s2a,lowa,s2a	;11: a4
||	add.l2	s1b,lowb,s1b	;11: b4
||	add.l1	s1a,higha,s1a	;11: a2
||	mpyh.m2x oneb,dlta,dltb	;34: split deltas
||[cnd]	addk.s2	-26,maxi	;57: cnd ? correct maxi for odd side

	;Clock 12
	ldw.d1	*ada++[2],lowa	;12: a10
||	ldw.d2	*adb++[-2],lowb	;12: b10
||	mpyh.m1	lowa,onea,higha	;12: a5
||	add.s1	s2a,lowa,s2a	;12: a5
||	add.s2	s1b,lowb,s1b	;12: b5
||	add.l2	s0b,highb,s0b	;12: b3
||	sub.l1x	s2,sum,s2	;35: s2=correlation
||	mpy.m2	s1,2,s1		;35: double s1

	;Clock 13
	ldw.d1	*ada++[6],lowa	;13: a11
||	ldw.d2	*adb++[4],lowb	;13: b11
||	mpyh.m1	s0a,onea,higha	;13: a6
||	mpyh.m2	lowb,oneb,highb	;13: b6
||	add.s2	s1b,lowb,s1b	;13: b6
||	add.l1	s1a,higha,s1a	;13: a4 (13: b4)
||	add.l2x	sum,dlta,sum	;36: walk sum over one place
||	clr.s1	s2,16,31,s2	;36: clear s2's top half

	;Clock 14
	ldw.d1	*ada++[2],lowa	;14: a12
||	ldw.d2	*adb++[2],lowb	;14: b12
||	mpyh.m1	lowa,onea,higha	;14: a7
||	mpyh.m2	lowb,oneb,highb	;14: b7
||	add.s1	s2a,lowa,s2a	;14: a7
||	add.s2	s1b,lowb,s1b	;14: b7
||	add.l1	s0a,higha,s0a	;14: a5, clear s0a (14: b5)
||	sub.l2	s1,sum,s1	;37: s1=correlation

	;Clock 15
	.if	F=96
	ldw.d1	*ada++[2],lowa	;15: a13
||	ldw.d2	*adb++[2],lowb	;15: b13
||	mpyh.m1	lowa,onea,higha	;15: a8
||	mpyh.m2	lowb,oneb,highb	;15: b8 (15:a8, 15: b8)
||	add.l1	s0a,higha,s0a	;15: a6
||	add.l2	s2b,highb,s2b	;15: b6
||	shl.s2	s1,16,s1	;38: shift s1 up to MSW
||[ct]	b.s1	loop
	.else
	ldw.d1	*ada++[2],lowa	;15: a13
||	ldw.d2	*adb++[2],lowb	;15: b13
||	mpyh.m1	lowa,onea,higha	;15: a8
||	mpyh.m2	lowb,oneb,highb	;15: b8 (15:a8, 15: b8)
||	add.l1	s0a,higha,s0a	;15: a6
||	add.l2	s2b,highb,s2b	;15: b6
||	shl.s2	s1,16,s1	;38: shift s1 up to MSW
	.endif

	;Clocks 16-20: assembled for both F=96 and F=192.
	;In the F=96 build these five packets are the delay slots of the
	;[ct] b.s1 loop issued at clock 15.  Tail work carried here: maxsq is
	;recomputed from max (39), the two new correlations are packed into
	;s0 (41) and added into rcor with add2 (42), and sum is walked by
	;dltb (42).
	;Clock 16
	ldw.d1	*ada++[2],lowa	;16: a14
||	ldw.d2	*adb++[2],lowb	;16: b14
||	mpyh.m2	oneb,lowb,highb	;16: a9 (other side!)
||	add.s2	s0b,lowb,s0b	;16: a9 (other side!)
||	add.s1	s1a,lowa,s1a	;16: b9 (other side!)
||	add.l1	s0a,higha,s0a	;16: a7
||	add.l2	s2b,highb,s2b	;16: b7
||	mpy.m1	max,max,maxsq	;39: square max

	;Clock 17
	ldw.d1	*ada++[2],lowa	;17: a15
||	ldw.d2	*adb++[2],lowb	;17: b15
||	mpyh.m1	lowa,onea,higha	;17: a10
||	add.s1	s0a,lowa,s0a	;17: a10
||	add.s2	s1b,lowb,s1b	;17: b10
||	add.l1	s2a,higha,s2a	;17: a8
||	add.l2	s2b,highb,s2b	;17: b8
				;.m2 unused

	;Clock 18
	ldw.d1	*ada++[4],lowa	;18: a16
||	ldw.d2	*adb++[4],lowb	;18: b16
||	mpyh.m1	lowa,onea,higha	;18: a11
||	mpyh.m2	lowb,oneb,highb	;18: b11
||	add.l1	s0a,lowa,s0a	;18: a11
||	add.s1x	s2a,lowb,s2a	;18: b11
||	add.l2	s0b,highb,s0b	;18: a9 (other side!) (18: b9)
||	add.s2x	s2,s1,s0	;41: stack s1,s2

	;Clock 19
	ldw.d1	*ada++[-2],lowa	;19: a17
||	ldw.d2	*adb++[6],lowb	;19: b17
||	mpyh.m1	lowa,onea,higha	;19: a12
||	mpyh.m2	lowb,oneb,highb	;19: b12 (19: a12)
||	add.s2	s1b,lowb,s1b	;19: b12
||	add.l1	s1a,higha,s1a	;19: a10 (19: b10)
||	add.l2	sum,dltb,sum	;42: walk sum over another place
||	add2.s1x  rcor,s0,rcor	;42: add new term to running correlation

	;Clock 20
	ldw.d1	*ada++[8],lowb	;20: a18, fetched into lowb, not lowa!
||	ldw.d2	*adb++[-4],dlta	;20: b18
||	mpyh.m1	lowa,onea,higha	;20: a13
||	mpyh.m2	lowb,oneb,highb	;20: b13
||	add.s1	s2a,lowa,s2a	;20: a13
||	add.s2	s2b,lowb,s2b	;20: b13
||	add.l1	s1a,higha,s1a	;20: a11
||	add.l2	s1b,highb,s1b	;20: b11

	.if F=192
;Extra loop packets needed only for the longer (F=192) sync pattern:
;clocks 21' through 43'.  They continue the same fetch / split / accumulate
;pattern into s0a/s0b, s1a/s1b and s2a/s2b.  The loop branch for this build
;is issued at clock 38'; the packets for clocks 39'-43' are its five delay
;slots, so control reaches "loop" (clock 44') right after clock 43'.
;The "46:" steps (sub2 at 38', shr at 40') are the running-correlation
;updates that the F=96 build performs together at its clock 23.
	;Clock 21'
	ldw.d1	*ada++[6],lowa	;21': a19
||	ldw.d2	*adb++[8],lowb	;21': b19
||	mpyh.m1	lowa,onea,higha	;21': a14
||	mpyh.m2	lowb,oneb,highb	;21': b14
||	add.s1	s0a,lowa,s0a	;21': a14
||	add.s2	s0b,lowb,s0b	;21': b14
||	add.l1	s2a,higha,s2a	;21': a12
||	add.l2	s2b,highb,s2b	;21': b12

	;Clock 22'
	ldw.d1	*ada++[-2],lowa	;22': a20
||	ldw.d2	*adb++[2],lowb	;22': b20
||	mpyh.m1	lowa,onea,higha	;22': a15
||	mpyh.m2	lowb,oneb,highb	;22': b15
||	add.s1	s2a,lowa,s2a	;22': a15
||	add.s2	s0b,lowb,s0b	;22': b15
||	add.l1	s0a,higha,s0a	;22': a13
||	add.l2	s1b,highb,s1b	;22': b13

	;Clock 23'
	ldw.d1	*ada++[4],lowa	;23': a21
||	ldw.d2	*adb++[2],lowb	;23': b21
||	mpyh.m1	lowa,onea,higha	;23': a16
||	mpyh.m2	lowb,oneb,highb	;23': b16
||	add.l1	s1a,higha,s1a	;23': a14
||	add.l2	s1b,highb,s1b	;23': b14

	;Clock 24'
	ldw.d1	*ada++[2],lowa	;24': a22
||	ldw.d2	*adb++[4],lowb	;24': b22
||	mpyh.m2	lowb,oneb,highb	;24': b17
||	add.s1	s1a,lowa,s1a	;24': a17
||	add.s2	s2b,lowb,s2b	;24': b17
||	add.l1	s0a,higha,s0a	;24': a15
||	add.l2	s0b,highb,s0b	;24': b15

	;Clock 25'
	ldw.d1	*ada++[2],lowa	;25': a23
||	ldw.d2	*adb++[2],lowb	;25': b23
||	mpyh.m1x lowb,onea,higha	;25': a18
||	add.s2	s0b,lowb,s0b	;25': a18
||	add.s1	s1a,dlta,s1a	;25': b18
||	add.l1	s2a,higha,s2a	;25': a16
||	add.l2	s2b,highb,s2b	;25': b16

	;Clock 26'
	ldw.d1	*ada++[4],lowa	;26': a24
||	ldw.d2	*adb++[2],lowb	;26': b24
||	mpyh.m1	lowa,onea,higha	;26': a19
||	mpyh.m2	lowb,oneb,highb	;26': b19
||	add.s1	s0a,lowa,s0a	;26': a19
||	add.l2	s1b,highb,s1b	;26': b17

	;Clock 27'
	ldw.d1	*ada++[2],lowa	;27': a25
||	ldw.d2	*adb++[2],lowb	;27': b25
||	mpyh.m1	lowa,onea,higha	;27': a20
||	mpyh.m2	lowb,oneb,highb	;27': b20
||	add.s1	s1a,lowa,s1a	;27': a20
||	add.s2	s0b,lowb,s0b	;27': b20
||	add.l2x	s1b,higha,s1b	;27': a18

	;Clock 28'
	ldw.d1	*ada++[4],lowa	;28': a26
||	ldw.d2	*adb++[2],lowb	;28': b26
||	mpyh.m1	lowa,onea,higha	;28': a21
||	mpyh.m2	lowb,oneb,highb	;28': b21
||	add.s1	s2a,lowa,s2a	;28': a21
||	add.s2	s1b,lowb,s1b	;28': b21
||	add.l1	s1a,higha,s1a	;28': a19
||	add.l2	s2b,highb,s2b	;28': b19

	;Clock 29'
	ldw.d1	*ada++[2],lowa	;29': a27
||	ldw.d2	*adb++[4],lowb	;29': b27
||	mpyh.m1	lowa,onea,higha	;29': a22
||	mpyh.m2	lowb,oneb,highb	;29': b22
||	add.s1	s1a,lowa,s1a	;29': a22
||	add.s2	s0b,lowb,s0b	;29': b22
||	add.l2x	s2b,higha,s2b	;29': a20
||	add.l1x	s0a,highb,s0a	;29': b20

	;Clock 30'
	ldw.d1	*ada++[4],lowa	;30': a28
||	ldw.d2	*adb++[-2],lowb	;30': b28
||	mpyh.m1	lowa,onea,higha	;30': a23
||	mpyh.m2	lowb,oneb,highb	;30': b23
||	add.s1	s1a,lowa,s1a	;30': a23
||	add.s2	s2b,lowb,s2b	;30': b23
||	add.l2x	s1b,higha,s1b	;30': a21
||	add.l1x	s2a,highb,s2a	;30': b21

	;Clock 31'
	ldw.d1	*ada++[4],lowa	;31': a29
||	ldw.d2	*adb++[4],lowb	;31': b29
||	mpyh.m2	lowb,oneb,highb	;31': b24
||	add.s1	s1a,lowa,s1a	;31': a24
||	add.s2	s2b,lowb,s2b	;31': b24
||	add.l1	s2a,higha,s2a	;31': a22
||	add.l2	s0b,highb,s0b	;31': b22

	;Clock 32'
	ldw.d1	*ada++[2],lowa	;32': a30
||	ldw.d2	*adb++[4],lowb	;32': b30
||	mpyh.m2	lowb,oneb,highb	;32': b25
||	add.s1	s1a,lowa,s1a	;32': a25
||	add.s2	s2b,lowb,s2b	;32': b25
||	add.l1	s2a,higha,s2a	;32': a23
||	add.l2	s0b,highb,s0b	;32': b23

	;Clock 33'
	ldw.d1	*ada++[2],lowa	;33': a31
||	ldw.d2	*adb++[2],lowb	;33': b31
||	mpyh.m1	lowa,onea,higha	;33': a26
||	mpyh.m2	lowb,oneb,highb	;33': b26
||	add.s1	s0a,lowa,s0a	;33': a26
||	add.s2	s2b,lowb,s2b	;33': b26
||	add.l2	s0b,highb,s0b	;33': b24

	;Clock 34'
	ldw.d1	*ada++[2],lowa	;34': a32
||	ldw.d2	*adb++[2],lowb	;34': b32
||	mpyh.m1	lowa,onea,higha	;34': a27
||	mpyh.m2	lowb,oneb,highb	;34': b27
||	add.s2	s0b,lowb,s0b	;34': b27
||	add.l1x	s1a,highb,s1a	;34': b25

	;Clock 35'
	ldw.d1	*ada++[2],lowa	;35': a33
||	ldw.d2	*adb++[4],lowb	;35': b33
||	mpyh.m1	lowa,onea,higha	;35': a28
||	mpyh.m2	lowb,oneb,highb	;35': b28
||	add.s2	s2b,lowb,s2b	;35': b28
||	add.l1	s1a,higha,s1a	;35': a26
||	add.l2	s1b,highb,s1b	;35': b26

	;Clock 36'
	ldw.d1	*ada++[2],lowa	;36': a34
||	ldw.d2	*adb++[2],lowb	;36': b34
||	mpyh.m1	lowa,onea,higha	;36': a29
||	add.s1	s0a,lowa,s0a	;36': a29
||	add.s2	s1b,lowb,s1b	;36': b29
||	add.l2x	s2b,higha,s2b	;36': a27
||	add.l1x	s1a,highb,s1a	;36': b27

	;Clock 37'
	ldw.d1	*ada++[2],lowa	;37': a35
||	ldw.d2	*adb++[4],lowb	;37': b35
||	mpyh.m2	lowb,oneb,highb	;37': b30
||	add.s1	s1a,lowa,s1a	;37': a30
||	add.s2	s1b,lowb,s1b	;37': b30
||	add.l1	s2a,higha,s2a	;37': a28
||	add.l2	s0b,highb,s0b	;37': b28

	;Clock 38'
	ldw.d1	*ada++[4],lowa	;38': a36
||	ldw.d2	*adb++[-2],lowb	;38': b36
||	mpyh.m1	lowa,onea,higha	;38': a31
||	mpyh.m2	lowb,oneb,highb	;38': b31
||	add.l2	s2b,lowb,s2b	;38': b31
||	add.l1	s0a,higha,s0a	;38': a29
||	sub2.s1	rcor,cor,s2	;46: sub old term from running correlation
||[ct]	b.s2	loop

	;Clock 39'
	ldw.d1	*ada++[-2],lowa	;39': a37
||	ldw.d2	*adb++[4],lowb	;39': b37
||	mpyh.m2	lowb,oneb,highb	;39': b32
||	add.s1	s1a,lowa,s1a	;39': a32
||	add.s2	s1b,lowb,s1b	;39': b32
||	add.l2	s2b,highb,s2b	;39': b30

	;Clock 40'
	ldw.d2	*adb++[2],lowb	;40': b38
||	mpyh.m1	lowa,onea,higha	;40': a33
||	mpyh.m2	lowb,oneb,highb	;40': b33
||	add.s1	s0a,lowa,s0a	;40': a33
||	add.l1	s2a,higha,s2a	;40': a31
||	add.l2	s1b,highb,s1b	;40': b31
||	shr.s2x	rcor,16,cor	;46: unstack running correlation

	;Clock 41'
	ldw.d1	*ada++[4],lowb	;41': a39
||	ldw.d2	*adb++[-20],dlta	;41': b39
||	mpyh.m1	lowa,onea,higha	;41': a34
||	mpyh.m2	lowb,oneb,highb	;41': b34
||	add.s2	s0b,lowb,s0b	;41': b34
||	add.l1x	s2a,highb,s2a	;41': b32

	;Clock 42'
	ldw.d1	*ada++[-20],lowa	;42': a40
||	ldw.d2	*adb++[-26],lowb	;42': b40
||	mpyh.m1	lowa,onea,higha	;42': a35
||	mpyh.m2	lowb,oneb,highb	;42': b35
||	add.s1	s2a,lowa,s2a	;42': a35
||	add.s2	s0b,lowb,s0b	;42': b35
||	add.l1	s1a,higha,s1a	;42': a33
||	add.l2	s2b,highb,s2b	;42': b33

	;Clock 43' (21)
	ldw.d1	*ada++[-28],lowa	;43': a41
||	ldw.d2	*adb++[-28],lowb	;43': b41
||	mpyh.m1	lowa,onea,higha	;43': a36
||	mpyh.m2	lowb,oneb,highb	;43': b36
||	add.s1	s0a,lowa,s0a	;43': a36
||	add.s2	s0b,lowb,s0b	;43': b36
||	add.l1	s2a,higha,s2a	;43': a34
||	add.l2	s1b,highb,s1b	;43': b34

	.endif

;branch to loop occurs here.

;Spin down pipeline
;Executed once the loop branch is not taken (ct reached 0).  These packets
;repeat the first four packets of the loop head with every ldw removed, so
;the last pair of correlations is finished and stored without fetching any
;further input or moving ada/adb.  The rcor-squaring multiplies present in
;the loop-head packets are also omitted here; the "Finish max computation"
;code below performs them instead.

	.if	F=96

	;Clock 21
	mpyh.m1	lowa,onea,higha	;21: a14
||	mpyh.m2	lowb,oneb,highb	;21: b14
||	add.s1	s0a,lowa,s0a	;21: a14
||	add.s2	s0b,lowb,s0b	;21: b14
||	add.l1	s2a,higha,s2a	;21: a12
||	add.l2	s2b,highb,s2b	;21: b12

	;Clock 22
	mpyh.m1	lowa,onea,higha	;22: a15
||	mpyh.m2	lowb,oneb,highb	;22: b15
||	add.s1	s2a,lowa,s2a	;22: a15
||	add.s2	s0b,lowb,s0b	;22: b15
||	add.l1	s0a,higha,s0a	;22: a13
||	add.l2	s1b,highb,s1b	;22: b13

	;Clock 23
	mpyh.m1	lowa,onea,higha	;23: a16
||	mpyh.m2	lowb,oneb,highb	;23: b16 (23: a16, 23: b16)
||	add.l1	s1a,higha,s1a	;23: a14
||	add.l2	s1b,highb,s1b	;23: b14
||	sub2.s1	rcor,cor,s2	;46: sub old term from running correlation
||	shr.s2x	rcor,16,cor	;46: unstack running correlation

	;Clock 24
	mpyh.m2	lowb,oneb,highb	;24: b17
||	add.s1	s1a,lowa,s1a	;24: a17
||	add.s2	s2b,lowb,s2b	;24: b17
||	add.l1	s0a,higha,s0a	;24: a15
||	add.l2	s0b,highb,s0b	;24: b15
||	stw.d1	s2,*sbuf++	;47: store running correlation
||	stw.d2	s0,*cbuf++	;47: store new correlation

	;Clock 2
	mpyh.m2	lowb,oneb,highb	;25: a18
||	add.s2	s0b,lowb,s0b	;25: a18
||	add.s1	s1a,dlta,s1a	;25: b18
||	add.l1	s2a,higha,s2a	;25: a16
||	add.l2	s2b,highb,s2b	;25: b16

	.endif

	.if	F=192

	;Clock 44' (22)
	mpyh.m1	lowa,onea,higha	;44': a37
||	mpyh.m2	lowb,oneb,highb	;44': b37
||	add.s2	s1b,lowb,s1b	;44': b37
||	add.l1	s0a,higha,s0a	;44': a35
||	add.l2	s0b,highb,s0b	;44': b35

	;Clock 45' (23)
	mpyh.m2	lowb,oneb,highb	;45': b38
||	add.s2	s0b,lowb,s0b	;45': b38
||	add.l1	s0a,higha,s0a	;45': a36
||	add.l2	s1b,highb,s1b	;45': b36

	;Clock 46' (24)
	mpyh.m2	lowb,oneb,highb	;46': a39
||	add.s2	s0b,lowb,s0b	;46': a39
||	add.l1	s2a,higha,s2a	;46': a37
||	add.l2	s2b,highb,s2b	;46': b37
||	stw.d1	s2,*sbuf++	;47: store running correlation
||	stw.d2	s0,*cbuf++	;47: store new correlation

	;Clock 2' (2)
	mpyh.m2	lowb,oneb,highb	;47': b40
||	add.s1	s1a,lowa,s1a	;47': a40
||	add.s2	s2b,lowb,s2b	;47': b40
||	add.l2	s0b,highb,s0b	;47': b38

	.endif

;Take as long as we like here.
;Fragment bookkeeping after one pass of the inner loop.
;cnd = low half of onea (the outer loop counter), sign-extended.
; cnd != 0: more fragments remain in this block.  Rewind sbuf/cbuf by A*2
;           bytes, decrement the counter and branch to dma.
; cnd == 0: this was the last fragment.  Rewind by LASTA*2 bytes, reload
;           the counter with (B-LASTA)/A and fall through to the max report.
;The addk/sub on onea and the nop 3 occupy the five delay slots of the
;branch, so the counter update completes on both paths.
	tstamp
	ext	onea,16,16,cnd	;extract outer loop counter
[cnd]	addk	-A*2,sbuf	;Back up sbuf and cbuf to where they started
[cnd]	addk	-A*2,cbuf
[!cnd]	addk	-LASTA*2,sbuf
[!cnd]	addk	-LASTA*2,cbuf
[cnd]	b	dma		;not done with block.  Just refresh buffers.
[!cnd]	addk	(B-LASTA)/A,onea	;decrement counter circularly
[cnd]	sub	onea,1,onea
	nop	3

;Finish max computation
;Reached only at the end of a whole block (fall-through from the branch
;above).  This is a serial, non-pipelined replay of steps 47-55 of the loop
;for the final pair of correlations: the high half of rcor is squared and
;compared first (with cor as the candidate value), then the low half (with
;rcor as the candidate).  Instruction spacing respects the one-cycle
;multiply delay slot (e.g. the nop before each addk lets mpyhl write maxi).
;mpyhl oneb,adb,maxi multiplies the high half of oneb (1) by the low half
;of adb, i.e. it copies adb's low 16 bits into maxi.
	mpyh.m1	rcor,rcor,cnd	;47: square cor
	mpy.m1	rcor,rcor,cnd	;48: square rcor
	cmplt.l1  maxsq,cnd,s2	;49: s2=max^2<cor^2 ? 1 : 0
[s2]	mv.l1x	cor,max		;50: s2 ? max=cor
[s2]	mpyhl.m2 oneb,adb,maxi	;52: s2 ? maxi=offset
	nop
[s2]	addk	CONST2,maxi	;correction since adb hasn't been updated
||	mpylh.m1 rcor,onea,rcor	;52: clear top half of rcor
	cmplt.l1 maxsq,cnd,cnd	;53: cnd=max^2<rcor^2 ? 1 : 0
[cnd]	mv.l1	rcor,max	;54: cnd ? max=rcor
[cnd]	mpyhl.m2 oneb,adb,maxi	;55: cnd ? maxi=offset'
	nop			;56:
[cnd]	addk.s2	CONST3,maxi	;correction since adb hasn't been updated

;Normalize maxi.  A value above -0x100 is treated as "new" (a raw ybuf
;offset captured during this fragment) and is converted to an offset
;relative to the current adb, biased by -0x100.  A value at or below -0x100
;is an already-converted offset from an earlier fragment and is simply
;moved back by one fragment (A*2 bytes).
	mvk	-0x100,cnd	;See if maxi is new.
	cmpgt	maxi,cnd,cnd	;If new, -26<=maxi<3fff; if old, maxi<-0x100.
[!cnd]	addk	-A*2,maxi	;No.  Now maxi<-0x100-2*A-4
[cnd]	sub	maxi,adb,maxi	;Yes, new. Now -CONST1-2*A-4<=maxi<-CONST1
[cnd]	addk	CONST1,maxi	;now -0x2000<=-2*A-4<=maxi<0
[cnd]	ext	maxi,18,18,maxi	;Preserve 14 bits of circular addressing
[cnd]	addk	-0x100,maxi	;now maxi<-0x100

;Compute offset to report to master
;Converts the normalized maxi into an address (adds back 0x100+2*B, then
;ofs), stores it and max at gsync and gsync+4, and writes -1 to VINTA to
;interrupt the Control DSP.  dmav and cnd are scratch here.
;	stw	max,*mark++	;just for fun
	mvk	0x100+2*B,dmav
	mvkh	0x100+2*B,dmav
	add	maxi,dmav,maxi	;now 0<=maxi<2*B
	add	maxi,ofs,maxi	;add offset to true address
;NOTE(review): the wrap test below uses cmpeq, so it only fires when the
;address equals gybufe exactly; addresses beyond gybufe are not wrapped
;here.  Confirm that ofs is kept in range elsewhere so this is sufficient.
	mvk	gybufe,cnd	;wrap around gybuf if necessary
	mvkh	gybufe,cnd
	cmpeq	maxi,cnd,cnd
[cnd]	mvk	gybuf-gybufe,dmav
[cnd]	mvkh	gybuf-gybufe,dmav
[cnd]	add	maxi,dmav,maxi
;	stw	maxi,*mark++	;store final result here too for convenience
	mvk	gsync,cnd	;store final result!
	mvkh	gsync,cnd
	stw	maxi,*cnd
	stw	max,*cnd[1]	;Store max for Control DSP.
	mvk	VINTA,cnd	;Interrupt Control DSP to announce that
	mvkh	VINTA,cnd	;we've got an answer.
;maxi is reused as the -1 value written to VINTA, so from here on it no
;longer holds an offset (it reads as "new" in the test at label dma).
	mvk	-1,maxi
	stw	maxi,*cnd

;Search for a new max
	zero	max
	zero	maxsq
	zero	cor
	zero	rcor
;This is a plain add, not a load/store address update, so the circular
;addressing mode configured for B6 does not apply to it.
	mvk	B*2,dmav
	mvkh	B*2,dmav
	add	ofs,dmav,ofs	;future max's will assume a bigger offset.

;Wait for B values to arrive in gybuf
;Same polling sequence as iwait: spin until IFR bit 4 is set, then
;acknowledge it by writing the 0x10 mask to ICR and to VINTB.
;ct and dmav are used as scratch here.
wait:	mvc	ifr,ct		;What interrupts are pending?
	extu	ct,27,31,ct	;mask for inter-DSP interrupt
	shl	ct,4,ct
[!ct]	b	wait
	nop	5
	mvc	ct,icr		;Found.  Now clear it at the CPU
	mvk	VINTB,dmav	;and in the Monaco registers
	mvkh	VINTB,dmav
	stw	ct,*dmav

;Same normalization of maxi as is done before reporting: afterwards any
;recorded peak position lies below -0x100.
;cnd = (maxi > -0x100) selects between re-basing a freshly captured position
;against adb (keeping 14 bits) and moving an older one back by another 2*A.
dma:	mvk	-0x100,cnd	;See if maxi is new.
	cmpgt	maxi,cnd,cnd	;If new, -26<=maxi<3fff; if old, maxi<-0x100.
[!cnd]	addk	-A*2,maxi	;No.  Now maxi<-0x100-2*A-4
[cnd]	sub	maxi,adb,maxi	;Yes, new. Now -CONST1-2*A-4<=maxi<-CONST1-4
[cnd]	addk	CONST1,maxi	;now -0x2000<=-2*A<=maxi<-4
[cnd]	ext	maxi,18,18,maxi	;Preserve 14 bits of circular addressing
[cnd]	addk	-0x100,maxi	;now maxi<-0x100-4

;Point dmas at the DMA register block, then spin until both DMA0 and DMA2
;report idle before their registers are reprogrammed below.
	mvk	DMA,dmas
	mvkh	DMA,dmas
	.if	0	;Timing dump of the three DMA count registers; assembled out while this is 0
	ldw	*dmas[DMA0+DMAcnt],dmav
	nop	4
	stw	dmav,*mark++
	ldw	*dmas[DMA1+DMAcnt],dmav
	nop	4
	stw	dmav,*mark++
	ldw	*dmas[DMA2+DMAcnt],dmav
	nop	4
	stw	dmav,*mark++
	.endif
	tstamp			;macro defined elsewhere in the file
;The two loads issue one cycle apart; nop 4 covers the delay slots so both
;results are available to the or.
dmalp:	ldw	*dmas[DMA0+DMApri],cnd	;wait for DMA0 and DMA2 to finish
	ldw	*dmas[DMA2+DMApri],dmav
	nop	4
	or	dmav,cnd,cnd	;combine both primary control words
	and	0xc,cnd,cnd	;keep bits 2-3 (presumably the STATUS field -- confirm)
[cnd]	b	dmalp		;nonzero in either channel: poll again
	nop	5		;branch delay slots
	tstamp

;Sum-buffer exchange, assembled only when B>A+LASTA.
;DMA1 writes the A sums just computed (at sbuf) out to the external sum
;buffer; DMA2 then reads A values back into the same internal area.  The
;external position is kept in the DMAadA register and advances by 2*A per
;pass, wrapping to sumbufx once it reaches sumbufxe.
	.if	B>A+LASTA
;Write A values from sumbuf with DMA1, then
;read A values from sumbuf with DMA2 when DMA1 is done
	stw	sbuf,*dmas[DMA1+DMAsrc]	;source1
	stw	sbuf,*dmas[DMA2+DMAdst]	;destination2
	ldw	*dmas[DMAadA],dmav	;retrieve sumbufx ptr from secret reg
	mvk	sumbufxe,dmac		;(these four cycles fill the load delay slots)
	mvkh	sumbufxe,dmac
	nop	2
	stw	dmav,*dmas[DMA1+DMAdst]	;destination 1
	addk	A*2,dmav		;advance to the next slot
	cmpgt	dmac,dmav,cnd		;cnd = still below sumbufxe
[!cnd]	mvk	sumbufx,dmav		;otherwise wrap to the start
[!cnd]	mvkh	sumbufx,dmav
	stw	dmav,*dmas[DMAadA]	;update secret reg
	addk	A*2,dmav		;source 2
	cmpgt	dmac,dmav,cnd		;same wrap test for the read address
[!cnd]	mvk	sumbufx,dmav
[!cnd]	mvkh	sumbufx,dmav
	stw	dmav,*dmas[DMA2+DMAsrc]
	mvk	A/2,dmav		;count1, count2
	stw	dmav,*dmas[DMA1+DMAcnt]
	stw	dmav,*dmas[DMA2+DMAcnt]
	mvk	0x00000080,dmav
	stw	dmav,*dmas[DMA1+DMAsec]	;DMA1 interrupts when done
	mvk	0x03000051,dmav
	mvkh	0x03000051,dmav
	stw	dmav,*dmas[DMA1+DMApri]	;start DMA1
	mvk	0x05024051,dmav		;DMA2 starts after DMA1 finishes
	mvkh	0x05024051,dmav
	stw	dmav,*dmas[DMA2+DMApri]	;enable DMA2

;Both channels are reused for the correlation buffer next, so wait here.
dmalp2:	ldw	*dmas[DMA2+DMApri],cnd	;wait for DMA2 to finish
	nop	4			;load delay slots
	and	0xc,cnd,cnd		;keep bits 2-3; nonzero means poll again
[cnd]	b	dmalp2
	nop	5			;branch delay slots
	tstamp
	.endif

;Correlation-buffer exchange, same scheme as for the sums: DMA1 writes the A
;values at cbuf out to the external correlation buffer, DMA2 then reads A
;values back into the same internal area.  The external position lives in
;the DMAadD register, advances by 2*A per pass and wraps from corbufxe back
;to corbufx.  This code does not wait for completion.
;Write A values from corbuf with DMA1
;Read A values for corbuf with DMA2 when DMA1 is done
	stw	cbuf,*dmas[DMA1+DMAsrc]	;source1
	stw	cbuf,*dmas[DMA2+DMAdst]	;destination2
	ldw	*dmas[DMAadD],dmav	;retrieve corbufx ptr from secret reg
	mvk	corbufxe,dmac		;(these four cycles fill the load delay slots)
	mvkh	corbufxe,dmac
	nop	2
	stw	dmav,*dmas[DMA1+DMAdst]	;destination1
	addk	A*2,dmav		;advance to the next slot
	cmpgt	dmac,dmav,cnd		;cnd = still below corbufxe
[!cnd]	mvk	corbufx,dmav		;otherwise wrap to the start
[!cnd]	mvkh	corbufx,dmav
	stw	dmav,*dmas[DMAadD]	;update secret reg
	addk	A*2,dmav		;source2
	cmpgt	dmac,dmav,cnd		;same wrap test for the read address
[!cnd]	mvk	corbufx,dmav
[!cnd]	mvkh	corbufx,dmav
	stw	dmav,*dmas[DMA2+DMAsrc]
	mvk	A/2,dmav		;count1, count2
	stw	dmav,*dmas[DMA1+DMAcnt]
	stw	dmav,*dmas[DMA2+DMAcnt]
	mvk	0x00000080,dmav
	stw	dmav,*dmas[DMA1+DMAsec]	;DMA1 interrupts when done
	mvk	0x03000051,dmav
	mvkh	0x03000051,dmav
	stw	dmav,*dmas[DMA1+DMApri]	;start DMA1
	mvk	0x05024051,dmav		;DMA2 starts after DMA1 finishes
	mvkh	0x05024051,dmav
	stw	dmav,*dmas[DMA2+DMApri]	;enable DMA2

;Read A (or LASTA) values for ybuf with DMA0
;Steps:
;  1. pick the transfer length ct (A, or LASTA when the loop counter minus 1
;     is zero);
;  2. wrap the global source pointer from gybufe back to gybuf, adjusting ofs
;     by the same amount;
;  3. repair the destination pointer if its high bit is clear;
;  4. program DMA0 for one of three cases:
;       wrap0  - destination is exactly ybufe: restart at ybuf, single transfer;
;       wrap1  - the whole transfer fits before ybufe: single transfer;
;       fall-through - transfer straddles ybufe: two frames, the second
;                reached through index register A.
	ext	onea,16,16,cnd	;extract outer loop counter again
	sub	cnd,1,cnd	;cnd=0 ? fetch LASTA values : fetch A values
[cnd]	mvk	A,ct		;Number of correlations to compute
[!cnd]	mvk	LASTA,ct	;Maybe fewer correlations last time.

	ldw	*dmas[DMA0+DMAsrc],dmav	;wrap around source pointer if needed
	mvk	gybufe,cnd		;(these four cycles fill the load delay slots)
	mvkh	gybufe,cnd
	nop	2
	cmpeq	dmav,cnd,cnd
[cnd]	mvk	gybuf,dmav
[cnd]	mvkh	gybuf,dmav
[cnd]	stw	dmav,*dmas[DMA0+DMAsrc]
[cnd]	mvk	gybuf-gybufe,dmav	;future max's will assume the wrapped
[cnd]	mvkh	gybuf-gybufe,dmav	;(smaller) offset.
[cnd]	add	ofs,dmav,ofs
	ldw	*dmas[DMA0+DMAdst],dmav	;if 2 frames last time, must fix dst
	nop	4
	shr	dmav,31,cnd		;if high bit clear, no good
[!cnd]	addk	ybufe-ybuf,dmav		;undo erroneous end-of-frame indexing
[!cnd]	stw	dmav,*dmas[DMA0+DMAdst]	;store fixed version
	mvk	ybufe,cnd		;are we exactly at the end of ybuf?
	mvkh	ybufe,cnd
	cmpeq	cnd,dmav,cnd
[cnd]	b	wrap0			;wrap around, no fractional part first
	nop	5
	mvk	ybufe,cnd		;are at least A words left in ybuf?
	mvkh	ybufe,cnd
	sub	cnd,ct,cnd		;ct is in shorts, so subtract it twice
	sub	cnd,ct,cnd
	cmplt	cnd,dmav,cnd		;if cnd=0, we don't wrap around
[!cnd]	b	wrap1
	nop	5
;Straddling case: first frame runs up to ybufe, second frame takes the rest.
	mvk	ybufe,cnd		;move two partial blocks
	mvkh	ybufe,cnd
	sub	cnd,dmav,dmav		;count for first (in bytes)
	shr	dmav,2,dmav		;convert to words
	mvklh	2,dmav			;two frames
	stw	dmav,*dmas[DMA0+DMAcnt]
	shr	ct,1,cnd		;count for second
	sub	cnd,dmav,dmav		;only the low half matters; high half is cleared next
	mvkh	0,dmav			;clear high half (one frame)
	stw	dmav,*dmas[DMActA]
	mvk	4,dmav			;index reg gives offset to second dest
	mvklh	ybuf-ybufe+4,dmav
	stw	dmav,*dmas[DMAixA]
	mvk	0x010000d1,dmav		;2-frame DMA, ixA for dest
	mvkh	0x010000d1,dmav
	b	wrap2			;done.
	nop	5
wrap0:	mvk	ybuf,cnd		;destination
	mvkh	ybuf,cnd
	stw	cnd,*dmas[DMA0+DMAdst]
wrap1:	shr	ct,1,dmav		;count
	stw	dmav,*dmas[DMA0+DMAcnt]
	mvk	0x01000051,dmav		;basic DMA
	mvkh	0x01000051,dmav
wrap2:	stw	dmav,*dmas[DMA0+DMApri]	;start DMA
;Bizarre fact: if the above DMA is not given priority, the system occasionally
;deadlocks, apparently due to a bug internal to the '6201 DMA priority logic.
;Channel 0 finishes except for the last write to DMEM, and channel 1 fills
;the DMA FIFO and then stalls.  Apparently this also ties up the DMEM bus,
;since any subsequent HPI access to DMEM fails (the A24 VME chip times out?),
;though all other memory and peripherals are accessible.

;Reset count, and ping-pong cbuf and sbuf to other buffer half
;The swap is done arithmetically: with cnd = base+A, the sub/add pair leaves
;2*(base+A) - ptr in the pointer, which exchanges base and base+2A.
	mvk	corbuf+A,cnd	;cbuf=(cbuf==corbuf+2A)?corbuf:corbuf+2A
	mvkh	corbuf+A,cnd
	sub	cnd,cbuf,cbuf
	add	cnd,cbuf,cbuf
	.if	B>A		;if B>A, ping-pong sbuf to other half
	mvk	sumbuf+A,cnd	;sbuf=(sbuf==sumbuf+2A)?sumbuf:sumbuf+2A
	mvkh	sumbuf+A,cnd
	sub	cnd,sbuf,sbuf
	add	cnd,sbuf,sbuf
	.else
	mvk	sumbuf,sbuf	;if B=A, we use the same data immediately
	mvkh	sumbuf,sbuf	;sbuf=sumbuf
	.endif
;Reload the inner-loop count.  Unlike the DMA0 set-up, the loop counter is
;tested here without subtracting 1 first.
	ext	onea,16,16,cnd	;extract outer loop counter again
[cnd]	mvk	A,ct		;Number of correlations to compute
[!cnd]	mvk	LASTA,ct	;Maybe fewer correlations last time.

	tstamp
	b	clk3		;the execute packets that follow fill the branch delay slots

;Spin up pipeline again
;The instructions below sit between the "b clk3" above and the point where
;that branch takes effect.  Each variant (F=96, F=192) supplies five execute
;packets: the last loads of the previous correlation, the squaring of the two
;halves of rcor, and the first load pair (clock 2) of the next correlation.
;Lines joined with || issue in the same cycle.  Do not insert or remove
;packets here without re-counting the delay slots.

	.if	F=96

	;Clock 21
	ldw.d1	*ada++[2],lowa	;21: a19
||	ldw.d2	*adb++[-22],lowb	;21: b19

	;Clock 22
	ldw.d1	*ada++[-24],lowa	;22: a20
||	ldw.d2	*adb++[-22],lowb	;22: b20

	;Clock 23
	ldw.d1	*ada++[-21],lowa	;23: a21
||	ldw.d2	*adb++[7],dltb	;23: b21

	;Clock 24
	mpyh.m1	rcor,rcor,cnd	;47: square cor

	;Clock 2
	ldw.d1	*ada++[2],s1a	;2: a0
||	ldw.d2	*adb++[-4],s2b	;2: b0
||	mpy.m1	rcor,rcor,cnd	;48: square rcor

	.endif

	.if	F=192

	;Clock 43' (21)
	nop

	;Clock 44' (22)
	ldw.d1	*ada++[-24],lowa	;44': a42
||	ldw.d2	*adb++[-22],lowb	;44': b42

	;Clock 45' (23)
	ldw.d1	*ada++[-21],lowa	;45': a43
||	ldw.d2	*adb++[7],dltb	;45': b43

	;Clock 46' (24)
	mpyh.m1	rcor,rcor,cnd	;47: square cor

	;Clock 2' (2)
	ldw.d1	*ada++[2],s1a	;2': a0, clear s1a
||	ldw.d2	*adb++[-4],s2b	;2': b0, clear s2b
||	mpy.m1	rcor,rcor,cnd	;48: square rcor

	.endif

; branch to clock 3 occurs here.