summaryrefslogtreecommitdiffstats
path: root/swpll.v
blob: 02e238b7fa7b04709cac31e9f698f717f7eba583 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
//
// swpll.v
//
// Numerically controlled oscillator and PLL. This takes the
// 15.625 kHz (64 µs) hsync strobes and converts them to a 72 MHz clock
// derived from a 360 MHz master clock. The PLL multiplies by 4608;
// the MSB of the NCO counter is then used to create a (very close to)
// 50:50 clock.
//
// The master clock is 360 MHz. This frequency in Cyclone IV E-C8 is
// nontrivial, so it is important to optimize critical paths
// where possible; non-critical paths are made multicycle and marked
// as such in the .sdc file.
//
// The sometimes funny bit indices are to flag which fixed-point
// bits represent fractions. Phase is in units of the 72 MHz output
// clock.
//

// The nominal input value to the NCO corresponds to a filter output
// of 2^20/5 ~ 0x33333. The loop filter output does not include the
// top two fraction bits of the 20-bit NCO counter (they are always
// zero in the NCO input), so only 18 bits here, of which 4 bits
// are fractional bits of the phase counter output.
`define NCO_NOMINAL 18'h33333

//
// The NCO counter is easily the most demanding data path in this
// module. It relies on the filter module to stage its output in
// suitable chunks, one chunk consumed per cycle (lest we end up
// with a mix of old and new bits), so specify the number of bits
// per NCO chunk here.
//
`define NCO_STAGE_BITS 3

function integer imin(integer a, integer b);
  // Smaller of the two arguments (b when they are equal)
  begin
     if (b > a)
       imin = a;
     else
       imin = b;
  end
endfunction // imin

function integer imax(integer a, integer b);
  // Larger of the two arguments (a when they are equal)
  begin
     imax = b;
     if (a >= b)
       imax = a;
  end
endfunction // imax

function integer stages(integer total, integer perstage);
   // Number of perstage-wide chunks needed to cover total bits,
   // i.e. total/perstage rounded up.
   integer padded;
   begin
      padded = total + (perstage - 1);
      stages = padded / perstage;
   end
endfunction // stages

// This defines the bit range associated with a stage: a vector with
// bounds [ma:mi] is cut into chunks of b bits starting from the least
// significant end, and stage i gets [(i+1)*b+mi-1 : i*b+mi], with the
// upper bound clipped to ma for the topmost (possibly narrower) chunk.
// The syntax is [`R(iter,bits,max,min)]
`define R(i,b,ma,mi) (imin((((i)+1)*(b)+(mi)-1),(ma))):((i)*(b)+(mi))

//
// Loop low pass filter. Run once per hsync strobe. The clk_en should
// be the appropriately shifted version of the hsync strobe so that
// the input value is suitably stable.
//
// The loop filter contains slow combinatorial logic, however, its
// latency is not at all critical: it is perfectly fine for it to
// take 20,000+ cycles if need be.
//
// Currently this is simply a first-order filter using a decaying
// average mechanism.
//
module swpll_loop_filter (
			  input rst_n,			// Asynchronous reset, active low
			  input clk,			// 360 MHz master clock
			  input clk_en,			// Clock enable
			  input signed [14:0] p1,	// Input phase
			  output reg [17:0] p2,		// Output filtered value
			  output rdy_stb		// NOTE(review): no driver in this module
			  );

   reg signed [17:0]  acc;			// Filter state (decaying average)
   wire signed [17:0] p1_ext = { p1, 3'b0 };	// p1 scaled up by 8

   // The number of clock cycles to spread the setting of p2 out over
   parameter p2_stages = stages(18,`NCO_STAGE_BITS);

   // Delayed clock enables. Taps 0..p2_stages-1 latch one chunk of p2
   // each; the final tap, p2_stages, updates the accumulator.
   (* ramstyle = "logic" *)
     reg [p2_stages:0]	clk_en_q;

   // The higher this parameter is, the more narrow the bandwidth
   parameter p1_shift = 4;
   // This parameter is meant to force the accumulator to be biased
   // toward zero, which can help prevent false lock. 0 means no bias.
   // NOTE(review): it is not referenced anywhere in this module.
   parameter acc_shift = 8;

   // This is heavily combinatorial, but it has a very long time to
   // settle.  The value of p2 is proportional to the output
   // frequency, and goes lower as acc increases, because acc is the
   // filtered value of how far ahead we are.
   //
   // The expression below equals 18'h33FFF - (acc >>> 3): the top
   // three bits are fixed at 3'b110, so the result stays within
   // 18'h30000..18'h37FFF, i.e. 3/16 to 7/32 of the master clock
   // frequency when used as a 20-bit NCO increment. Going outside
   // that range makes false lock a possibility.
   //wire [17:0]	next_p2 =  `NCO_NOMINAL - (acc >>> 3);
   wire [17:0]	next_p2 = { 3'b110, acc[17], ~acc[16:3] };

   always @(posedge clk or negedge rst_n)
     if (~rst_n)
       begin
	  acc      <=  18'b0;
	  clk_en_q <=  {(p2_stages+1){1'b0}};
       end
     else
       begin
	  clk_en_q <= (clk_en_q << 1) | clk_en;

	  // Only do this after the full value has been latched from
	  // the previous computation; this also allows p1 to
	  // stabilize (it is a multicycle path.)
	  if (clk_en_q[p2_stages])
	    acc <= acc + ((p1_ext - acc) >>> p1_shift);
       end

   // Yes, this really is latched a whole input cycle
   // later.  However, as the NCO counter is split into stages
   // (see the NCO in the swpll module), latch the chunks of p2 in
   // stages here too, one chunk per clock starting with the least
   // significant, so that the NCO sees them consistently. Do it here
   // rather than in the NCO module, because it is more efficient.
   wire [17:0] p2_init = `NCO_NOMINAL;

   generate
      genvar i;
      for (i = 0; i < p2_stages; i = i + 1)
	begin : p2out
	   always @(posedge clk or negedge rst_n)
	     if (~rst_n)
	       p2[`R(i,`NCO_STAGE_BITS,17,0)]
	       <= p2_init[`R(i,`NCO_STAGE_BITS,17,0)];
	     else if (clk_en_q[i])
	       // Chunk i is taken i+1 clocks after clk_en; without this
	       // enable all chunks would change in the same clock and
	       // the staged NCO adder would briefly add a mix of old
	       // and new bits.
	       p2[`R(i,`NCO_STAGE_BITS,17,0)]
	       <= next_p2[`R(i,`NCO_STAGE_BITS,17,0)];
	end
   endgenerate
endmodule // swpll_loop_filter

// Extract a hsync strobe from the composite sync signal without
// relying on any generated clocks.  hsync is identified by a level
// change in the sync output no sooner than ~10-11 µs (3840-4096 clk
// cycles) after the previous one.
module detect_hsync (
		     input rst_n,	// Asynchronous reset, active low
		     input clk,		// Master clock
		     input sync,	// Composite sync, sampled on clk
		     output sync_stb	// Strobe for each accepted sync edge
		     );
   reg	      was_sync;		// Sync signal in previous cycle
   reg [11:0] cyc_ctr;		// Cycles since last accepted edge, in 3 nibbles
   reg [1:0]  cyc_ctr_cy;	// Registered carries between the nibbles
   reg	      hsync_ok;		// Holdoff expired; next edge is accepted
   reg	      hsync_stb;

   assign sync_stb = hsync_stb;

   always @(posedge clk or negedge rst_n)
     if (~rst_n)
       begin
	  was_sync   <= 1'b0;
	  cyc_ctr    <= 12'b0;
	  // Clear the staged carries too: otherwise they are X in
	  // simulation and keep a stale carry across a reset.
	  cyc_ctr_cy <= 2'b0;
	  hsync_ok   <= 1'b0;
	  hsync_stb  <= 1'b0;
       end
     else
       begin
	  was_sync <= sync;
	  // Any level change counts, but only once the holdoff has expired
	  hsync_stb <= (sync ^ was_sync) & hsync_ok;

	  // Counter split into 3 phases for timing. We only
	  // bother resetting the last third on a sync; it is
	  // close enough.
	  { cyc_ctr_cy[0], cyc_ctr[3:0] } <= cyc_ctr[3:0] + 1'b1;
	  if (cyc_ctr_cy[0])
	    { cyc_ctr_cy[1], cyc_ctr[7:4] } <= cyc_ctr[7:4] + 1'b1;
	  else
	    cyc_ctr_cy[1] <= 1'b0;

	  if (hsync_stb)
	    { hsync_ok, cyc_ctr[11:8] } <= 5'b0;
	  else if (cyc_ctr_cy[1])
	    // hsync_ok is a sticky carry out
	    { hsync_ok, cyc_ctr[11:8] } <= (cyc_ctr[11:8] + 1'b1)
	      | { hsync_ok, 4'b0 };
       end // else: !if(~rst_n)
endmodule // detect_hsync

module swpll (
	      input rst_n,	// Asynchronous reset, active low
	      input clk,	// 360 MHz master clock
	      input sync,	// Sync signal synchronized to clk

	      output out_clk,	// 72 MHz clock synchronized to hsync
	      output out_clk_stb // Single master clock strobe for out_clk
	      );

   reg   [14:0] phase_ctr;	// Count of output cycles
   reg    [2:0] phase_ctr_cy;	// Staged carries
   reg   [14:0] phase_ctr_q;	// Latched (stable) value of phase counter
   reg   sync_seen;	        // Phase counter started?
   reg signed [14:0] p1;	// Last detected phase
   wire  [17:0] p2;		// Filtered phase, inverted

   // Phase delta from input strobes. This only needs to be able
   // to represent multiples of the multiplier.  In our case,
   // the bottom 9 bits will always be zero.
   reg signed [14:0] strobe_diff;

   wire		sync_stb;
   wire		clk_stb;	// Strobe for the output clock

   // Extract the hsync strobe from the input
   detect_hsync detect_hsync
     (
      .rst_n ( rst_n ),
      .clk ( clk ),
      .sync ( sync ),
      .sync_stb ( sync_stb )
      );

   // Delayed sync_stb by n cycles. Prevent the compiler from turning
   // this into a shift register RAM.
   parameter max_clk_en = 3;
   (* ramstyle = "logic" *)
     reg [max_clk_en:1] clk_en;

   // Output clock cycles per hsync period
   parameter multiplier = 15'sd4608;

   always @(posedge clk or negedge rst_n)
     if (~rst_n)
       begin
	  phase_ctr    <= 15'b0;
	  phase_ctr_cy <= 3'b0;
	  strobe_diff  <= 15'b0;
	  sync_seen    <= 1'b0;
	  clk_en       <= {(max_clk_en){1'b0}};
	  phase_ctr_q  <= 15'b0;
	  p1           <= 15'b0;
       end
     else
       begin
	  clk_en <= {clk_en[max_clk_en-1:1], sync_stb};
	  sync_seen <= sync_seen | clk_en[1];

	  // The phase counter can be synchronously delayed as
	  // long as an appropriate clk_en[] is used to latch p1.
	  //
	  // sync_seen is used to suppress counting until first pulse
	  if (clk_stb)
	    { phase_ctr_cy[0], phase_ctr[2:0] } <= phase_ctr[2:0] + sync_seen;
	  else
	    phase_ctr_cy[0] <= 1'b0;

	  if (phase_ctr_cy[0])
	    { phase_ctr_cy[1], phase_ctr[5:3] } <= phase_ctr[5:3] + 1'b1;
	  else
	    phase_ctr_cy[1] <= 1'b0;

	  if (phase_ctr_cy[1])
	    { phase_ctr_cy[2], phase_ctr[8:6] } <= phase_ctr[8:6] + 1'b1;
	  else
	    phase_ctr_cy[2] <= 1'b0;

	  if (phase_ctr_cy[2])
	    phase_ctr[14:9] <= phase_ctr[14:9] + 1'b1;

	  if (clk_stb)
	    phase_ctr_q <= phase_ctr;

	  // Phase error: counted output cycles minus the expected
	  // count accumulated (negatively) in strobe_diff.
	  if (clk_en[1])
	    p1 <= phase_ctr_q + strobe_diff;

	  // One cycle after p1 is taken, move the expectation on by
	  // one hsync period.
	  if (clk_en[2])
	    strobe_diff <= strobe_diff - multiplier;

       end // else: !if(~rst_n)

   swpll_loop_filter swpll_loop_filter
     (
      .rst_n ( rst_n ),
      .clk   ( clk ),
      .clk_en ( clk_en[3] ),
      .p1 ( p1 ),
      .p2 ( p2 )
      );

   //
   // Numerically controlled oscillator. The nominal input is 2^20/5 =
   // 0.33333h. The upper two fraction bits are not represented in the
   // filter output.
   //
   reg  [-1:-20] nco_ctr;	// The actual NCO counter
   parameter nco_stages = stages(20,`NCO_STAGE_BITS);
   reg  [nco_stages:1] nco_ctr_cy; // Staged carries; [i+1] is the carry out of stage i

   //
   // This takes the accumulated phase error and turns it into
   // a frequency. This corresponds to the nco_ctr increment for
   // each 360 MHz cycle. The maximum frequency is 360 MHz * 0.3FFFF
   // ~ 90 MHz.  The high bit of p2 is always set, so the minimum
   // frequency is 45 MHz.
   //
   // This is a phase accumulator and its output can be slightly
   // delayed as long as the inputs are all handled correctly. To
   // that end the counter is split into stages of `NCO_STAGE_BITS
   // bits (the topmost stage may be narrower), each passing a
   // registered carry to the next. This requires p2 to change one
   // chunk per cycle, least significant chunk first, so that each
   // stage sees a consistent increment.
   //

   wire [-1:-20] nco_input = { 2'b0, p2 };

   // Carry into each stage. Stage 0 has no carry in; this is a wired
   // constant rather than "always @(*) reg = 1'b0", since a
   // combinational always block that reads no signals is never
   // triggered in simulation and would leave the carry at X.
   wire [nco_stages:0] nco_cy_in = { nco_ctr_cy, 1'b0 };

   generate
      genvar i;
      for (i = 0; i < nco_stages; i = i + 1)
	begin : nco
	   always @(posedge clk or negedge rst_n)
	     if (~rst_n)
	       begin
		  nco_ctr_cy[i+1] <= 1'b0;
		  nco_ctr[`R(i,`NCO_STAGE_BITS,-1,-20)]
		    <= {`NCO_STAGE_BITS{1'b0}};
	       end
	     else
	       begin
		  { nco_ctr_cy[i+1], nco_ctr[`R(i,`NCO_STAGE_BITS,-1,-20)] }
		    <= nco_ctr[`R(i,`NCO_STAGE_BITS,-1,-20)] +
		       nco_input[`R(i,`NCO_STAGE_BITS,-1,-20)] +
		       nco_cy_in[i];
	       end // else: !if(~rst_n)
	end // block: nco
      endgenerate

   reg		out_clk_q;		// Output clock buffer register
   reg		out_clk_stb_q;		// Registered carry out of the top stage

   // out_clk_q is an (approximate) 50:50 clock with the same
   // frequency and phase as out_clk_stb_q; i.e. out_clk_stb_q is set
   // for the same first master clock cycle as out_clk_q goes high
   // (the top counter bit is 0 right after the counter wraps).
   always @(posedge clk or negedge rst_n)
     if (~rst_n)
       begin
	  out_clk_q <= 1'b0;
	  out_clk_stb_q <= 1'b0;
       end
     else
       begin
	  out_clk_q <= ~nco_ctr[-1];
	  out_clk_stb_q <= nco_ctr_cy[nco_stages];
       end

   assign clk_stb = out_clk_stb_q;

   assign out_clk = out_clk_q;
   assign out_clk_stb = out_clk_stb_q;

endmodule // swpll