summaryrefslogtreecommitdiffstats
path: root/arena.c
blob: 6577c5fd5500f2fd2dc6c8271921d1d3237eb40a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
/* ----------------------------------------------------------------------- *
 *   
 *   Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU Lesser General Public License as
 *   published by the Free Software Foundation, Inc.,
 *   59 Temple Place Ste 330, Boston MA 02111-1307, USA, version 2.1,
 *   incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * arena.c
 *
 * Persistent memory arena implemented using memory-mapping tricks
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
#include <signal.h>
#include <inttypes.h>
#include <limits.h>
#include <fcntl.h>
#include <math.h>		/* HUGE_VAL */
#define __USE_MISC	1	/* Needed to support mremap() */
#define __USE_GNU	1	/* Needed to support mremap() */
#include <sys/mman.h>
#include <sys/stat.h>
#include <sched.h>
#include <string.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>

#include "lpsm.h"
#include "internals.h"

/*
 * This is the data structure for the persistent memory arena.  Note
 * that only one arena is supported, due to the need to trap SIGSEGV
 * and address space allocation issues.
 *
 * NOTE(review): the rest of this file reaches the arena through "PM",
 * presumably a macro for this pointer defined in internals.h -- confirm.
 */
struct lpsm_arena *lpsm_memory_info = NULL;

/*
 * This variable is set when the persistent memory is dirty.
 *
 * It is raised by the SIGSEGV handler the first time a clean page is
 * written, and cleared by lpsm_arena_init() and by the parent side of
 * lpsm_checkpoint().  User programs can poll it to decide whether a
 * checkpoint is worthwhile.
 */
sig_atomic_t lpsm_need_checkpoint;

/*
 * read() wrapper which keeps going until "count" bytes have been
 * transferred, end-of-file is reached, or a hard error occurs.
 * EINTR and EAGAIN are retried.
 *
 * Returns the number of bytes actually read (possibly short on EOF or
 * on an error after some data arrived), or -1 if an error occurred
 * before any data was transferred.
 */
static ssize_t lpsm_read(int fd, void *buf, size_t count)
{
  char *cursor = buf;
  size_t remaining = count;
  ssize_t done = 0;

  while ( remaining > 0 ) {
    ssize_t n = read(fd, cursor, remaining);

    if ( n > 0 ) {
      /* Made progress; advance and go around again */
      cursor    += n;
      remaining -= n;
      done      += n;
      continue;
    }

    if ( n == 0 )
      break;			/* End of file */

    /* Error: transient ones are retried, anything else ends the read */
    if ( errno != EINTR && errno != EAGAIN )
      return done ? done : -1;
  }

  return done;
}

/*
 * write() wrapper which keeps going until all "count" bytes have been
 * written or a hard error occurs.  EINTR and EAGAIN are retried.
 *
 * Returns the number of bytes actually written, or -1 if an error
 * occurred before anything was written.
 */
static ssize_t lpsm_write(int fd, void *buf, size_t count)
{
  char *src = buf;
  size_t left = count;
  ssize_t written = 0;

  while ( left != 0 ) {
    ssize_t n = write(fd, src, left);

    if ( n == -1 ) {
      /* Transient errors are retried, anything else ends the write */
      if ( errno != EINTR && errno != EAGAIN )
        return written ? written : -1;
      continue;
    }

    if ( n == 0 )
      return written;		/* No progress possible */

    src     += n;
    left    -= n;
    written += n;
  }

  return written;
}

/*
 * SIGSEGV handler for persistent object store.
 *
 * A write to a clean arena page (mapped read-only) lands here.  The
 * handler makes the page writable, marks it dirty in PM->pageinfo,
 * bumps PM->dirty_count and raises lpsm_need_checkpoint.
 *
 * Any other fault (wrong signal, not an access-permission error, or an
 * address outside the arena) is handed back to the default action: the
 * SIGSEGV disposition is reset and the handler returns, so that the
 * faulting instruction is re-executed and the process dies normally.
 *
 * errno is preserved across the handler.
 */
static void lpsm_sigsegv(int signal, siginfo_t *siginfo, void *ptr)
{
  void *page;
  uintptr_t offset;
  char *pageinfo;
  int old_errno = errno;

#if defined(__linux__) && defined(__i386__)
  if ( siginfo->si_code == 0 ) {
    struct sigcontext *ctxt;
    /* Don't even begin to ask me where the 0x14 comes from */
    ctxt = (struct sigcontext *)((char *)ptr + 0x14);

    /* Old kernel.  Fill in data to the best of our knowledge. */
    if ( ctxt->trapno == 14 ) {
      /* Linux/i386 uses unmapped pages to mimic PROT_NONE, so we can't
         tell ACCERR and MAPERR apart from the register state */
      siginfo->si_code = SEGV_ACCERR;
      siginfo->si_addr = (void *)ctxt->cr2;
    }
  }
#else
  (void)ptr;			/* Only needed for the i386 fallback */
#endif

  /* Page containing the faulting address, and its offset in the arena.
     If the address lies below the arena the subtraction wraps to a huge
     value, which the range check below rejects. */
  page = (void *)((uintptr_t)siginfo->si_addr & ~(PM->pagesize-1));
  offset = (uintptr_t)page - (uintptr_t)PM->arena;

  if ( signal != SIGSEGV || siginfo->si_code != SEGV_ACCERR ||
       offset >= PM->arena_len ) {
    struct sigaction dfl;

    /* Not ours: restore the default disposition and re-take the fault */
    dfl.sa_handler = SIG_DFL;
    sigemptyset(&dfl.sa_mask);
    dfl.sa_flags = SA_ONESHOT;
    sigaction(SIGSEGV, &dfl, NULL);

#ifdef PRINT_DEBUG_INFO
    abort();			/* Easier to tell from SIGSEGV */
#endif

    errno = old_errno;
    return;			/* Re-take fault */
  }

  /* The fault is known to be inside the arena, so only now is it valid
     to form a pointer into the per-page status array. */
  pageinfo = PM->pageinfo + (offset >> PM->pageshift);

  mprotect(page, PM->pagesize, PROT_READ|PROT_WRITE);

  switch ( (enum page_status) *pageinfo ) {
  case page_clean:
    *pageinfo = page_dirty;	/* Page now dirty */
    PM->dirty_count++;		/* For accounting purposes */
    lpsm_need_checkpoint = 1;	/* For user programs */
    /* Leave page r/w */
    break;

  default:
    /* A dirty page is already writable and must not fault again */
    abort();			/* This shouldn't happen */
  }

  errno = old_errno;
}

/*
 * Routine to do log writeback.  Used by initial log recovery routine
 * as well as during-execution garbage collect.
 * THIS ROUTINE SHOULD BE INVOKED WITH LOCK HELD ON THE LOG FILE.
 *
 * Pass 1 scans the log to find the end of the last COMMIT record.
 * Pass 2 replays every page record up to that point into the main file
 * (each under a write lock on the affected byte range).  Finally the
 * main file is synced, the log is truncated, and a fresh COMMIT record
 * carrying pm->fork_seq is written so the sequence number can be
 * recovered later.
 *
 * Returns 0 on success, -1 on failure with errno set.  On failure the
 * log is left in place so that the writeback can be retried.
 */
static int lpsm_log_writeback(struct lpsm_arena *pm)
{
  struct lpsm_logrecord record;
  off_t position, last_commit;
  struct flock lockmain;

  last_commit = 0;		/* Last COMMIT record found */
  position = lseek(pm->log_fd, 0, SEEK_SET);

  while ( lpsm_read(pm->log_fd, &record, sizeof(record))
          == (ssize_t)sizeof(record) ) {
    if ( record.magic != LOGRECORD_MAGIC )
      break;			/* Bad magic, assume rest of log corrupt */

    switch ( record.record_type ) {
    case osrec_commit:
      /* NOTE: last_commit points to the final byte to examine, thus
         at the *end* of the final commit record. */
      position += sizeof(record);
      last_commit = position;	/* Found a commit record */
      break;

    case osrec_page:
      /* Advance past current page cluster */
      position = lseek(pm->log_fd, record.size, SEEK_CUR);
      break;

    default:
      errno = EINVAL;
      return -1;		/* Unknown record - unsafe to process */
    }
  }

  /* Now we know where the last commit was.  Now we can process
     everything up to that point. */

  position = lseek(pm->log_fd, 0, SEEK_SET);

  while ( lpsm_read(pm->log_fd, &record, sizeof(record))
          == (ssize_t)sizeof(record) && position < last_commit ) {
    if ( record.magic != LOGRECORD_MAGIC )
      break;			/* Bad magic, assume rest of log corrupt */

    switch ( record.record_type ) {
    case osrec_commit:
      /* Found a commit record, do nothing */
      position += sizeof(record);
      break;

    case osrec_page:
      {
        /* Write back data to file */
        char *data;
        ssize_t got;
        int failed, saved_errno;

        position += sizeof(record);

        lockmain.l_type   = F_WRLCK;
        lockmain.l_whence = SEEK_SET;
        lockmain.l_start  = record.offset;
        lockmain.l_len    = record.size;
        while ( fcntl(pm->main_fd, F_SETLKW, &lockmain) == -1 && errno == EINTR );

        /* Read the page cluster straight from the log into a shared
           mapping of the destination range in the main file */
        data = mmap(NULL, record.size, PROT_WRITE, MAP_SHARED,
                    pm->main_fd, record.offset);
        failed = (data == MAP_FAILED);
        if ( !failed ) {
          got = lpsm_read(pm->log_fd, data, record.size);
          if ( got < 0 || (size_t)got != (size_t)record.size ) {
            /* Badness: do not leak the mapping on the way out */
            saved_errno = (got < 0) ? errno : EIO;
            munmap(data, record.size);
            errno = saved_errno;
            failed = 1;
          } else if ( munmap(data, record.size) ) {
            failed = 1;
          }
        }

        /* Always drop the range lock, success or failure */
        saved_errno = errno;
        lockmain.l_type = F_UNLCK;
        while ( fcntl(pm->main_fd, F_SETLKW, &lockmain) == -1 && errno == EINTR );

        if ( failed ) {
          errno = saved_errno;
          return -1;
        }
        position += record.size;
      }
      break;

    default:
      errno = EINVAL;
      return -1;		/* Unknown record - unsafe to process */
    }
  }

  /* Log successfully replayed.  The log may only be discarded once the
     main file is known to be on stable storage. */
  if ( fsync(pm->main_fd) )
    return -1;
  if ( ftruncate(pm->log_fd, 0) )
    return -1;

  /* Write initial commit record, for sequence number recovery */
  record.magic       = LOGRECORD_MAGIC;
  record.record_type = osrec_commit;
  record.size        = pm->fork_seq;
  record.offset      = 0x54494d43; /* For debugging */
  /* Compare as signed: a -1 error return must not be converted to a
     huge unsigned value and slip past the check */
  if ( lpsm_write(pm->log_fd, &record, sizeof(record))
       != (ssize_t)sizeof(record) )
    return -1;

  if ( fsync(pm->log_fd) )	/* Indicate log recovery complete */
    return -1;

  return 0;
}

/*
 * Routine to do log recovery.
 *
 * Takes an exclusive lock on the whole log file, runs the log
 * writeback, bumps the sequence counter and releases the lock again.
 * Returns whatever lpsm_log_writeback() returned, with errno as that
 * call left it.
 */
static int lpsm_recover_log(struct lpsm_arena *pm)
{
  struct flock whole_log;
  int result;
  int saved_errno;

  /* Exclusive lock from offset 0 with length 0, i.e. to end of file */
  whole_log.l_whence = SEEK_SET;
  whole_log.l_start  = 0;
  whole_log.l_len    = 0;
  whole_log.l_type   = F_WRLCK;
  for (;;) {
    if ( fcntl(pm->log_fd, F_SETLKW, &whole_log) != -1 || errno != EINTR )
      break;
  }

  /* Replay the log; this also writes the initial commit record */
  result = lpsm_log_writeback(pm);
  saved_errno = errno;

  /* A commit was just written, so move on to the next sequence number */
  pm->fork_seq++;

  /* Release the lock; retry only if interrupted by a signal */
  whole_log.l_type = F_UNLCK;
  for (;;) {
    if ( fcntl(pm->log_fd, F_SETLKW, &whole_log) != -1 || errno != EINTR )
      break;
  }

  errno = saved_errno;
  return result;
}

/*
 * Open the main and log files (creating them if necessary), record the
 * system page size and its log2 in *pm, and run log recovery.
 *
 * On success returns 0 with pm->main_fd and pm->log_fd open.  On
 * failure returns -1 with errno set; any descriptor opened here has
 * been closed again and both descriptor fields are -1.
 */
static int
lpsm_open_and_recover(struct lpsm_arena *pm, const char *main_file,
                      const char *log_file)
{
  int myerrno;

  pm->fork_seq = 0;		/* Initialize sequence counter */
  pm->main_fd = pm->log_fd = -1; /* Not open yet; 0 would alias stdin */

  pm->main_fd = open(main_file, O_RDWR|O_CREAT, 0666);
  if ( pm->main_fd < 0 )
    goto errx1;

  pm->pagesize = getpagesize();
  if ( pm->pagesize & (pm->pagesize - 1) ) {
    /* WTF -- pagesize not a power of 2?  Nothing has set errno on this
       path, so give the caller something meaningful. */
    errno = EINVAL;
    goto errx2;
  }

  /* Compute log2(pm->pagesize) */
  pm->pageshift = 0;
  while ( (1 << pm->pageshift) < pm->pagesize )
    pm->pageshift++;

  /*
   * Open log file.  O_APPEND makes every write land at the current end
   * of the log.
   */
  pm->log_fd = open(log_file, O_RDWR|O_APPEND|O_CREAT, 0666);
  if ( pm->log_fd < 0 )
    goto errx2;

  /* Now, do log recovery if needed */
  if ( lpsm_recover_log(pm) )
    goto errx3;

  return 0;

 errx3:
  myerrno = errno;
  close(pm->log_fd);
  pm->log_fd = -1;
  errno = myerrno;
 errx2:
  myerrno = errno;
  close(pm->main_fd);
  pm->main_fd = -1;
  errno = myerrno;
 errx1:
  return -1;
}

/*
 * Opens the object store.  This includes log
 * playback (crash recovery) if the log file exists
 * and is nonempty.
 *
 * main_file, log_file: paths of the backing file and its log.
 * arena_len: optional (may be NULL).  On input, the minimum arena size
 *            wanted; on successful return it receives the actual,
 *            page-rounded arena size.
 * arena_ptr: address to map the arena at, or NULL for ARENA_ADDRESS.
 *
 * The arena is mapped read-only and private; the SIGSEGV handler
 * installed here turns pages writable on first write and tracks them
 * as dirty.
 *
 * Returns the arena address, or NULL on failure with errno set.
 */
void *lpsm_arena_init(const char *main_file, const char *log_file,
                      size_t *arena_len, void *arena_ptr)
{
  int myerrno;
  struct sigaction sigact;
  struct flock lock;
  off_t file_len, len = arena_len ? *arena_len : 0;
  size_t len_pages;
  int sizing_failed = 0;

  arena_ptr = arena_ptr ? arena_ptr : ARENA_ADDRESS;

  PM = malloc(sizeof(struct lpsm_arena));
  if ( !PM )
    goto errx0;

  if ( lpsm_open_and_recover(PM, main_file, log_file) )
    goto errx1;

  PM->dirty_count = 0;		/* malloc() memory is uninitialized */

  /* Allocate arena memory space: under a whole-file lock, size the
     arena as max(requested, file length) rounded up to a page, and
     grow the file to match so every mapped page is backed. */
  lock.l_type = F_WRLCK;
  lock.l_whence = SEEK_SET;
  lock.l_start  = 0;
  lock.l_len    = 0;
  while ( fcntl(PM->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
  file_len = lseek(PM->main_fd, 0, SEEK_END);
  if ( file_len == (off_t)-1 ) {
    sizing_failed = 1;
  } else {
    if ( len < file_len ) {
      len = file_len;
    }
    len = (len + PM->pagesize - 1) & ~(PM->pagesize - 1);
    if ( len > file_len && ftruncate(PM->main_fd, len) ) {
      sizing_failed = 1;	/* Could not extend file */
    }
  }
  myerrno = errno;		/* The unlock below may clobber errno */
  lock.l_type = F_UNLCK;
  while ( fcntl(PM->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
  if ( sizing_failed ) {
    errno = myerrno;
    goto errx3;
  }

  PM->arena = mmap(arena_ptr, len, PROT_READ, MAP_PRIVATE|MAP_FIXED, PM->main_fd, 0);
  if ( PM->arena == MAP_FAILED )
    goto errx3;

  PM->arena_len = len;
  if ( arena_len )		/* Caller may legitimately pass NULL */
    *arena_len = len;

  /* One status byte per arena page */
  len_pages = len >> PM->pageshift;
  PM->pageinfo = malloc(len_pages);
  if ( !PM->pageinfo )
    goto errx4;

  /* All pages clean at this time */
  memset(PM->pageinfo, page_clean, len_pages);
  lpsm_need_checkpoint = 0;

  sigact.sa_sigaction = lpsm_sigsegv;
  sigemptyset(&sigact.sa_mask);
  sigact.sa_flags = SA_RESTART|SA_SIGINFO;
  if ( sigaction(SIGSEGV, &sigact, &PM->oldact) )
    goto errx5;

  return PM->arena;

 errx5:
  myerrno = errno;
  free(PM->pageinfo);		/* Came from malloc(), not mmap() */
  errno = myerrno;
 errx4:
  myerrno = errno;
  munmap(arena_ptr, len);
  errno = myerrno;
 errx3:
  myerrno = errno;
  if ( PM->log_fd >= 0 ) close(PM->log_fd);
  close(PM->main_fd);
  errno = myerrno;
 errx1:
  myerrno = errno;
  free(PM);
  PM = NULL;			/* Don't leave a dangling global */
  errno = myerrno;
 errx0:

  return NULL;
}

/*
 * Do an offline log recovery (exported function.)
 */
int lpsm_recover(const char *mainfile, const char *logfile)
{
  struct lpsm_arena pm;
  int rv;

  rv = lpsm_open_and_recover(&pm, mainfile, logfile);
  if ( !rv ) {
    close(pm.log_fd);
    close(pm.main_fd);
  }
  return rv;
}

/*
 * Nonzero if a waitpid() status describes a checkpoint child which was
 * killed by a signal or exited with a nonzero code.
 */
static int lpsm_child_failed(int status)
{
  return WIFSIGNALED(status) || (WIFEXITED(status) && WEXITSTATUS(status));
}

/*
 * The checkpoint child hit an unrecoverable error: take the main
 * process down too, then exit with a distinctive status.
 */
static void lpsm_child_abort(void)
{
  kill(getppid(), SIGABRT);	/* Kill main process */
  _exit(99);
}

/*
 * Object store checkpoint.  Writes entries to the log file.
 * The "gc_factor" is the factor of maximum log size file relative
 * to the arena size.  For example, if gc_factor == 0.5 then if the
 * log size is more than 50% of the arena file size a writeback cycle
 * will take place after the log has been written.  This means other
 * checkpoints will have to wait!
 *
 * Set gc_factor to 0.0 to force a gc, and to HUGE_VAL to inhibit gc.
 *
 * The "wait" argument determines the level of synchronicity required.
 *
 * Returns 0 on synchronous completion or if nothing was dirty, 1 if
 * the checkpoint was skipped, (pid_t)-1 on error, or the pid_t of the
 * asynchronous process.  Note: if an asynchronous process is launched,
 * and the next call uses PSMSYNC_NONE, then it is up to the parent
 * process to wait for the launched processes; this process only
 * remembers the last process launched.
 * FIXME: perhaps we should double-fork these processes?
 *
 * The work is done in a forked child, which sees a snapshot of the
 * arena as it was at fork() time.  The parent immediately re-protects
 * the arena and marks every page clean.  Children serialize on the log
 * file lock and commit in fork_seq order.
 */
pid_t lpsm_checkpoint(double gc_factor, enum psmsync wait)
{
  static pid_t last_sync = 0;
  pid_t f, w;
  char *pi, *epi;
  void *page;

  epi = PM->pageinfo + (PM->arena_len >> PM->pageshift);

  /* Deal with the previously launched child, if any */
  if ( last_sync ) {
    int status;
    switch (wait) {
    case PSMSYNC_NONE:
      break;
    case PSMSYNC_SKIP:
      w = waitpid(last_sync, &status, WNOHANG);
      if ( w == 0 )
        return 1;		/* Skip */
      if ( w == (pid_t)-1 ) {
        /* Note: under strace, waitpid() will return -1 instead of 0 */
        return (pid_t)-1;
      }
      /* Child reaped; forget it so it is never waited for again */
      last_sync = 0;
      if ( lpsm_child_failed(status) )
        return (pid_t)-1;	/* Badness */
      break;
    case PSMSYNC_WAIT:
    case PSMSYNC_SYNC:
      w = waitpid(last_sync, &status, 0);
      if ( w == (pid_t)-1 ) {
        if ( errno != ECHILD )
          return (pid_t)-1;	/* Badness */
      } else {
        /* Child reaped; forget it so it is never waited for again */
        last_sync = 0;
        if ( lpsm_child_failed(status) )
          return (pid_t)-1;	/* Badness */
      }
      break;
    }
  }

  if ( !lpsm_need_checkpoint )
    return 0;			/* If not dirty, don't fork */

  f = fork();
  if ( f == (pid_t)-1 )
    return (pid_t)-1;		/* Checkpoint failed! */
  else if ( f > 0 ) {
    /* Parent process -- just mark all dirty pages clean */
    /* FIX: We probably want to do something more clever
       than memset() here, and perhaps keep around the old dirty
       array for a while. */
    mprotect(PM->arena, PM->arena_len, PROT_READ);
    memset(PM->pageinfo, page_clean, PM->arena_len >> PM->pageshift);
    lpsm_need_checkpoint = 0;

    PM->dirty_count = 0;	/* No pages dirty */
    PM->fork_seq++;		/* Increase next sequence number */

    if ( wait == PSMSYNC_SYNC ) {
      int status;

      do {
        w = waitpid(f, &status, 0);
      } while ( w == (pid_t)-1 && errno == EINTR );
      last_sync = 0;

      /* status is only meaningful if waitpid() succeeded.  ECHILD is
         tolerated, as in the wait for the previous child above. */
      if ( w == (pid_t)-1 )
        return (errno == ECHILD) ? 0 : (pid_t)-1;
      if ( lpsm_child_failed(status) )
        return (pid_t)-1;	/* Something very bad happened... */
      else
        return 0;
    } else {
      last_sync = f;
      return f;
    }
  } else {
    /* Child process -- do the actual work of writing back dirty pages */

    struct lpsm_logrecord record, last_rec;
    struct flock lock;
    off_t logsize;
    ssize_t n;

    record.magic = LOGRECORD_MAGIC;
    record.record_type = osrec_page;

    lock.l_whence = SEEK_SET;
    lock.l_start  = 0;
    lock.l_len    = 0;
    for (;;) {
      /* First, lock the entire log file */
      lock.l_type   = F_WRLCK;
      while ( fcntl(PM->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );

      /* Make sure we were indeed next in turn: the last record in the
         log must be the commit with the sequence number before ours.
         The result is compared as signed so a -1 error is caught. */
      lseek(PM->log_fd, -(off_t)sizeof(last_rec), SEEK_END);
      n = lpsm_read(PM->log_fd, &last_rec, sizeof(last_rec));
      if ( n != (ssize_t)sizeof(last_rec) ||
           last_rec.magic != LOGRECORD_MAGIC ) {
        /* Something bad happened... */
        lpsm_child_abort();
      }
      if ( last_rec.size+1 == PM->fork_seq )
        break;			/* It's for us... */

      /* Someone else is ahead of us in line.  Yield to them. */
      lock.l_type = F_UNLCK;
      while ( fcntl(PM->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );

      sched_yield();		/* Snore... */
    }

    /* Write dirty pages to log file */
    for ( pi = PM->pageinfo ; pi < epi ; pi++ ) {
      if ( *pi == page_dirty ) {
        page = (char *)PM->arena +
          ((uintptr_t)(pi - PM->pageinfo) << PM->pageshift);
        record.offset = (char *)page - (char *)PM->arena;

        /* Aggregate contiguous pages into a single record */
        record.size   = PM->pagesize;
        while ( pi+1 < epi && pi[1] == page_dirty ) {
          pi++;
          record.size += PM->pagesize;
        }

        /* Header first, then the page data; stop at the first failure */
        n = lpsm_write(PM->log_fd, &record, sizeof(record));
        if ( n != (ssize_t)sizeof(record) )
          lpsm_child_abort();
        n = lpsm_write(PM->log_fd, page, record.size);
        if ( n < 0 || (size_t)n != (size_t)record.size )
          lpsm_child_abort();
      }
    }

    /* This might be more efficiently done with fdatasync() */
    fsync(PM->log_fd);		/* Make sure we have written everything */

    /* Write commit record */
    record.record_type = osrec_commit;
    record.size        = PM->fork_seq;
    record.offset      = (off_t)0x54494d43;
    n = lpsm_write(PM->log_fd, &record, sizeof(record));
    if ( n != (ssize_t)sizeof(record) )
      lpsm_child_abort();
    fsync(PM->log_fd);

    /* Check to see if it's time for garbage collect */
    logsize = lseek(PM->log_fd, 0, SEEK_END);
    if ( gc_factor < HUGE_VAL && (double)logsize >= gc_factor*PM->arena_len ) {
      /* Replaying the log isn't the most efficient way to do this.
         We could also keep a status bit per page around, and flush
         them out of the shadow array.  The biggest problem with that
         is that it probably can't be done in the background, unlike
         this method.  Leave this as-is for now. */
      if ( lpsm_log_writeback(PM) )
        lpsm_child_abort();
    }

    /* Drop lock on log file */
    lock.l_type = F_UNLCK;
    while ( fcntl(PM->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );

    _exit(0);			/* Done! */
  }
}

/*
 * Extend the size of the object store.
 *
 * The backing file is grown if needed, and the additional range is
 * mapped with MAP_FIXED directly after the existing arena, so this
 * relies on the address space following the arena being free for use.
 * The per-page status array is enlarged to match.
 *
 * new_size is rounded up to a whole number of pages.  Returns 0 on
 * success (or if the arena is already at least that large), -1 on
 * failure with errno set and the file size restored.
 */
int lpsm_extend(size_t new_size)
{
  struct flock lock;
  void *newp, *infop;
  off_t file_size;
  size_t add_size, old_size;
  size_t old_pages, new_pages;
  int realerrno;

  old_size = PM->arena_len;

  if ( new_size <= old_size )
    return 0;			/* No action */

  new_size = (new_size + PM->pagesize - 1) & ~(PM->pagesize - 1);
  add_size = new_size - old_size;

  lock.l_type   = F_WRLCK;
  lock.l_whence = SEEK_SET;
  lock.l_start  = 0;
  lock.l_len    = 0;
  while ( fcntl(PM->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );

  /* From here on "lock" is only used to release the lock again */
  lock.l_type = F_UNLCK;

  file_size = lseek(PM->main_fd, 0, SEEK_END);
  if ( file_size == (off_t)-1 ) {
    realerrno = errno;
    goto unlock_fail;		/* Size unknown, nothing to restore */
  }

  if ( file_size < (off_t)new_size && ftruncate(PM->main_fd, new_size) ) {
    realerrno = errno;
    goto reset_size;		/* Failure */
  }

  newp = mmap((char*)PM->arena + old_size,
              add_size, PROT_READ,
              MAP_PRIVATE|MAP_FIXED, PM->main_fd, old_size);

  if ( newp == MAP_FAILED ) {
    realerrno = errno;
    goto reset_size;		/* Failure */
  }

  /* Since we specified MAP_FIXED, this should be guaranteed */
  assert( newp == (char*)PM->arena + old_size );

  /* Convert sizes to pages */
  new_pages  = new_size  >> PM->pageshift;
  old_pages  = old_size  >> PM->pageshift;

  infop = realloc(PM->pageinfo, new_pages);
  if ( !infop ) {
    realerrno = errno;
    munmap(newp, add_size);
    goto reset_size;
  }

  /* Mark new pages clean.  The entries added by realloc() are exactly
     [old_pages, new_pages), whatever size the file had beforehand. */
  memset((char *)infop + old_pages, page_clean, new_pages - old_pages);

  PM->pageinfo   = infop;
  PM->arena_len  = new_size;

  /* No more failure bailouts, unlock */
  while ( fcntl(PM->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );

  return 0;

  /* We failed to extend the arena after ftruncate(), this is *usually*
     due to address space limitations. */
 reset_size:
  /* Restore the original (unrounded) file size */
  ftruncate(PM->main_fd, file_size);
 unlock_fail:
  /* Drop lock */
  while ( fcntl(PM->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
  /* Set errno to the real error */
  errno = realerrno;
  return -1;
}

/*
 * Shut down the object store, free all resources.
 * THIS DOES NOT CHECKPOINT - call lpsm_checkpoint() first
 * if you want that functionality.  Calling this function without
 * first checkpointing and then calling lpsm_arena_init() can be used
 * to (very slowly) roll back to the last commit point.
 *
 * Calling this when no arena is open is a no-op.
 */
void lpsm_shutdown(void)
{
  if ( !PM )
    return;			/* Nothing open (or already shut down) */

  /* Put the previous SIGSEGV disposition back first, so our handler can
     never run against state that is being torn down */
  sigaction(SIGSEGV, &PM->oldact, NULL);

  munmap(PM->arena, PM->arena_len);
  free(PM->pageinfo);
  close(PM->log_fd);
  close(PM->main_fd);
  free(PM);
  PM = NULL;			/* Don't leave a dangling global */
}