author     H. Peter Anvin <hpa@zytor.com>    2001-07-08 21:51:31 +0000
committer  H. Peter Anvin <hpa@zytor.com>    2001-07-08 21:51:31 +0000
commit     7be4b5ec6f20e3ffe13e2a574549e8028faba526 (patch)
tree       8e14b9522e61a08a1c09c69639c0797c52b4368b
download   lpsm-7be4b5ec6f20e3ffe13e2a574549e8028faba526.tar.gz
           lpsm-7be4b5ec6f20e3ffe13e2a574549e8028faba526.tar.xz
           lpsm-7be4b5ec6f20e3ffe13e2a574549e8028faba526.zip
Initial version under CVS control
-rw-r--r--   alloc.c        126
-rw-r--r--   arena.c        642
-rw-r--r--   ftrunctest.c    19
-rw-r--r--   lpsm.h          71
-rw-r--r--   system.h        29
-rw-r--r--   test_mmap.c     21
-rw-r--r--   teststore.c     47
7 files changed, 955 insertions, 0 deletions
diff --git a/alloc.c b/alloc.c
new file mode 100644
index 0000000..398ec2b
--- /dev/null
+++ b/alloc.c
@@ -0,0 +1,126 @@
+#ident "$Id$"
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * alloc.c
+ *
+ * Provide persistent storage versions of malloc(), realloc() and free().
+ *
+ * This code uses a modified buddy system allocator. It's probably broken
+ * if your byte size isn't at least a power of 2.
+ */
+
+#include <stdlib.h>
+#include <inttypes.h>
+#include <limits.h>
+
+#define OBJSTORE_INTERNALS 1
+#include "lpsm.h"
+
+#define OBJSTORE_ARENA_MAGIC 0xd8319f45
+
+/* This is the minimal order worth allocating. This must be able
+ to hold two pointers plus an integer. 2^4 = 16 bytes. */
+#define ORDER_MIN 4 /* This must be >= 1. */
+
+/* This is the size of the alloc bitmask. It will usually be sparse. */
+/* This is two bits per unit of the lowest order */
+#define ALLOC_BITMAP_SIZE ((ARENA_LIMIT >> ORDER_MIN)*2/CHAR_BIT)
+
+/* This bit is set in the order marker to indicate an occupied cell. */
+#define OCCUPIED ((unsigned char)(1 << (CHAR_BIT-1)))
+
+/* How many address bits? */
+static const int orders = sizeof(void *) * CHAR_BIT;
+
+/*
+ * Initialize the object store arena allocator.  Note the argument
+ * that allows a fixed-offset structure at the beginning. This is
+ * essential, since otherwise we wouldn't be able to begin to
+ * pick apart the object hierarchy.
+ */
+void *objstore_arena_init(size_t leadin_size)
+{
+ struct ObjStore *os = objstore_os_struct;
+ void **order_list, **order_ptr;
+ uintptr_t begin_data, end_data, order_size;
+ int i;
+
+ leadin_size = (leadin_size + os->pagesize - 1) & ~(os->pagesize - 1);
+ order_list = (void **)((char *)os->arena + leadin_size);
+ begin_data = (uintptr_t)(order_list + orders + 1);
+ end_data = (uintptr_t)os->arena + os->arena_len;
+
+ if ( (uintptr_t)order_list[0] != (uintptr_t)OBJSTORE_ARENA_MAGIC ) {
+ /* The arena is uninitialized. */
+
+ /* Begin by initializing all the pointers to null */
+ for ( i = 0 ; i < orders ; i++ )
+ order_list[i] = NULL;
+
+ /* Align the beginning and end to the lowest-supported order.
+ Note that begin_data is adjusted +1 above, to handle the
+ order/alloc marker of the first unit. */
+ begin_data = (begin_data + ((uintptr_t)1 << ORDER_MIN) - 1) &
+ ~(((uintptr_t)1 << ORDER_MIN) - 1);
+ end_data = end_data & ~(((uintptr_t)1 << ORDER_MIN)-1);
+
+ for ( i = ORDER_MIN ; i < orders ; i++ ) {
+ order_size = (uintptr_t)1 << i;
+ order_ptr = &order_list[i];
+ if ( (begin_data & order_size) &&
+ (end_data-begin_data) >= order_size ) {
+ ((unsigned char *)begin_data)[-1] = i; /* Free object of order i */
+ *order_ptr = (void *)begin_data;
+ order_ptr = (void **)begin_data;
+ *order_ptr = NULL;
+ begin_data += order_size;
+ }
+ if ( (end_data & order_size) &&
+ (end_data-begin_data) >= order_size ) {
+ end_data -= order_size;
+	((unsigned char *)end_data)[-1] = i;	/* Free object of order i */
+ *order_ptr = (void *)end_data;
+ order_ptr = (void **)end_data;
+ *order_ptr = NULL;
+ }
+ }
+
+ /* Arena now initialized */
+ order_list[0] = (void *)(uintptr_t)OBJSTORE_ARENA_MAGIC;
+ }
+}
+
+void *objstore_malloc(size_t size)
+{
+ int order_needed, order, i;
+
+ size += 1; /* Add 1 for bookkeeping byte */
+
+ /* Find the order needed */
+ for ( order_needed = ORDER_MIN ;
+ (size_t)(1UL << order_needed) < size ;
+ order_needed++ );
+
+ /* Find the order available */
+ for ( order = order_needed ; order < orders ; order++ ) {
+ if ( order_list[order] )
+ break;
+ }
+
+ if ( order >= orders )
+ return NULL; /* Nothing available */
+
+ /* Split orders down to the needed one */
+ for ( i = order ; i < order_needed ; i++ ) {
+
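The allocator above sizes every block as a power of two: objstore_malloc() adds one bookkeeping byte to the request and then searches upward from ORDER_MIN for the first order that fits. A minimal stand-alone sketch of that size-to-order mapping (the helper name and main() are illustrative, not part of alloc.c; ORDER_MIN = 4 as defined above):

/* Sketch only: the size-to-order computation used by objstore_malloc(). */
#include <stdio.h>
#include <stddef.h>

#define ORDER_MIN 4	/* smallest block is 1 << 4 = 16 bytes, as in alloc.c */

static int size_to_order(size_t size)
{
    int order;

    size += 1;				/* one bookkeeping byte per block */
    for ( order = ORDER_MIN ; ((size_t)1 << order) < size ; order++ )
	;
    return order;			/* block occupies 1 << order bytes */
}

int main(void)
{
    printf("15 bytes  -> order %d (16-byte block)\n", size_to_order(15));
    printf("16 bytes  -> order %d (32-byte block)\n", size_to_order(16));
    printf("100 bytes -> order %d (128-byte block)\n", size_to_order(100));
    return 0;
}

With ORDER_MIN = 4 a 15-byte request still fits a 16-byte block, but a 16-byte request spills into a 32-byte block because of the bookkeeping byte.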
diff --git a/arena.c b/arena.c
new file mode 100644
index 0000000..60f9602
--- /dev/null
+++ b/arena.c
@@ -0,0 +1,642 @@
+#ident "$Id$"
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arena.c
+ *
+ * Persistent object store implemented using memory-mapping tricks
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <string.h>		/* memset() */
+#include <fcntl.h>
+#include <math.h> /* HUGE_VAL */
+#define __USE_MISC 1 /* Needed to support mremap() */
+#define __USE_GNU 1 /* Needed to support mremap() */
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sched.h>
+
+#define OBJSTORE_INTERNALS 1
+#include "lpsm.h"
+
+enum page_status {
+ page_unread = 0,
+ page_clean = 1,
+ page_dirty = 2,
+};
+
+/*
+ * This is the data structure for the object store. Note that only
+ * one active object store is supported, due to the need to trap
+ * SIGSEGV.
+ */
+struct ObjStore *objstore_os_struct;
+
+/* Wrappers for read() and write() which retry if the transfer is incomplete */
+static ssize_t objstore_read(int fd, void *buf, size_t count)
+{
+ char *bufp = buf;
+ ssize_t total = 0;
+ ssize_t rv;
+
+ while ( count ) {
+ rv = read(fd, bufp, count);
+ if ( rv == -1 ) {
+ if ( errno == EINTR || errno == EAGAIN )
+ continue;
+ else
+ return total ? total : -1;
+ } else if ( rv == 0 ) {
+ return total;
+ }
+ bufp += rv;
+ count -= rv;
+ total += rv;
+ }
+
+ return total;
+}
+
+static ssize_t objstore_write(int fd, void *buf, size_t count)
+{
+ char *bufp = buf;
+ ssize_t total = 0;
+ ssize_t rv;
+
+ while ( count ) {
+ rv = write(fd, bufp, count);
+ if ( rv == -1 ) {
+ if ( errno == EINTR || errno == EAGAIN )
+ continue;
+ else
+ return total ? total : -1;
+ } else if ( rv == 0 ) {
+ return total;
+ }
+ bufp += rv;
+ count -= rv;
+ total += rv;
+ }
+
+ return total;
+}
+
+/*
+ * SIGSEGV handler for persistent object store
+ */
+static void objstore_sigsegv(int signal, siginfo_t *siginfo, void *ptr)
+{
+ struct ObjStore *os = objstore_os_struct;
+ void *page;
+ off_t offset;
+ char *pageinfo;
+ struct flock lock;
+ int old_errno = errno;
+#ifdef __linux__
+ struct sigcontext *ctxt;
+
+# ifdef __i386__ /* This is so specific to Linux/i386 */
+ if ( siginfo->si_code == 0 ) {
+ /* Old kernel. Fill in data to the best of our knowledge. */
+ /* Don't even begin to ask me where the 0x14 comes from */
+ ctxt = (struct sigcontext *)((char *)ptr + 0x14);
+ if ( ctxt->trapno == 14 ) {
+ /* Linux/i386 uses unmapped pages to mimic PROT_NONE, so we can't
+ tell ACCERR and MAPERR apart from the register state */
+ siginfo->si_code = SEGV_ACCERR;
+ siginfo->si_addr = (void *)ctxt->cr2;
+ }
+ }
+# endif /* __i386__ */
+#endif /* __linux__ */
+
+ if ( signal != SIGSEGV || siginfo->si_code != SEGV_ACCERR ||
+ ((uintptr_t)siginfo->si_addr - (uintptr_t)os->arena) >= os->arena_len ) {
+ struct sigaction dfl;
+
+ dfl.sa_handler = SIG_DFL;
+ sigemptyset(&dfl.sa_mask);
+ dfl.sa_flags = SA_ONESHOT;
+ sigaction(SIGSEGV, &dfl, NULL);
+
+ errno = old_errno;
+ return; /* Re-take fault */
+ }
+
+ page = (void *)((uintptr_t)siginfo->si_addr & ~(os->pagesize-1));
+ offset = (uintptr_t)page - (uintptr_t)os->arena;
+ pageinfo = os->pageinfo + (offset >> os->pageshift);
+
+ mprotect(page, os->pagesize, PROT_READ|PROT_WRITE);
+
+ switch ( (enum page_status) *pageinfo ) {
+ case page_unread:
+ lseek(os->main_fd, offset, SEEK_SET);
+
+ lock.l_type = F_RDLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = offset;
+ lock.l_len = os->pagesize;
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+ if ( objstore_read(os->main_fd, page, os->pagesize) < os->pagesize )
+ abort(); /* Uh-oh... */
+
+ lock.l_type = F_UNLCK;
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ mprotect(page, os->pagesize, PROT_READ); /* Make page readonly */
+ *pageinfo = page_clean; /* Page read and clean */
+ os->loaded_count++; /* For accounting purposes */
+ break;
+
+ case page_clean:
+ *pageinfo = page_dirty; /* Page now dirty */
+ os->dirty_count++; /* For accounting purposes */
+ /* Leave page r/w */
+ break;
+
+ default:
+ abort(); /* This shouldn't happen */
+ }
+
+ errno = old_errno;
+}
+
+/*
+ * Routine to do log writeback. Used by initial log recovery routine
+ * as well as during-execution garbage collect.
+ * THIS ROUTINE SHOULD BE INVOKED WITH LOCK HELD ON THE LOG FILE.
+ */
+static int objstore_log_writeback(void)
+{
+ struct ObjStore *os = objstore_os_struct;
+ struct ObjStore_LogRecord record;
+ off_t position, last_commit;
+ struct flock lockmain;
+
+ last_commit = 0; /* Last COMMIT record found */
+ position = lseek(os->log_fd, 0, SEEK_SET);
+
+ while ( objstore_read(os->log_fd, &record, sizeof(record)) == sizeof(record) ) {
+ if ( record.magic != LOGRECORD_MAGIC )
+ break; /* Bad magic, assume rest of log corrupt */
+ if ( record.record_type == osrec_commit ) {
+ /* NOTE: last_commit points to the final byte to examine, thus
+ at the *end* of the final commit record. */
+ position += sizeof(record);
+ last_commit = position; /* Found a commit record */
+ } else if ( record.record_type == osrec_page ) {
+ /* Advance past current page cluster */
+ position = lseek(os->log_fd, record.size, SEEK_CUR);
+ } else {
+ return -1; /* Unknown record - unsafe to process */
+ }
+ }
+
+ /* Now we know where the last commit was. Now we can process
+ everything up to that point. */
+
+ position = lseek(os->log_fd, 0, SEEK_SET);
+
+ while ( objstore_read(os->log_fd, &record, sizeof(record))
+ == sizeof(record) && position < last_commit ) {
+ if ( record.magic != LOGRECORD_MAGIC )
+ break; /* Bad magic, assume rest of log corrupt */
+ if ( record.record_type == osrec_commit ) {
+ /* Found a commit record, do nothing */
+ position += sizeof(record);
+ } else if ( record.record_type == osrec_page ) {
+ /* Write back data to file */
+ char *data;
+
+ position += sizeof(record);
+
+ lockmain.l_type = F_WRLCK;
+ lockmain.l_whence = SEEK_SET;
+ lockmain.l_start = record.offset;
+ lockmain.l_len = record.size;
+ while ( fcntl(os->main_fd, F_SETLKW, &lockmain) == -1 && errno == EINTR );
+ data = mmap(NULL, record.size, PROT_WRITE, MAP_SHARED,
+ os->main_fd, record.offset);
+ if ( data == MAP_FAILED )
+ return -1;
+ if ( objstore_read(os->log_fd, data, record.size) != record.size )
+ return -1; /* Badness */
+ if ( munmap(data, record.size) )
+ return -1;
+
+ lockmain.l_type = F_UNLCK;
+ while ( fcntl(os->main_fd, F_SETLKW, &lockmain) == -1 && errno == EINTR );
+ position += record.size;
+ } else {
+ return -1; /* Unknown record - unsafe to process */
+ }
+ }
+
+ /* Log successfully recovered. Truncate. */
+ fsync(os->main_fd);
+ ftruncate(os->log_fd, 0);
+ /* Write initial commit record, for sequence number recovery */
+ record.magic = LOGRECORD_MAGIC;
+ record.record_type = osrec_commit;
+ record.size = os->fork_seq;
+ record.offset = 0x54494d43; /* For debugging */
+ if ( objstore_write(os->log_fd, &record, sizeof(record)) < sizeof(record) )
+ return -1;
+
+ fsync(os->log_fd); /* Indicate log recovery complete */
+
+ return 0;
+}
+
+/*
+ * Routine to do log recovery
+ */
+static int objstore_recover_log(void)
+{
+ struct ObjStore *os = objstore_os_struct;
+ struct flock lock;
+ int rv = 0;
+
+ /* First, lock the log file */
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ /* Do log recovery, and write initial commit record. */
+ rv = objstore_log_writeback();
+
+ /* Increase the sequence number, since we just wrote a commit. */
+ os->fork_seq++;
+
+ /* Unlock file and run. */
+ lock.l_type = F_UNLCK;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ return rv;
+}
+
+/*
+ * Opens the object store. This includes log
+ * playback (crash recovery) if the log file exists
+ * and is nonempty.
+ */
+void *objstore_init(char *main_file, char *log_file, size_t *arena_len)
+{
+ struct ObjStore *os;
+ void *arena_ptr;
+ struct sigaction sigact;
+ struct flock lock;
+ off_t file_len, len = arena_len ? *arena_len : 0;
+ size_t file_pages, len_pages;
+
+ arena_ptr = ARENA_ADDRESS;
+
+ objstore_os_struct = os = malloc(sizeof(struct ObjStore));
+ if ( !os )
+ goto errx0;
+
+  os->fork_seq = 0;		/* Initialize sequence counter */
+  os->loaded_count = 0;		/* No pages loaded yet */
+  os->dirty_count = 0;		/* No pages dirty yet */
+
+ os->main_fd = open(main_file, O_RDWR|O_CREAT, 0666);
+ if ( os->main_fd < 0 )
+ goto errx1;
+
+ os->pagesize = getpagesize();
+ if ( os->pagesize & (os->pagesize - 1) )
+ goto errx2; /* WTF -- pagesize not a power of 2? */
+
+ /* Compute log2(os->pagesize) */
+ os->pageshift = 0;
+ while ( (1 << os->pageshift) < os->pagesize )
+ os->pageshift++;
+
+ /*
+ * Open log file
+ */
+ os->log_fd = open(log_file, O_RDWR|O_APPEND|O_CREAT, 0666);
+ if ( os->log_fd < 0 )
+ goto errx3;
+
+ /* Now, do log recovery if needed */
+ if ( objstore_recover_log() )
+ goto errx3;
+
+ /* Allocate arena memory space */
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+ file_len = lseek(os->main_fd, 0, SEEK_END);
+ if ( len == 0 ) {
+ len = file_len;
+ }
+ len = (len + os->pagesize - 1) & ~(os->pagesize - 1);
+ if ( len > file_len ) {
+ ftruncate(os->main_fd, len); /* Extend file */
+ }
+ lock.l_type = F_UNLCK;
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ os->arena = mmap(arena_ptr, len, PROT_NONE,
+ MAP_ANON|MAP_PRIVATE|MAP_FIXED, 0, 0);
+ if ( os->arena == MAP_FAILED )
+ goto errx3;
+
+ os->arena_len = len;
+  if ( arena_len )
+ *arena_len = len;
+
+ os->pageinfo = malloc(len >> os->pageshift);
+ if ( !os->pageinfo )
+ goto errx4;
+
+ /* The pageinfo up to and including file_len is "unread"; beyond
+ file_len we know it must be zero and thus it can be marked "clean" */
+ file_len = (file_len + os->pagesize - 1) & ~(os->pagesize-1);
+ file_pages = file_len >> os->pageshift;
+ len_pages = len >> os->pageshift;
+
+ memset(os->pageinfo, page_unread, file_pages);
+
+ if ( len_pages > file_pages ) {
+ mprotect((char *)os->arena + file_len, len - file_len, PROT_READ);
+ memset(os->pageinfo + file_pages, page_clean, len_pages-file_pages);
+ }
+
+ sigact.sa_sigaction = objstore_sigsegv;
+ sigemptyset(&sigact.sa_mask);
+ sigact.sa_flags = SA_RESTART|SA_SIGINFO;
+ if ( sigaction(SIGSEGV, &sigact, &os->oldact) )
+ goto errx5;
+
+ return os->arena;
+
+ errx5:
+  free(os->pageinfo);
+ errx4:
+ munmap(arena_ptr, len);
+ errx3:
+ if ( os->log_fd >= 0 ) close(os->log_fd);
+ errx2:
+ close(os->main_fd);
+ errx1:
+ free(os);
+ errx0:
+
+ return NULL;
+}
+
+/*
+ * Object store checkpoint. Writes entries to the log file.
+ * The "gc_factor" is the maximum log file size relative to the arena
+ * size.  For example, if gc_factor == 0.5 and the log grows beyond 50%
+ * of the arena file size, a writeback cycle will take place after the
+ * log has been written.  This means other checkpoints will have to
+ * wait!
+ *
+ * Set gc_factor to 0.0 to force a gc, and to HUGE_VAL to inhibit gc.
+ */
+int objstore_checkpoint(double gc_factor)
+{
+ struct ObjStore *os = objstore_os_struct;
+ int f;
+ char *pi, *epi;
+ void *page;
+
+ pi = os->pageinfo;
+ epi = os->pageinfo + (os->arena_len >> os->pageshift);
+
+ f = fork();
+ if ( f < 0 )
+ return 1; /* Checkpoint failed! */
+ else if ( f > 0 ) {
+ /* Parent process -- just mark all dirty pages clean */
+
+ size_t size, count;
+ char *opi;
+ int found_dirty;
+
+ /* Aggregate both clean and dirty pages; this should allow the OS
+ to avoid keeping track of quite as many memory protect regions */
+ for ( pi = os->pageinfo ; pi < epi ; pi++ ) {
+ if ( *pi == page_dirty || *pi == page_clean ) {
+ found_dirty = (*pi == page_dirty);
+ page = (char *)os->arena +
+ ((uintptr_t)(pi - os->pageinfo) << os->pageshift);
+
+ opi = pi;
+ size = os->pagesize;
+ count = 1;
+ while ( pi+1 < epi &&
+ (pi[1] == page_dirty || pi[1] == page_clean) ) {
+ pi++;
+ found_dirty = found_dirty || (*pi == page_dirty);
+ count++;
+ size += os->pagesize;
+ }
+ if ( found_dirty ) {
+ mprotect(page, size, PROT_READ);
+ memset(opi, page_clean, count);
+ }
+ }
+ }
+ os->dirty_count = 0; /* No pages dirty */
+ os->fork_seq++; /* Increase next sequence number */
+ return 0;
+ } else {
+ /* Child process -- do the actual work of writing back dirty pages */
+
+ struct ObjStore_LogRecord record, last_rec;
+ struct flock lock;
+ off_t logsize;
+
+ record.magic = LOGRECORD_MAGIC;
+ record.record_type = osrec_page;
+
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ for (;;) {
+ /* First, lock the entire log file */
+ lock.l_type = F_WRLCK;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ /* Make sure we were indeed next in turn */
+ lseek(os->log_fd, -(off_t)sizeof(last_rec), SEEK_END);
+ if ( objstore_read(os->log_fd, &last_rec, sizeof(last_rec)) < sizeof(last_rec)) {
+ kill(getppid(), SIGABRT); /* Kill main process */
+ _exit(99);
+ }
+ if ( last_rec.size+1 == os->fork_seq )
+ break; /* It's for us... */
+
+ /* Someone else is ahead of us in line. Yield to them. */
+ lock.l_type = F_UNLCK;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ sched_yield(); /* Snore... */
+ }
+
+ /* Write dirty pages to log file */
+ for ( pi = os->pageinfo ; pi < epi ; pi++ ) {
+ if ( *pi == page_dirty ) {
+ page = (char *)os->arena +
+ ((uintptr_t)(pi - os->pageinfo) << os->pageshift);
+ record.offset = (char *)page - (char *)os->arena;
+
+ /* Aggregate contiguous pages into a single record */
+ record.size = os->pagesize;
+ while ( pi+1 < epi && pi[1] == page_dirty ) {
+ pi++;
+ record.size += os->pagesize;
+ }
+
+ if ( objstore_write(os->log_fd, &record, sizeof(record))
+ < sizeof(record) ||
+ objstore_write(os->log_fd, page, record.size) < record.size ) {
+ kill(getppid(), SIGABRT); /* Kill main process */
+ _exit(99);
+ }
+ }
+ }
+
+ /* This might be more efficiently done with fdatasync() */
+ fsync(os->log_fd); /* Make sure we have written everything */
+
+ /* Write commit record */
+ record.record_type = osrec_commit;
+ record.size = os->fork_seq;
+ record.offset = (off_t)0x54494d43;
+ if ( objstore_write(os->log_fd, &record, sizeof(record)) < sizeof(record) ) {
+ kill(getppid(), SIGABRT);
+ _exit(99);
+ }
+ fsync(os->log_fd);
+
+ /* Check to see if it's time for garbage collect */
+ logsize = lseek(os->log_fd, 0, SEEK_END);
+ if ( gc_factor < HUGE_VAL && (double)logsize >= gc_factor*os->arena_len ) {
+ /* Replaying the log isn't the most efficient way to do this.
+ We could also keep a status bit per page around, and flush
+ them out of the shadow array. The biggest problem with that
+ is that it probably can't be done in the background, unlike
+ this method. Leave this as-is for now. */
+ if ( objstore_log_writeback() ) {
+ kill(getppid(), SIGABRT);
+ _exit(99);
+ }
+ }
+
+ /* Drop lock on log file */
+ lock.l_type = F_UNLCK;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ _exit(0); /* Done! */
+ }
+}
+
+/*
+ * Extend the size of the object store.
+ *
+ * This currently relies on Linux-specific behaviour: the additional
+ * pages are mapped with MAP_FIXED directly after the existing arena,
+ * so the arena can grow without changing its virtual address.
+ */
+int objstore_extend(size_t new_size)
+{
+ struct ObjStore *os = objstore_os_struct;
+ struct flock lock;
+ void *newp, *infop;
+ off_t file_size;
+ int ft;
+ size_t add_size, old_size;
+ size_t add_pages, old_pages, new_pages, file_pages;
+
+ old_size = os->arena_len;
+
+ if ( new_size <= old_size )
+ return 0; /* No action */
+
+ new_size = (new_size + os->pagesize - 1) & ~(os->pagesize - 1);
+ add_size = new_size - old_size;
+
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ lock.l_type = F_UNLCK;
+ file_size = lseek(os->main_fd, 0, SEEK_END);
+ if ( file_size < new_size )
+ ft = ftruncate(os->main_fd, new_size);
+ else
+ ft = 0;
+
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+ if ( ft )
+ return -1; /* Failure */
+
+ newp = mmap((char*)os->arena + old_size,
+ add_size,
+ PROT_NONE,
+ MAP_PRIVATE|MAP_ANON|MAP_FIXED, 0, 0);
+
+ if ( newp == MAP_FAILED )
+ return -1; /* Failure */
+
+ /* Since we specified MAP_FIXED, this should be guaranteed */
+ assert( newp == (char*)os->arena + old_size );
+
+ /* Convert sizes to pages */
+ file_size = (file_size + os->pagesize - 1) & ~(os->pagesize-1);
+ new_pages = new_size >> os->pageshift;
+ old_pages = old_size >> os->pageshift;
+ file_pages = file_size >> os->pageshift;
+ add_pages = new_pages - old_pages;
+
+ infop = realloc(os->pageinfo, new_pages);
+ if ( !infop ) {
+ munmap(newp, add_size);
+ return -1; /* Failure */
+ }
+
+ os->arena_len = new_size;
+ os->pageinfo = infop;
+
+ /* If we extended the file, the new area is known to contain
+ zero, and can thus be considered "clean"; otherwise we have
+ to consider it "unread". */
+ if ( file_pages > old_pages ) {
+ memset(os->pageinfo + old_pages, page_unread, file_pages-old_pages);
+ }
+ if ( file_pages < new_pages ) {
+ memset(os->pageinfo + file_pages, page_clean, new_pages-file_pages);
+ mprotect((char *)os->arena + file_size, new_size-file_size, PROT_READ);
+ }
+
+ return 0;
+}
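The core trick in arena.c is the combination of mprotect() and the SIGSEGV handler: the arena starts out PROT_NONE, the first read of a page faults it in from the backing file and leaves it read-only ("clean"), and the first write faults again and upgrades it to read-write ("dirty") so the checkpoint code knows which pages to log. A self-contained sketch of just that fault-driven upgrade, reduced to a single anonymous page (illustrative Linux-style code, not the library itself):

/* Sketch only: write-fault tracking via mprotect() + SIGSEGV, one page. */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

static char *page;
static long pagesize;
static volatile sig_atomic_t write_faults;

static void on_segv(int sig, siginfo_t *si, void *ctx)
{
    (void)sig; (void)ctx;
    /* Only handle faults inside our page; anything else re-faults fatally. */
    if ( (char *)si->si_addr < page || (char *)si->si_addr >= page + pagesize )
	abort();
    write_faults++;				/* "mark the page dirty" */
    mprotect(page, pagesize, PROT_READ|PROT_WRITE);
}

int main(void)
{
    struct sigaction sa;

    pagesize = sysconf(_SC_PAGESIZE);
    page = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if ( page == MAP_FAILED )
	return 1;

    memset(&sa, 0, sizeof sa);
    sa.sa_sigaction = on_segv;
    sa.sa_flags = SA_SIGINFO | SA_RESTART;
    sigemptyset(&sa.sa_mask);
    sigaction(SIGSEGV, &sa, NULL);

    page[0] = 'x';		/* faults once; handler flips the page to r/w */
    page[1] = 'y';		/* no further fault */
    printf("write faults taken: %d\n", (int)write_faults);	/* expect 1 */
    return 0;
}

arena.c does the same thing per page of the arena, except that its handler also reads the page contents from the backing file on the first (read) fault.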
diff --git a/ftrunctest.c b/ftrunctest.c
new file mode 100644
index 0000000..dfa58b3
--- /dev/null
+++ b/ftrunctest.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <string.h>
+
+int main(int argc, char *argv[])
+{
+  int fd;
+
+  fd = open("ftrunc.dat", O_RDWR|O_CREAT, 0666);
+  if ( fd < 0 )
+    return 1;
+  ftruncate(fd, 1024*1024*1024);
+  close(fd);
+
+ return 0;
+}
+
+
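ftrunctest.c only checks that ftruncate() will grow a file to 1 GB, which matters because objstore_init() and objstore_extend() use ftruncate() to size the backing file; on ordinary Linux filesystems the result is a sparse file, so no disk blocks are allocated until pages are actually written back. A variant that makes the sparseness visible through fstat() (illustrative only):

/* Sketch only: show that the ftruncate()'d file is sparse. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(void)
{
    struct stat st;
    int fd = open("ftrunc.dat", O_RDWR|O_CREAT, 0666);

    if ( fd < 0 || ftruncate(fd, 1024*1024*1024) || fstat(fd, &st) )
	return 1;
    printf("apparent size %lld bytes, allocated %lld bytes\n",
	   (long long)st.st_size, (long long)st.st_blocks * 512);
    close(fd);
    return 0;
}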
diff --git a/lpsm.h b/lpsm.h
new file mode 100644
index 0000000..6d80df6
--- /dev/null
+++ b/lpsm.h
@@ -0,0 +1,71 @@
+#ident "$Id$"
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * lpsm.h
+ *
+ * Header file for the persistent object store
+ */
+
+#ifndef OBJSTORE_H
+#define OBJSTORE_H
+
+#include <stdlib.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <sys/types.h>		/* off_t */
+
+#ifdef OBJSTORE_INTERNALS
+
+#include "system.h" /* System-specific constants */
+
+struct ObjStore {
+ int main_fd; /* Primary file descriptor */
+ int log_fd; /* Log file descriptor */
+ int pagesize; /* Page size */
+ int pageshift; /* log2(pagesize) */
+ void *arena; /* Mapped memory zone */
+ size_t arena_len; /* Length of arena */
+ char *pageinfo; /* Page info pointer */
+ size_t loaded_count; /* Loaded pages count (accounting) */
+ size_t dirty_count; /* Dirty pages count (accounting) */
+ struct sigaction oldact; /* Previous signal action */
+ size_t fork_seq; /* Sequence number of forked processes */
+};
+
+enum ObjStore_RecordType {
+ osrec_page, /* Page data */
+ osrec_commit, /* Commit record */
+};
+
+#define LOGRECORD_MAGIC 0x9247746e
+
+struct ObjStore_LogRecord {
+ unsigned int magic; /* Magic number; for verification */
+  unsigned int record_type;	/* Record type (enum ObjStore_RecordType) */
+ size_t size; /* Data byte count (sequence # for commit) */
+ off_t offset; /* Offset of data */
+};
+
+extern struct ObjStore *objstore_os_struct;
+
+#else /* not OBJSTORE_INTERNALS */
+
+struct ObjStore;
+
+#endif /* not OBJSTORE_INTERNALS */
+
+void *objstore_init(char *main_file, char *log_file, size_t *arena_len);
+int objstore_checkpoint(double gc_factor);
+int objstore_extend(size_t new_size);
+
+#endif
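The log file declared above is a flat stream of struct ObjStore_LogRecord headers: an osrec_page record is followed immediately by record.size bytes of raw page data destined for arena offset record.offset, while an osrec_commit record (whose size field carries the checkpoint sequence number) marks a consistent cut-off point for recovery. A read-only walk of that format, assuming the header is included as "lpsm.h" with OBJSTORE_INTERNALS defined (illustrative; not part of the library):

/* Sketch only: dump the record stream of an lpsm/objstore log file. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

#define OBJSTORE_INTERNALS 1
#include "lpsm.h"

int main(int argc, char *argv[])
{
    struct ObjStore_LogRecord rec;
    int fd = open(argc > 1 ? argv[1] : "test.log", O_RDONLY);

    if ( fd < 0 )
	return 1;

    while ( read(fd, &rec, sizeof rec) == sizeof rec &&
	    rec.magic == LOGRECORD_MAGIC ) {
	if ( rec.record_type == osrec_commit ) {
	    printf("commit, sequence %lu\n", (unsigned long)rec.size);
	} else if ( rec.record_type == osrec_page ) {
	    printf("%lu bytes of page data at arena offset %lu\n",
		   (unsigned long)rec.size, (unsigned long)rec.offset);
	    lseek(fd, rec.size, SEEK_CUR);	/* skip the raw page data */
	} else {
	    break;				/* unknown record type */
	}
    }
    close(fd);
    return 0;
}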
diff --git a/system.h b/system.h
new file mode 100644
index 0000000..9d74409
--- /dev/null
+++ b/system.h
@@ -0,0 +1,29 @@
+#ident "$Id$"
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * system.h
+ *
+ * System-specific constants.
+ */
+
+#ifndef SYSTEM_H
+#define SYSTEM_H
+
+/* These constants are appropriate for Linux/i386 */
+
+/* This is where we map the database file - must be constant */
+#define ARENA_ADDRESS ((void *)0x60000000)
+
+#endif /* SYSTEM_H */
+
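ARENA_ADDRESS has to be the same constant in every process and every run, because data structures kept inside the arena store absolute pointers; mapping the file anywhere else would silently invalidate them. A small illustration of what the fixed address buys (the file names and the slot layout here are made up for the example):

/* Sketch only: an absolute pointer stored in the arena in one run is still
   valid in the next run precisely because the arena is always mapped at the
   fixed ARENA_ADDRESS.  "ptr.dat"/"ptr.log" and the slot layout are invented. */
#include <stdio.h>
#include <string.h>
#include "lpsm.h"

int main(void)
{
    size_t len = 16384;
    char *arena = objstore_init("ptr.dat", "ptr.log", &len);
    char **slot;

    if ( !arena )
	return 1;

    slot = (char **)arena;			/* first word of the arena */
    if ( *slot )
	printf("previous run left: %s\n", *slot);	/* old pointer still valid */

    *slot = arena + 4096;			/* store an absolute pointer */
    strcpy(arena + 4096, "written at a fixed address");
    return objstore_checkpoint(0.0);		/* force writeback */
}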
diff --git a/test_mmap.c b/test_mmap.c
new file mode 100644
index 0000000..445870f
--- /dev/null
+++ b/test_mmap.c
@@ -0,0 +1,21 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <errno.h>
+
+int main(int argc, char *argv[])
+{
+ char *mapping;
+
+ mapping = mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, 0, 0);
+ mmap(mapping+8192, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON|MAP_FIXED, 0, 0);
+ strcpy(mapping+0, "Foo mani padme hum");
+ strcpy(mapping+4096, "Foo mani padme hum");
+ errno = 0;
+ mmap(mapping+4096, 4096, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE|MAP_ANON, 0, 0);
+ munmap(mapping, 16384);
+
+  return 0;
+}
+
+
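test_mmap.c is a scratch experiment with the Linux behaviour that a MAP_FIXED mapping silently replaces whatever was previously mapped in that address range -- the same property objstore_extend() leans on when it maps the additional pages with MAP_FIXED directly after the existing arena. A sketch that makes the replacement visible (illustrative only):

/* Sketch only: MAP_FIXED replaces an existing mapping in place. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    long pg = sysconf(_SC_PAGESIZE);
    char *m = mmap(NULL, 2*pg, PROT_READ|PROT_WRITE,
		   MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

    if ( m == MAP_FAILED )
	return 1;
    strcpy(m + pg, "old contents");

    /* Remap the second page in place; its old contents vanish. */
    if ( mmap(m + pg, pg, PROT_READ|PROT_WRITE,
	      MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == MAP_FAILED )
	return 1;
    printf("after MAP_FIXED remap: \"%s\" (empty means replaced)\n", m + pg);

    munmap(m, 2*pg);
    return 0;
}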
diff --git a/teststore.c b/teststore.c
new file mode 100644
index 0000000..14c40fc
--- /dev/null
+++ b/teststore.c
@@ -0,0 +1,47 @@
+#ident "$Id$"
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000 Transmeta Corporation - All Rights Reserved
+ *
+ * This source module contains confidential and proprietary information
+ * of Transmeta Corporation. It is not to be disclosed or used except
+ * in accordance with applicable agreements. This copyright notice does
+ * not evidence any actual or intended publication of such source code.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "lpsm.h"
+
+int main(int argc, char *argv[])
+{
+ void *buf;
+  size_t arena_len = 16384;
+
+ buf = objstore_init("test.dat", "test.log", &arena_len);
+
+ printf("Read from first page: %s\n", (char *)buf);
+
+ strcpy((char *)buf + 4096, "This is the second page!");
+ strcpy((char *)buf + 8192, "This is the third page!");
+
+ printf("Read from third page: %s\n", (char *)buf + 8192);
+ objstore_checkpoint(0.5);
+
+ strcpy((char *)buf + 8192, "This is also the third page!");
+
+ printf("Read from third page: %s\n", (char *)buf + 8192);
+ objstore_checkpoint(0.0);
+
+ objstore_extend(65536);
+ strcpy((char *)buf + 32768, "This is the ninth page!");
+ objstore_checkpoint(0.0);
+
+ sleep(5);
+
+ return 0;
+}
+
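teststore.c exercises the whole public API: map the arena, dirty a few pages, checkpoint with gc_factor 0.5 (write the log back only once it exceeds half the arena size), force a writeback with 0.0, then grow the arena with objstore_extend(). A sketch of the typical calling pattern in an application (file names and sizes are made up):

/* Sketch only: a typical caller of the objstore_* API declared in lpsm.h. */
#include <string.h>
#include "lpsm.h"

int run_updates(void)
{
    size_t len = 1 << 20;			/* ask for a 1 MB arena */
    char *db = objstore_init("app.dat", "app.log", &len);
    int i;

    if ( !db )
	return -1;

    for ( i = 0 ; i < 100 ; i++ ) {
	memset(db + (size_t)i * 4096, i, 4096);	/* dirty one page */
	if ( objstore_checkpoint(0.5) )		/* log it; gc past 50% of arena */
	    return -1;
    }
    return objstore_checkpoint(0.0);		/* final forced writeback */
}

Passing HUGE_VAL instead of 0.5 would let the log grow without ever being written back inside the checkpoint.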