aboutsummaryrefslogtreecommitdiffstats
path: root/arena.c
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@zytor.com>2001-07-08 21:51:31 +0000
committerH. Peter Anvin <hpa@zytor.com>2001-07-08 21:51:31 +0000
commit7be4b5ec6f20e3ffe13e2a574549e8028faba526 (patch)
tree8e14b9522e61a08a1c09c69639c0797c52b4368b /arena.c
downloadlpsm-7be4b5ec6f20e3ffe13e2a574549e8028faba526.tar.gz
lpsm-7be4b5ec6f20e3ffe13e2a574549e8028faba526.tar.xz
lpsm-7be4b5ec6f20e3ffe13e2a574549e8028faba526.zip
Initial version under CVS control
Diffstat (limited to 'arena.c')
-rw-r--r--arena.c642
1 files changed, 642 insertions, 0 deletions
diff --git a/arena.c b/arena.c
new file mode 100644
index 0000000..60f9602
--- /dev/null
+++ b/arena.c
@@ -0,0 +1,642 @@
+#ident "$Id$"
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2000 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ * USA; either version 2 of the License, or (at your option) any later
+ * version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * objstore.c
+ *
+ * Persistent object store implemented using memory-mapping tricks
+ */
+
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>		/* HUGE_VAL */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>		/* memset() */
#include <unistd.h>
#define __USE_MISC 1		/* Needed to support mremap() */
#define __USE_GNU  1		/* Needed to support mremap() */
#include <sys/mman.h>
#include <sys/stat.h>
#include <sched.h>
+
+#define OBJSTORE_INTERNALS 1
+#include "objstore.h"
+
/*
 * Per-page state, one byte per arena page, kept in os->pageinfo.
 * Pages move unread -> clean on first read fault (loaded from the
 * main file) and clean -> dirty on first write fault; the checkpoint
 * logic writes dirty pages to the log and marks them clean again.
 */
enum page_status {
  page_unread = 0,		/* Never faulted in; contents still on disk */
  page_clean = 1,		/* In memory, identical to backing store */
  page_dirty = 2,		/* Modified since the last checkpoint */
};

/*
 * This is the data structure for the object store.  Note that only
 * one active object store is supported, due to the need to trap
 * SIGSEGV.
 */
struct ObjStore *objstore_os_struct;
+
/*
 * Wrapper for read() which retries on EINTR/EAGAIN and on short
 * reads, so that a successful call transfers exactly `count` bytes
 * unless EOF is reached first.
 *
 * Returns the number of bytes read (possibly short at EOF), or -1 if
 * a hard error occurred before anything was read.
 */
static ssize_t objstore_read(int fd, void *buf, size_t count)
{
  char *p = buf;
  ssize_t done = 0;

  while ( count > 0 ) {
    ssize_t got = read(fd, p, count);

    if ( got > 0 ) {
      p += got;
      count -= got;
      done += got;
    } else if ( got == 0 ) {
      break;			/* EOF: return whatever we got */
    } else if ( errno != EINTR && errno != EAGAIN ) {
      return done ? done : -1;	/* Hard error */
    }
    /* EINTR/EAGAIN: simply retry */
  }

  return done;
}
+
/*
 * Wrapper for write() which retries on EINTR/EAGAIN and on short
 * writes, so that a successful call transfers exactly `count` bytes.
 *
 * `buf` is now const-qualified: the function never writes through it,
 * and the change is backward compatible for all callers.
 *
 * Returns the number of bytes written, or -1 if a hard error occurred
 * before anything was written.
 */
static ssize_t objstore_write(int fd, const void *buf, size_t count)
{
  const char *bufp = buf;
  ssize_t total = 0;
  ssize_t rv;

  while ( count ) {
    rv = write(fd, bufp, count);
    if ( rv == -1 ) {
      if ( errno == EINTR || errno == EAGAIN )
	continue;		/* Transient: retry */
      else
	return total ? total : -1;
    } else if ( rv == 0 ) {
      return total;		/* No progress possible */
    }
    bufp += rv;
    count -= rv;
    total += rv;
  }

  return total;
}
+
/*
 * SIGSEGV handler for persistent object store.
 *
 * Implements demand paging over the arena.  The arena is initially
 * mapped PROT_NONE, so the first *read* of a page faults here: the
 * page is loaded from the main file, made read-only, and marked
 * page_clean.  The first *write* to a clean page faults again: the
 * page is made read/write and marked page_dirty.  A fault that is not
 * SEGV_ACCERR, or that lies outside the arena, reinstalls the default
 * handler and returns so the fault is re-taken fatally.
 *
 * errno is saved and restored so the handler is transparent to the
 * interrupted code.
 */
static void objstore_sigsegv(int signal, siginfo_t *siginfo, void *ptr)
{
  struct ObjStore *os = objstore_os_struct;
  void *page;
  off_t offset;
  char *pageinfo;
  struct flock lock;
  int old_errno = errno;	/* Preserve errno across the handler */
#ifdef __linux__
  struct sigcontext *ctxt;

# ifdef __i386__		/* This is so specific to Linux/i386 */
  if ( siginfo->si_code == 0 ) {
    /* Old kernel.  Fill in data to the best of our knowledge. */
    /* Don't even begin to ask me where the 0x14 comes from */
    ctxt = (struct sigcontext *)((char *)ptr + 0x14);
    if ( ctxt->trapno == 14 ) {
      /* Linux/i386 uses unmapped pages to mimic PROT_NONE, so we can't
	 tell ACCERR and MAPERR apart from the register state */
      siginfo->si_code = SEGV_ACCERR;
      siginfo->si_addr = (void *)ctxt->cr2;
    }
  }
# endif /* __i386__ */
#endif /* __linux__ */

  /* Not our fault (wrong signal, wrong code, or address outside the
     arena): restore the default one-shot disposition and return, so
     re-executing the faulting instruction terminates the process. */
  if ( signal != SIGSEGV || siginfo->si_code != SEGV_ACCERR ||
       ((uintptr_t)siginfo->si_addr - (uintptr_t)os->arena) >= os->arena_len ) {
    struct sigaction dfl;

    dfl.sa_handler = SIG_DFL;
    sigemptyset(&dfl.sa_mask);
    dfl.sa_flags = SA_ONESHOT;	/* NOTE: Linux-specific alias of SA_RESETHAND */
    sigaction(SIGSEGV, &dfl, NULL);

    errno = old_errno;
    return;			/* Re-take fault */
  }

  /* Round the faulting address down to its page and locate the
     corresponding pageinfo state byte. */
  page = (void *)((uintptr_t)siginfo->si_addr & ~(os->pagesize-1));
  offset = (uintptr_t)page - (uintptr_t)os->arena;
  pageinfo = os->pageinfo + (offset >> os->pageshift);

  /* Open the page up so we can fill it / let the write proceed */
  mprotect(page, os->pagesize, PROT_READ|PROT_WRITE);

  switch ( (enum page_status) *pageinfo ) {
  case page_unread:
    /* First access ever: read the page from the main file under a
       shared record lock on that byte range. */
    lseek(os->main_fd, offset, SEEK_SET);

    lock.l_type = F_RDLCK;
    lock.l_whence = SEEK_SET;
    lock.l_start = offset;
    lock.l_len = os->pagesize;
    while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
    if ( objstore_read(os->main_fd, page, os->pagesize) < os->pagesize )
      abort();			/* Uh-oh... */

    lock.l_type = F_UNLCK;
    while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );

    mprotect(page, os->pagesize, PROT_READ); /* Make page readonly */
    *pageinfo = page_clean;	/* Page read and clean */
    os->loaded_count++;		/* For accounting purposes */
    break;

  case page_clean:
    /* First write to a clean page: it stays r/w and becomes dirty. */
    *pageinfo = page_dirty;	/* Page now dirty */
    os->dirty_count++;		/* For accounting purposes */
    /* Leave page r/w */
    break;

  default:
    /* A dirty page is already r/w and should never fault */
    abort();			/* This shouldn't happen */
  }

  errno = old_errno;
}
+
+/*
+ * Routine to do log writeback. Used by initial log recovery routine
+ * as well as during-execution garbage collect.
+ * THIS ROUTINE SHOULD BE INVOKED WITH LOCK HELD ON THE LOG FILE.
+ */
+static int objstore_log_writeback(void)
+{
+ struct ObjStore *os = objstore_os_struct;
+ struct ObjStore_LogRecord record;
+ off_t position, last_commit;
+ struct flock lockmain;
+
+ last_commit = 0; /* Last COMMIT record found */
+ position = lseek(os->log_fd, 0, SEEK_SET);
+
+ while ( objstore_read(os->log_fd, &record, sizeof(record)) == sizeof(record) ) {
+ if ( record.magic != LOGRECORD_MAGIC )
+ break; /* Bad magic, assume rest of log corrupt */
+ if ( record.record_type == osrec_commit ) {
+ /* NOTE: last_commit points to the final byte to examine, thus
+ at the *end* of the final commit record. */
+ position += sizeof(record);
+ last_commit = position; /* Found a commit record */
+ } else if ( record.record_type == osrec_page ) {
+ /* Advance past current page cluster */
+ position = lseek(os->log_fd, record.size, SEEK_CUR);
+ } else {
+ return -1; /* Unknown record - unsafe to process */
+ }
+ }
+
+ /* Now we know where the last commit was. Now we can process
+ everything up to that point. */
+
+ position = lseek(os->log_fd, 0, SEEK_SET);
+
+ while ( objstore_read(os->log_fd, &record, sizeof(record))
+ == sizeof(record) && position < last_commit ) {
+ if ( record.magic != LOGRECORD_MAGIC )
+ break; /* Bad magic, assume rest of log corrupt */
+ if ( record.record_type == osrec_commit ) {
+ /* Found a commit record, do nothing */
+ position += sizeof(record);
+ } else if ( record.record_type == osrec_page ) {
+ /* Write back data to file */
+ char *data;
+
+ position += sizeof(record);
+ if ( !data )
+ return -1; /* Badness... */
+
+ lockmain.l_type = F_WRLCK;
+ lockmain.l_whence = SEEK_SET;
+ lockmain.l_start = record.offset;
+ lockmain.l_len = record.size;
+ while ( fcntl(os->main_fd, F_SETLKW, &lockmain) == -1 && errno == EINTR );
+ data = mmap(NULL, record.size, PROT_WRITE, MAP_SHARED,
+ os->main_fd, record.offset);
+ if ( data == MAP_FAILED )
+ return -1;
+ if ( objstore_read(os->log_fd, data, record.size) != record.size )
+ return -1; /* Badness */
+ if ( munmap(data, record.size) )
+ return -1;
+
+ lockmain.l_type = F_UNLCK;
+ while ( fcntl(os->main_fd, F_SETLKW, &lockmain) == -1 && errno == EINTR );
+ position += record.size;
+ } else {
+ return -1; /* Unknown record - unsafe to process */
+ }
+ }
+
+ /* Log successfully recovered. Truncate. */
+ fsync(os->main_fd);
+ ftruncate(os->log_fd, 0);
+ /* Write initial commit record, for sequence number recovery */
+ record.magic = LOGRECORD_MAGIC;
+ record.record_type = osrec_commit;
+ record.size = os->fork_seq;
+ record.offset = 0x54494d43; /* For debugging */
+ if ( objstore_write(os->log_fd, &record, sizeof(record)) < sizeof(record) )
+ return -1;
+
+ fsync(os->log_fd); /* Indicate log recovery complete */
+
+ return 0;
+}
+
+/*
+ * Routine to do log recovery
+ */
+static int objstore_recover_log(void)
+{
+ struct ObjStore *os = objstore_os_struct;
+ struct flock lock;
+ int rv = 0;
+
+ /* First, lock the log file */
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ /* Do log recovery, and write initial commit record. */
+ rv = objstore_log_writeback();
+
+ /* Increase the sequence number, since we just wrote a commit. */
+ os->fork_seq++;
+
+ /* Unlock file and run. */
+ lock.l_type = F_UNLCK;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ return rv;
+}
+
+/*
+ * Opens the object store. This includes log
+ * playback (crash recovery) if the log file exists
+ * and is nonempty.
+ */
+void *objstore_init(char *main_file, char *log_file, size_t *arena_len)
+{
+ struct ObjStore *os;
+ void *arena_ptr;
+ struct sigaction sigact;
+ struct flock lock;
+ off_t file_len, len = arena_len ? *arena_len : 0;
+ size_t file_pages, len_pages;
+
+ arena_ptr = ARENA_ADDRESS;
+
+ objstore_os_struct = os = malloc(sizeof(struct ObjStore));
+ if ( !os )
+ goto errx0;
+
+ os->fork_seq = 0; /* Initialize sequence counter */
+
+ os->main_fd = open(main_file, O_RDWR|O_CREAT, 0666);
+ if ( os->main_fd < 0 )
+ goto errx1;
+
+ os->pagesize = getpagesize();
+ if ( os->pagesize & (os->pagesize - 1) )
+ goto errx2; /* WTF -- pagesize not a power of 2? */
+
+ /* Compute log2(os->pagesize) */
+ os->pageshift = 0;
+ while ( (1 << os->pageshift) < os->pagesize )
+ os->pageshift++;
+
+ /*
+ * Open log file
+ */
+ os->log_fd = open(log_file, O_RDWR|O_APPEND|O_CREAT, 0666);
+ if ( os->log_fd < 0 )
+ goto errx3;
+
+ /* Now, do log recovery if needed */
+ if ( objstore_recover_log() )
+ goto errx3;
+
+ /* Allocate arena memory space */
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+ file_len = lseek(os->main_fd, 0, SEEK_END);
+ if ( len == 0 ) {
+ len = file_len;
+ }
+ len = (len + os->pagesize - 1) & ~(os->pagesize - 1);
+ if ( len > file_len ) {
+ ftruncate(os->main_fd, len); /* Extend file */
+ }
+ lock.l_type = F_UNLCK;
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ os->arena = mmap(arena_ptr, len, PROT_NONE,
+ MAP_ANON|MAP_PRIVATE|MAP_FIXED, 0, 0);
+ if ( os->arena == MAP_FAILED )
+ goto errx3;
+
+ os->arena_len = len;
+ if ( *arena_len )
+ *arena_len = len;
+
+ os->pageinfo = malloc(len >> os->pageshift);
+ if ( !os->pageinfo )
+ goto errx4;
+
+ /* The pageinfo up to and including file_len is "unread"; beyond
+ file_len we know it must be zero and thus it can be marked "clean" */
+ file_len = (file_len + os->pagesize - 1) & ~(os->pagesize-1);
+ file_pages = file_len >> os->pageshift;
+ len_pages = len >> os->pageshift;
+
+ memset(os->pageinfo, page_unread, file_pages);
+
+ if ( len_pages > file_pages ) {
+ mprotect((char *)os->arena + file_len, len - file_len, PROT_READ);
+ memset(os->pageinfo + file_pages, page_clean, len_pages-file_pages);
+ }
+
+ sigact.sa_sigaction = objstore_sigsegv;
+ sigemptyset(&sigact.sa_mask);
+ sigact.sa_flags = SA_RESTART|SA_SIGINFO;
+ if ( sigaction(SIGSEGV, &sigact, &os->oldact) )
+ goto errx5;
+
+ return os->arena;
+
+ errx5:
+ munmap(os->pageinfo, len >> os->pageshift);
+ errx4:
+ munmap(arena_ptr, len);
+ errx3:
+ if ( os->log_fd >= 0 ) close(os->log_fd);
+ errx2:
+ close(os->main_fd);
+ errx1:
+ free(os);
+ errx0:
+
+ return NULL;
+}
+
+/*
+ * Object store checkpoint. Writes entries to the log file.
+ * The "gc_factor" is the factor of maximum log size file relative
+ * to the arena size. For example, if gc_factor == 0.5 then if the
+ * log size is more than 50% of the arena file size a writeback cycle
+ * will take place after the log has been written. This means other
+ * checkpoints will have to wait!
+ *
+ * Set gc_factor to 0.0 to force a gc, and to HUGE_VAL to inhibit gc.
+ */
+int objstore_checkpoint(double gc_factor)
+{
+ struct ObjStore *os = objstore_os_struct;
+ int f;
+ char *pi, *epi;
+ void *page;
+
+ pi = os->pageinfo;
+ epi = os->pageinfo + (os->arena_len >> os->pageshift);
+
+ f = fork();
+ if ( f < 0 )
+ return 1; /* Checkpoint failed! */
+ else if ( f > 0 ) {
+ /* Parent process -- just mark all dirty pages clean */
+
+ size_t size, count;
+ char *opi;
+ int found_dirty;
+
+ /* Aggregate both clean and dirty pages; this should allow the OS
+ to avoid keeping track of quite as many memory protect regions */
+ for ( pi = os->pageinfo ; pi < epi ; pi++ ) {
+ if ( *pi == page_dirty || *pi == page_clean ) {
+ found_dirty = (*pi == page_dirty);
+ page = (char *)os->arena +
+ ((uintptr_t)(pi - os->pageinfo) << os->pageshift);
+
+ opi = pi;
+ size = os->pagesize;
+ count = 1;
+ while ( pi+1 < epi &&
+ (pi[1] == page_dirty || pi[1] == page_clean) ) {
+ pi++;
+ found_dirty = found_dirty || (*pi == page_dirty);
+ count++;
+ size += os->pagesize;
+ }
+ if ( found_dirty ) {
+ mprotect(page, size, PROT_READ);
+ memset(opi, page_clean, count);
+ }
+ }
+ }
+ os->dirty_count = 0; /* No pages dirty */
+ os->fork_seq++; /* Increase next sequence number */
+ return 0;
+ } else {
+ /* Child process -- do the actual work of writing back dirty pages */
+
+ struct ObjStore_LogRecord record, last_rec;
+ struct flock lock;
+ off_t logsize;
+
+ record.magic = LOGRECORD_MAGIC;
+ record.record_type = osrec_page;
+
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ for (;;) {
+ /* First, lock the entire log file */
+ lock.l_type = F_WRLCK;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ /* Make sure we were indeed next in turn */
+ lseek(os->log_fd, -(off_t)sizeof(last_rec), SEEK_END);
+ if ( objstore_read(os->log_fd, &last_rec, sizeof(last_rec)) < sizeof(last_rec)) {
+ kill(getppid(), SIGABRT); /* Kill main process */
+ _exit(99);
+ }
+ if ( last_rec.size+1 == os->fork_seq )
+ break; /* It's for us... */
+
+ /* Someone else is ahead of us in line. Yield to them. */
+ lock.l_type = F_UNLCK;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ sched_yield(); /* Snore... */
+ }
+
+ /* Write dirty pages to log file */
+ for ( pi = os->pageinfo ; pi < epi ; pi++ ) {
+ if ( *pi == page_dirty ) {
+ page = (char *)os->arena +
+ ((uintptr_t)(pi - os->pageinfo) << os->pageshift);
+ record.offset = (char *)page - (char *)os->arena;
+
+ /* Aggregate contiguous pages into a single record */
+ record.size = os->pagesize;
+ while ( pi+1 < epi && pi[1] == page_dirty ) {
+ pi++;
+ record.size += os->pagesize;
+ }
+
+ if ( objstore_write(os->log_fd, &record, sizeof(record))
+ < sizeof(record) ||
+ objstore_write(os->log_fd, page, record.size) < record.size ) {
+ kill(getppid(), SIGABRT); /* Kill main process */
+ _exit(99);
+ }
+ }
+ }
+
+ /* This might be more efficiently done with fdatasync() */
+ fsync(os->log_fd); /* Make sure we have written everything */
+
+ /* Write commit record */
+ record.record_type = osrec_commit;
+ record.size = os->fork_seq;
+ record.offset = (off_t)0x54494d43;
+ if ( objstore_write(os->log_fd, &record, sizeof(record)) < sizeof(record) ) {
+ kill(getppid(), SIGABRT);
+ _exit(99);
+ }
+ fsync(os->log_fd);
+
+ /* Check to see if it's time for garbage collect */
+ logsize = lseek(os->log_fd, 0, SEEK_END);
+ if ( gc_factor < HUGE_VAL && (double)logsize >= gc_factor*os->arena_len ) {
+ /* Replaying the log isn't the most efficient way to do this.
+ We could also keep a status bit per page around, and flush
+ them out of the shadow array. The biggest problem with that
+ is that it probably can't be done in the background, unlike
+ this method. Leave this as-is for now. */
+ if ( objstore_log_writeback() ) {
+ kill(getppid(), SIGABRT);
+ _exit(99);
+ }
+ }
+
+ /* Drop lock on log file */
+ lock.l_type = F_UNLCK;
+ while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ _exit(0); /* Done! */
+ }
+}
+
+/*
+ * Extend the size of the object store.
+ *
+ * This currently relies on several Linux-specific features,
+ * specifically mremap() and knowing that we probably can extend
+ * it without changing the virtual address.
+ */
+int objstore_extend(size_t new_size)
+{
+ struct ObjStore *os = objstore_os_struct;
+ struct flock lock;
+ void *newp, *infop;
+ off_t file_size;
+ int ft;
+ size_t add_size, old_size;
+ size_t add_pages, old_pages, new_pages, file_pages;
+
+ old_size = os->arena_len;
+
+ if ( new_size <= old_size )
+ return 0; /* No action */
+
+ new_size = (new_size + os->pagesize - 1) & ~(os->pagesize - 1);
+ add_size = new_size - old_size;
+
+ lock.l_type = F_WRLCK;
+ lock.l_whence = SEEK_SET;
+ lock.l_start = 0;
+ lock.l_len = 0;
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+
+ lock.l_type = F_UNLCK;
+ file_size = lseek(os->main_fd, 0, SEEK_END);
+ if ( file_size < new_size )
+ ft = ftruncate(os->main_fd, new_size);
+ else
+ ft = 0;
+
+ while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR );
+ if ( ft )
+ return -1; /* Failure */
+
+ newp = mmap((char*)os->arena + old_size,
+ add_size,
+ PROT_NONE,
+ MAP_PRIVATE|MAP_ANON|MAP_FIXED, 0, 0);
+
+ if ( newp == MAP_FAILED )
+ return -1; /* Failure */
+
+ /* Since we specified MAP_FIXED, this should be guaranteed */
+ assert( newp == (char*)os->arena + old_size );
+
+ /* Convert sizes to pages */
+ file_size = (file_size + os->pagesize - 1) & ~(os->pagesize-1);
+ new_pages = new_size >> os->pageshift;
+ old_pages = old_size >> os->pageshift;
+ file_pages = file_size >> os->pageshift;
+ add_pages = new_pages - old_pages;
+
+ infop = realloc(os->pageinfo, new_pages);
+ if ( !infop ) {
+ munmap(newp, add_size);
+ return -1; /* Failure */
+ }
+
+ os->arena_len = new_size;
+ os->pageinfo = infop;
+
+ /* If we extended the file, the new area is known to contain
+ zero, and can thus be considered "clean"; otherwise we have
+ to consider it "unread". */
+ if ( file_pages > old_pages ) {
+ memset(os->pageinfo + old_pages, page_unread, file_pages-old_pages);
+ }
+ if ( file_pages < new_pages ) {
+ memset(os->pageinfo + file_pages, page_clean, new_pages-file_pages);
+ mprotect((char *)os->arena + file_size, new_size-file_size, PROT_READ);
+ }
+
+ return 0;
+}