diff options
author | H. Peter Anvin <hpa@zytor.com> | 2001-07-08 21:51:31 +0000 |
---|---|---|
committer | H. Peter Anvin <hpa@zytor.com> | 2001-07-08 21:51:31 +0000 |
commit | 7be4b5ec6f20e3ffe13e2a574549e8028faba526 (patch) | |
tree | 8e14b9522e61a08a1c09c69639c0797c52b4368b /arena.c | |
download | lpsm-7be4b5ec6f20e3ffe13e2a574549e8028faba526.tar.gz lpsm-7be4b5ec6f20e3ffe13e2a574549e8028faba526.tar.xz lpsm-7be4b5ec6f20e3ffe13e2a574549e8028faba526.zip |
Initial version under CVS control
Diffstat (limited to 'arena.c')
-rw-r--r-- | arena.c | 642 |
1 files changed, 642 insertions, 0 deletions
@@ -0,0 +1,642 @@ +#ident "$Id$" +/* ----------------------------------------------------------------------- * + * + * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * objstore.c + * + * Persistent object store implemented using memory-mapping tricks + */ + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <unistd.h> +#include <signal.h> +#include <inttypes.h> +#include <limits.h> +#include <fcntl.h> +#include <math.h> /* HUGE_VAL */ +#define __USE_MISC 1 /* Needed to support mremap() */ +#define __USE_GNU 1 /* Needed to support mremap() */ +#include <sys/mman.h> +#include <sys/stat.h> +#include <sched.h> + +#define OBJSTORE_INTERNALS 1 +#include "objstore.h" + +enum page_status { + page_unread = 0, + page_clean = 1, + page_dirty = 2, +}; + +/* + * This is the data structure for the object store. Note that only + * one active object store is supported, due to the need to trap + * SIGSEGV. + */ +struct ObjStore *objstore_os_struct; + +/* Wrappers for read() and write() which retries if incomplete */ +static ssize_t objstore_read(int fd, void *buf, size_t count) +{ + char *bufp = buf; + ssize_t total = 0; + ssize_t rv; + + while ( count ) { + rv = read(fd, bufp, count); + if ( rv == -1 ) { + if ( errno == EINTR || errno == EAGAIN ) + continue; + else + return total ? total : -1; + } else if ( rv == 0 ) { + return total; + } + bufp += rv; + count -= rv; + total += rv; + } + + return total; +} + +static ssize_t objstore_write(int fd, void *buf, size_t count) +{ + char *bufp = buf; + ssize_t total = 0; + ssize_t rv; + + while ( count ) { + rv = write(fd, bufp, count); + if ( rv == -1 ) { + if ( errno == EINTR || errno == EAGAIN ) + continue; + else + return total ? total : -1; + } else if ( rv == 0 ) { + return total; + } + bufp += rv; + count -= rv; + total += rv; + } + + return total; +} + +/* + * SIGSEGV handler for persistent object store + */ +static void objstore_sigsegv(int signal, siginfo_t *siginfo, void *ptr) +{ + struct ObjStore *os = objstore_os_struct; + void *page; + off_t offset; + char *pageinfo; + struct flock lock; + int old_errno = errno; +#ifdef __linux__ + struct sigcontext *ctxt; + +# ifdef __i386__ /* This is so specific to Linux/i386 */ + if ( siginfo->si_code == 0 ) { + /* Old kernel. Fill in data to the best of our knowledge. */ + /* Don't even begin to ask me where the 0x14 comes from */ + ctxt = (struct sigcontext *)((char *)ptr + 0x14); + if ( ctxt->trapno == 14 ) { + /* Linux/i386 uses unmapped pages to mimic PROT_NONE, so we can't + tell ACCERR and MAPERR apart from the register state */ + siginfo->si_code = SEGV_ACCERR; + siginfo->si_addr = (void *)ctxt->cr2; + } + } +# endif /* __i386__ */ +#endif /* __linux__ */ + + if ( signal != SIGSEGV || siginfo->si_code != SEGV_ACCERR || + ((uintptr_t)siginfo->si_addr - (uintptr_t)os->arena) >= os->arena_len ) { + struct sigaction dfl; + + dfl.sa_handler = SIG_DFL; + sigemptyset(&dfl.sa_mask); + dfl.sa_flags = SA_ONESHOT; + sigaction(SIGSEGV, &dfl, NULL); + + errno = old_errno; + return; /* Re-take fault */ + } + + page = (void *)((uintptr_t)siginfo->si_addr & ~(os->pagesize-1)); + offset = (uintptr_t)page - (uintptr_t)os->arena; + pageinfo = os->pageinfo + (offset >> os->pageshift); + + mprotect(page, os->pagesize, PROT_READ|PROT_WRITE); + + switch ( (enum page_status) *pageinfo ) { + case page_unread: + lseek(os->main_fd, offset, SEEK_SET); + + lock.l_type = F_RDLCK; + lock.l_whence = SEEK_SET; + lock.l_start = offset; + lock.l_len = os->pagesize; + while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + if ( objstore_read(os->main_fd, page, os->pagesize) < os->pagesize ) + abort(); /* Uh-oh... */ + + lock.l_type = F_UNLCK; + while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + + mprotect(page, os->pagesize, PROT_READ); /* Make page readonly */ + *pageinfo = page_clean; /* Page read and clean */ + os->loaded_count++; /* For accounting purposes */ + break; + + case page_clean: + *pageinfo = page_dirty; /* Page now dirty */ + os->dirty_count++; /* For accounting purposes */ + /* Leave page r/w */ + break; + + default: + abort(); /* This shouldn't happen */ + } + + errno = old_errno; +} + +/* + * Routine to do log writeback. Used by initial log recovery routine + * as well as during-execution garbage collect. + * THIS ROUTINE SHOULD BE INVOKED WITH LOCK HELD ON THE LOG FILE. + */ +static int objstore_log_writeback(void) +{ + struct ObjStore *os = objstore_os_struct; + struct ObjStore_LogRecord record; + off_t position, last_commit; + struct flock lockmain; + + last_commit = 0; /* Last COMMIT record found */ + position = lseek(os->log_fd, 0, SEEK_SET); + + while ( objstore_read(os->log_fd, &record, sizeof(record)) == sizeof(record) ) { + if ( record.magic != LOGRECORD_MAGIC ) + break; /* Bad magic, assume rest of log corrupt */ + if ( record.record_type == osrec_commit ) { + /* NOTE: last_commit points to the final byte to examine, thus + at the *end* of the final commit record. */ + position += sizeof(record); + last_commit = position; /* Found a commit record */ + } else if ( record.record_type == osrec_page ) { + /* Advance past current page cluster */ + position = lseek(os->log_fd, record.size, SEEK_CUR); + } else { + return -1; /* Unknown record - unsafe to process */ + } + } + + /* Now we know where the last commit was. Now we can process + everything up to that point. */ + + position = lseek(os->log_fd, 0, SEEK_SET); + + while ( objstore_read(os->log_fd, &record, sizeof(record)) + == sizeof(record) && position < last_commit ) { + if ( record.magic != LOGRECORD_MAGIC ) + break; /* Bad magic, assume rest of log corrupt */ + if ( record.record_type == osrec_commit ) { + /* Found a commit record, do nothing */ + position += sizeof(record); + } else if ( record.record_type == osrec_page ) { + /* Write back data to file */ + char *data; + + position += sizeof(record); + if ( !data ) + return -1; /* Badness... */ + + lockmain.l_type = F_WRLCK; + lockmain.l_whence = SEEK_SET; + lockmain.l_start = record.offset; + lockmain.l_len = record.size; + while ( fcntl(os->main_fd, F_SETLKW, &lockmain) == -1 && errno == EINTR ); + data = mmap(NULL, record.size, PROT_WRITE, MAP_SHARED, + os->main_fd, record.offset); + if ( data == MAP_FAILED ) + return -1; + if ( objstore_read(os->log_fd, data, record.size) != record.size ) + return -1; /* Badness */ + if ( munmap(data, record.size) ) + return -1; + + lockmain.l_type = F_UNLCK; + while ( fcntl(os->main_fd, F_SETLKW, &lockmain) == -1 && errno == EINTR ); + position += record.size; + } else { + return -1; /* Unknown record - unsafe to process */ + } + } + + /* Log successfully recovered. Truncate. */ + fsync(os->main_fd); + ftruncate(os->log_fd, 0); + /* Write initial commit record, for sequence number recovery */ + record.magic = LOGRECORD_MAGIC; + record.record_type = osrec_commit; + record.size = os->fork_seq; + record.offset = 0x54494d43; /* For debugging */ + if ( objstore_write(os->log_fd, &record, sizeof(record)) < sizeof(record) ) + return -1; + + fsync(os->log_fd); /* Indicate log recovery complete */ + + return 0; +} + +/* + * Routine to do log recovery + */ +static int objstore_recover_log(void) +{ + struct ObjStore *os = objstore_os_struct; + struct flock lock; + int rv = 0; + + /* First, lock the log file */ + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + + /* Do log recovery, and write initial commit record. */ + rv = objstore_log_writeback(); + + /* Increase the sequence number, since we just wrote a commit. */ + os->fork_seq++; + + /* Unlock file and run. */ + lock.l_type = F_UNLCK; + while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + + return rv; +} + +/* + * Opens the object store. This includes log + * playback (crash recovery) if the log file exists + * and is nonempty. + */ +void *objstore_init(char *main_file, char *log_file, size_t *arena_len) +{ + struct ObjStore *os; + void *arena_ptr; + struct sigaction sigact; + struct flock lock; + off_t file_len, len = arena_len ? *arena_len : 0; + size_t file_pages, len_pages; + + arena_ptr = ARENA_ADDRESS; + + objstore_os_struct = os = malloc(sizeof(struct ObjStore)); + if ( !os ) + goto errx0; + + os->fork_seq = 0; /* Initialize sequence counter */ + + os->main_fd = open(main_file, O_RDWR|O_CREAT, 0666); + if ( os->main_fd < 0 ) + goto errx1; + + os->pagesize = getpagesize(); + if ( os->pagesize & (os->pagesize - 1) ) + goto errx2; /* WTF -- pagesize not a power of 2? */ + + /* Compute log2(os->pagesize) */ + os->pageshift = 0; + while ( (1 << os->pageshift) < os->pagesize ) + os->pageshift++; + + /* + * Open log file + */ + os->log_fd = open(log_file, O_RDWR|O_APPEND|O_CREAT, 0666); + if ( os->log_fd < 0 ) + goto errx3; + + /* Now, do log recovery if needed */ + if ( objstore_recover_log() ) + goto errx3; + + /* Allocate arena memory space */ + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + file_len = lseek(os->main_fd, 0, SEEK_END); + if ( len == 0 ) { + len = file_len; + } + len = (len + os->pagesize - 1) & ~(os->pagesize - 1); + if ( len > file_len ) { + ftruncate(os->main_fd, len); /* Extend file */ + } + lock.l_type = F_UNLCK; + while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + + os->arena = mmap(arena_ptr, len, PROT_NONE, + MAP_ANON|MAP_PRIVATE|MAP_FIXED, 0, 0); + if ( os->arena == MAP_FAILED ) + goto errx3; + + os->arena_len = len; + if ( *arena_len ) + *arena_len = len; + + os->pageinfo = malloc(len >> os->pageshift); + if ( !os->pageinfo ) + goto errx4; + + /* The pageinfo up to and including file_len is "unread"; beyond + file_len we know it must be zero and thus it can be marked "clean" */ + file_len = (file_len + os->pagesize - 1) & ~(os->pagesize-1); + file_pages = file_len >> os->pageshift; + len_pages = len >> os->pageshift; + + memset(os->pageinfo, page_unread, file_pages); + + if ( len_pages > file_pages ) { + mprotect((char *)os->arena + file_len, len - file_len, PROT_READ); + memset(os->pageinfo + file_pages, page_clean, len_pages-file_pages); + } + + sigact.sa_sigaction = objstore_sigsegv; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = SA_RESTART|SA_SIGINFO; + if ( sigaction(SIGSEGV, &sigact, &os->oldact) ) + goto errx5; + + return os->arena; + + errx5: + munmap(os->pageinfo, len >> os->pageshift); + errx4: + munmap(arena_ptr, len); + errx3: + if ( os->log_fd >= 0 ) close(os->log_fd); + errx2: + close(os->main_fd); + errx1: + free(os); + errx0: + + return NULL; +} + +/* + * Object store checkpoint. Writes entries to the log file. + * The "gc_factor" is the factor of maximum log size file relative + * to the arena size. For example, if gc_factor == 0.5 then if the + * log size is more than 50% of the arena file size a writeback cycle + * will take place after the log has been written. This means other + * checkpoints will have to wait! + * + * Set gc_factor to 0.0 to force a gc, and to HUGE_VAL to inhibit gc. + */ +int objstore_checkpoint(double gc_factor) +{ + struct ObjStore *os = objstore_os_struct; + int f; + char *pi, *epi; + void *page; + + pi = os->pageinfo; + epi = os->pageinfo + (os->arena_len >> os->pageshift); + + f = fork(); + if ( f < 0 ) + return 1; /* Checkpoint failed! */ + else if ( f > 0 ) { + /* Parent process -- just mark all dirty pages clean */ + + size_t size, count; + char *opi; + int found_dirty; + + /* Aggregate both clean and dirty pages; this should allow the OS + to avoid keeping track of quite as many memory protect regions */ + for ( pi = os->pageinfo ; pi < epi ; pi++ ) { + if ( *pi == page_dirty || *pi == page_clean ) { + found_dirty = (*pi == page_dirty); + page = (char *)os->arena + + ((uintptr_t)(pi - os->pageinfo) << os->pageshift); + + opi = pi; + size = os->pagesize; + count = 1; + while ( pi+1 < epi && + (pi[1] == page_dirty || pi[1] == page_clean) ) { + pi++; + found_dirty = found_dirty || (*pi == page_dirty); + count++; + size += os->pagesize; + } + if ( found_dirty ) { + mprotect(page, size, PROT_READ); + memset(opi, page_clean, count); + } + } + } + os->dirty_count = 0; /* No pages dirty */ + os->fork_seq++; /* Increase next sequence number */ + return 0; + } else { + /* Child process -- do the actual work of writing back dirty pages */ + + struct ObjStore_LogRecord record, last_rec; + struct flock lock; + off_t logsize; + + record.magic = LOGRECORD_MAGIC; + record.record_type = osrec_page; + + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + for (;;) { + /* First, lock the entire log file */ + lock.l_type = F_WRLCK; + while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + + /* Make sure we were indeed next in turn */ + lseek(os->log_fd, -(off_t)sizeof(last_rec), SEEK_END); + if ( objstore_read(os->log_fd, &last_rec, sizeof(last_rec)) < sizeof(last_rec)) { + kill(getppid(), SIGABRT); /* Kill main process */ + _exit(99); + } + if ( last_rec.size+1 == os->fork_seq ) + break; /* It's for us... */ + + /* Someone else is ahead of us in line. Yield to them. */ + lock.l_type = F_UNLCK; + while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + + sched_yield(); /* Snore... */ + } + + /* Write dirty pages to log file */ + for ( pi = os->pageinfo ; pi < epi ; pi++ ) { + if ( *pi == page_dirty ) { + page = (char *)os->arena + + ((uintptr_t)(pi - os->pageinfo) << os->pageshift); + record.offset = (char *)page - (char *)os->arena; + + /* Aggregate contiguous pages into a single record */ + record.size = os->pagesize; + while ( pi+1 < epi && pi[1] == page_dirty ) { + pi++; + record.size += os->pagesize; + } + + if ( objstore_write(os->log_fd, &record, sizeof(record)) + < sizeof(record) || + objstore_write(os->log_fd, page, record.size) < record.size ) { + kill(getppid(), SIGABRT); /* Kill main process */ + _exit(99); + } + } + } + + /* This might be more efficiently done with fdatasync() */ + fsync(os->log_fd); /* Make sure we have written everything */ + + /* Write commit record */ + record.record_type = osrec_commit; + record.size = os->fork_seq; + record.offset = (off_t)0x54494d43; + if ( objstore_write(os->log_fd, &record, sizeof(record)) < sizeof(record) ) { + kill(getppid(), SIGABRT); + _exit(99); + } + fsync(os->log_fd); + + /* Check to see if it's time for garbage collect */ + logsize = lseek(os->log_fd, 0, SEEK_END); + if ( gc_factor < HUGE_VAL && (double)logsize >= gc_factor*os->arena_len ) { + /* Replaying the log isn't the most efficient way to do this. + We could also keep a status bit per page around, and flush + them out of the shadow array. The biggest problem with that + is that it probably can't be done in the background, unlike + this method. Leave this as-is for now. */ + if ( objstore_log_writeback() ) { + kill(getppid(), SIGABRT); + _exit(99); + } + } + + /* Drop lock on log file */ + lock.l_type = F_UNLCK; + while ( fcntl(os->log_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + + _exit(0); /* Done! */ + } +} + +/* + * Extend the size of the object store. + * + * This currently relies on several Linux-specific features, + * specifically mremap() and knowing that we probably can extend + * it without changing the virtual address. + */ +int objstore_extend(size_t new_size) +{ + struct ObjStore *os = objstore_os_struct; + struct flock lock; + void *newp, *infop; + off_t file_size; + int ft; + size_t add_size, old_size; + size_t add_pages, old_pages, new_pages, file_pages; + + old_size = os->arena_len; + + if ( new_size <= old_size ) + return 0; /* No action */ + + new_size = (new_size + os->pagesize - 1) & ~(os->pagesize - 1); + add_size = new_size - old_size; + + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + + lock.l_type = F_UNLCK; + file_size = lseek(os->main_fd, 0, SEEK_END); + if ( file_size < new_size ) + ft = ftruncate(os->main_fd, new_size); + else + ft = 0; + + while ( fcntl(os->main_fd, F_SETLKW, &lock) == -1 && errno == EINTR ); + if ( ft ) + return -1; /* Failure */ + + newp = mmap((char*)os->arena + old_size, + add_size, + PROT_NONE, + MAP_PRIVATE|MAP_ANON|MAP_FIXED, 0, 0); + + if ( newp == MAP_FAILED ) + return -1; /* Failure */ + + /* Since we specified MAP_FIXED, this should be guaranteed */ + assert( newp == (char*)os->arena + old_size ); + + /* Convert sizes to pages */ + file_size = (file_size + os->pagesize - 1) & ~(os->pagesize-1); + new_pages = new_size >> os->pageshift; + old_pages = old_size >> os->pageshift; + file_pages = file_size >> os->pageshift; + add_pages = new_pages - old_pages; + + infop = realloc(os->pageinfo, new_pages); + if ( !infop ) { + munmap(newp, add_size); + return -1; /* Failure */ + } + + os->arena_len = new_size; + os->pageinfo = infop; + + /* If we extended the file, the new area is known to contain + zero, and can thus be considered "clean"; otherwise we have + to consider it "unread". */ + if ( file_pages > old_pages ) { + memset(os->pageinfo + old_pages, page_unread, file_pages-old_pages); + } + if ( file_pages < new_pages ) { + memset(os->pageinfo + file_pages, page_clean, new_pages-file_pages); + mprotect((char *)os->arena + file_size, new_size-file_size, PROT_READ); + } + + return 0; +} |