Martin Cracauer's userfaultfd examples
This is an example program illustrating the use of Linux's new API for user-level handling of page faults.
Begin copy'n'paste
/* * Example program about using userfaultfd(2) for garbage collection. * * This establishes a couple pages, all of which are filled from * compressed files on disk when first accessed. For simplicity these are * one file per page. Files are written at the beginning of the program. * * Later, this program demonstrates the use of write protection to get * a notification on write access, analogous to using mprotect(!PROT_WRITE) * and doing the bookkeeping in a SIGSEGV handler. * */ #include <linux/userfaultfd.h> #include <stdio.h> #include <stdlib.h> #include <errno.h> #include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <sys/ioctl.h> #include <time.h> #include <math.h> #include <unistd.h> #include <strings.h> #include <unistd.h> #include <asm/unistd.h> #include <poll.h> #include <pthread.h> const int pagesize = 4096; // Bookkeeping about pages. Part of the GC, not of userfaultfd handling struct page_attr { int has_been_brought_in_p; int has_been_written_when_wp_p; }; // Random parameters we pass to threads instead of using globals struct pass2uffd_thread { int fd; long long *begin; size_t size; size_t n_pages; struct page_attr *pages; }; void floatsleep(float seconds) { struct timespec req; req.tv_sec = floor(seconds); req.tv_nsec = (float)(seconds - req.tv_sec) * 1000000000.0; nanosleep(&req, NULL); } // This is doing the work in the uffd handler thread void *handler(void *data) { struct pass2uffd_thread *params = data; int fd = params->fd; printf("thread: fd is %d\n",fd); for (;;) { struct uffd_msg msg; struct pollfd pollfd[1]; pollfd[0].fd = params->fd; pollfd[0].events = POLLIN; int pollres; pollres = poll(pollfd, 1, -1); switch (pollres) { case -1: perror("poll userfaultfd"); continue; break; case 0: continue; break; case 1: break; default: fprintf(stderr, "got %d fds out of poll\n", pollres); exit(2); } if (pollfd[0].revents & POLLERR) { fprintf(stderr, "POLLERR on userfaultfd\n"); exit(1); } if (!(pollfd[0].revents & 
POLLIN)) { continue; } int readret; readret = read(fd, &msg, sizeof(msg)); if (readret == -1) { if (errno == EAGAIN) continue; perror("read userfaultfd"); } if (readret != sizeof(msg)) { fprintf(stderr, "short read, not expected, exiting\n"); exit(1); } if (msg.event & UFFD_EVENT_PAGEFAULT) printf("==> Event is pagefault on %p flags 0x%llx write? 0x%llx wp? 0x%llx\n" , (void *)msg.arg.pagefault.address , msg.arg.pagefault.flags , msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE , msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP ); long long addr = msg.arg.pagefault.address; long long page_begin = addr - (addr % pagesize); long long whichpage = (page_begin - (long long)params->begin) / pagesize; if (whichpage > params->n_pages) { fprintf(stderr, "Page %lld too high\n", whichpage); exit(1); } printf("Messing with page %lld\n", whichpage); /* * Proper sequence is important here. * * For the GC we expect that write-protected pages can only * be pages already backed by physical pages. * Regular writes into unprotected pages that come before * reads need the page be filled. * * So we do the WP case first and get it out of the way. * Then both of the other cases need the page read. */ if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { // send write unlock struct uffdio_writeprotect wp; wp.range.start = (long long)params->begin; wp.range.len = (long long)params->size; wp.mode = 0; printf("sending !UFFDIO_WRITEPROTECT event to userfaultfd\n"); fflush(stdout); if (ioctl(fd, UFFDIO_WRITEPROTECT, &wp) == -1) { perror("ioctl(UFFDIO_WRITEPROTECT)"); } params->pages[whichpage].has_been_written_when_wp_p = 1; continue; } // Page has never been filled, so do that now. // Note that this relies on user only write-protecting pages // after they have been filled. That won't be the case // in a real GC. 
FILE *f; char cmdname[8192]; snprintf(cmdname, sizeof(cmdname), "zcat tmp%lld.gz", whichpage); f = popen(cmdname, "r"); if (f == NULL) { perror("popen zcat"); exit(1); } char buf[pagesize]; if (fread(buf, pagesize, 1, f) == 0) { perror("fread"); exit(1); } if (fclose(f)) { perror("fclose"); exit(1); } struct uffdio_copy cp; cp.src = (long long)buf; cp.dst = (long long)addr; cp.len = (long long)pagesize; cp.mode = 0; // fixme - is there a symbol for this? printf("sending UFFDIO_COPY event to userfaultfd\n"); fflush(stdout); if (ioctl(fd, UFFDIO_COPY, &cp) == -1) { perror("ioctl(UFFDIO_COPY)"); } } return NULL; } // this function can be thread main body or directly run void *do_some_work(void *data) { struct pass2uffd_thread *params = data; long long *region = params->begin; floatsleep(0.2); printf("worker writing into write-protected area at %p\n", region); fflush(stdout); *region = 43; printf("I survived that\n"); fflush(stdout); floatsleep(0.2); return NULL; } void write_testfiles() { long long word = 0xDEADBEEFDEADBEEF; FILE *f; char *cmds[] = { "gzip > tmp0.gz", "gzip > tmp1.gz", "gzip > tmp2.gz", "gzip > tmp3.gz", NULL }; char **cmd; for (cmd = cmds; *cmd; cmd++) { f = popen(*cmd, "w"); if (f == NULL) { perror("popen gzip"); } int i; for (i = 0; i < pagesize / sizeof(word); i++) { if (!fwrite(&word, sizeof(word), 1, f)) { perror("fwrite"); exit(1); } } if (fclose(f)) { perror("fclose"); exit(1); } } } int main(int argc, char *argv[]) { long long *region; const int n_pages = 128; pthread_t uffd_thread; int uffd; write_testfiles(); printf("userfaultf syscall #: %d\n", __NR_userfaultfd); uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); if (uffd == -1) { perror("syscall"); exit(2); } int uffd_flags; uffd_flags = fcntl(uffd, F_GETFD, NULL); printf("userfaultfd flags: 0x%llX, fd is %d\n", (long long)uffd_flags, uffd); struct uffdio_api uffdio_api; uffdio_api.api = UFFD_API; uffdio_api.features = 0; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { fprintf(stderr, 
"UFFDIO_API\n"); return 1; } printf("Features: 0x%llx\n", uffdio_api.features); if (uffdio_api.api != UFFD_API) { fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api); return 1; } #if 0 printf("userfaultfd api: 0x%llX -> 0x%llX\n", UFFD_API, uffdio_api.api); printf("userfaultfd ioctls: 0x%llX (0x%llx)\n", uffdio_api.ioctls, (long long)UFFDIO_REGISTER); #endif region = (long long *) mmap(NULL, pagesize * n_pages, PROT_READ|PROT_WRITE , MAP_PRIVATE|MAP_ANON, -1, 0); if (!region) { perror("mmap"); exit(2); } if (posix_memalign((void **)®ion, pagesize, pagesize * n_pages)) { fprintf(stderr, "cannot align by pagesize %d\n", pagesize); exit(1); } printf("mapped at %p - %p\n", region, region + pagesize * n_pages); struct uffdio_register uffdio_register; uffdio_register.range.start = (unsigned long)region; uffdio_register.range.len = pagesize * n_pages; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { perror("ioctl(UFFDIO_REGISTER)"); exit(1); } printf("userfaultfd ioctls: 0x%llx\n", uffdio_register.ioctls); int expected = UFFD_API_RANGE_IOCTLS; if ((uffdio_register.ioctls & expected) != expected) { fprintf(stderr, "ioctl set is incorrect\n"); exit(1); } // Our bookkeeping. Part of the GC, has nothing to do with // userfaultfd. Updated in the uffd thread. struct page_attr *pages; pages = malloc(n_pages * sizeof(struct page_attr)); if (pages == NULL) { perror("malloc"); exit(1); } bzero(pages, n_pages * sizeof(struct page_attr)); /* * Set up and start uffd thread. 
*/ struct pass2uffd_thread thr_params; thr_params.fd = uffd; thr_params.begin = region; thr_params.size = pagesize * n_pages; thr_params.n_pages = n_pages; thr_params.pages = pages; pthread_create(&uffd_thread, NULL, handler, &thr_params); printf("mainline testing read on page 0\n"); fflush(stdout); printf("region first word currently is: 0x%llx\n", *region); printf("mainline testing read on page 2\n"); fflush(stdout); printf("region first word currently is: 0x%llx\n" , *(region + 2 * pagesize / sizeof(*region))); printf("mainline writing writable page 1\n"); fflush(stdout); *(region + pagesize / sizeof(*region)) = 0x42; // test write protect on first page struct uffdio_writeprotect wp; wp.range.start = (unsigned long)region; wp.range.len = pagesize * n_pages; wp.mode = UFFDIO_WRITEPROTECT_MODE_WP; if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) == -1) { perror("ioctl(UFFDIO_WRITEPROTECT)"); exit(1); } printf("worker writing into write-protected page 0\n"); fflush(stdout); *region = 0x43; printf("I survived that\n"); fflush(stdout); if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { fprintf(stderr, "ioctl unregister failure\n"); return 1; } printf("pages first words currently are:\n" "0x%llx/0x%llx 0x%llx/0x%llx 0x%llx/0x%llx \n" , *region , *(region + 1) , *(region + pagesize / sizeof(*region)) , *(region + 1 + pagesize / sizeof(*region)) , *(region + 2 * pagesize / sizeof(*region)) , *(region + 1 + 2* pagesize / sizeof(*region)) ); free(pages); // fixme - various other cleanup return 0; }---------- end copy and paste -----
Documentation diff
Here is a documentation diff I submitted that IIRC hasn't been merged yet. It mentions the pitfalls I encountered and goes into details on how to deal with write protection faults.

diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt
index 70a3c94..2dd533b 100644
--- a/Documentation/vm/userfaultfd.txt
+++ b/Documentation/vm/userfaultfd.txt
@@ -81,6 +81,55 @@ UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an
 half copied page since it'll keep userfaulting until the copy has
 finished.
 
+Notes:
+- if you requested UFFDIO_REGISTER_MODE_MISSING when registering then
+  you must provide some kind of page in your thread after reading from
+  the uffd. You must provide either UFFDIO_COPY or UFFDIO_ZEROPAGE.
+  The normal behavior of the OS automatically providing a zero page on
+  an anonymous mapping is not in place.
+- none of the page-delivering ioctls default to the range that you
+  registered with. You must fill in all fields for the appropriate
+  ioctl struct including the range.
+- you get the address of the access that triggered the missing page
+  event out of a struct uffd_msg that you read in the thread from the
+  uffd. You can supply as many pages as you want with UFFDIO_COPY or
+  UFFDIO_ZEROPAGE. Keep in mind that unless you used DONTWAKE then
+  the first of any of those IOCTLs wakes up the faulting thread.
+- be sure to test for all errors including
+  (pollfd[0].revents & POLLERR). This can happen, e.g. when ranges
+  supplied were incorrect.
+
+
+== Workflow to get notification of written pages ==
+
+This is equivalent to (but faster than) using mprotect and a SIGSEGV
+signal handler.
+
+Register a range with UFFDIO_REGISTER_MODE_WP. Instead of using
+mprotect(2) you use
+ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) while
+mode = UFFDIO_WRITEPROTECT_MODE_WP in the struct passed in.
+The range does not default to and does not have to be identical to the
+range you registered with. You can write protect as many ranges as
+you like (inside the registered range). Then, in the thread reading
+from uffd the struct will have
+msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP set. Now you send
+ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) again
+while uffdio_writeprotect.mode does not have UFFDIO_WRITEPROTECT_MODE_WP set.
+This wakes up the thread which will continue to run with writes now
+allowed. You can do the bookkeeping about the write in the uffd
+reading thread before the ioctl.
+
+If you registered with both
+UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP then you
+need to think about the sequence in which you supply a page and undo
+write protect. Note that there is a difference between writes into a
+WP area and into a !WP area. The former will have
+UFFD_PAGEFAULT_FLAG_WP set, the latter UFFD_PAGEFAULT_FLAG_WRITE.
+The latter did not fail on protection but you still need to supply a
+page when UFFDIO_REGISTER_MODE_MISSING was used.
+
+
 == QEMU/KVM ==
 
 QEMU/KVM is using the userfaultfd syscall to implement postcopy live