#!/bin/sh

# Copyright 2019 Johannes 'josch' Schauer <josch@debian.org>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# -e: abort as soon as a command fails
# -x: print every command before it is executed (useful in test logs)
# -u: treat the expansion of an unset variable as an error
set -exu

# In this script we are setting up a qemu virtual machine inside which we are
# starting an unprivileged lxc container and test its capabilities to be
# accessible from the outside, reach the outside network itself and generally
# run everything as another user than root.
#
# This could all be implemented as an isolation-machine autopkgtest. Instead we
# prepare a qemu virtual machine because then this script...
#   - can also be run with autopkgtest backends that do not support the
#     isolation-machine restriction
#   - can be run without superuser privileges outside of autopkgtest
#   - allows running tests in an extremely minimal environment with just
#     Essential:yes and other bare necessities
#
# Documentation found anywhere online easily becomes outdated. This script
# serves as a living demonstration of which bits are needed to successfully
# set up unprivileged lxc containers. As it will be executed regularly, one
# can rely on its content being up-to-date and in working order.
#
# We do not use debootstrap to create the chroot tarballs, because debootstrap
# is not able to create a system containing only Essential:yes packages and
# their dependencies. We do want to create the most minimal system so that we
# can also find possibly missing dependencies.
#
# We don't use multistrap because it's unmaintained. Thus we create the initial
# tarballs using mmdebstrap.

if [ -n "${AUTOPKGTEST_TMP+x}" ]; then
	# AUTOPKGTEST_TMP is set, so we assume that this script is executed
	# under autopkgtest --> work inside the directory it provides
	TMPDIR="$AUTOPKGTEST_TMP"
	mkdir -p "$TMPDIR"
else
	# AUTOPKGTEST_TMP is not set, so this script is probably not executed
	# under autopkgtest --> create a temporary directory of our own
	TMPDIR=$(mktemp --directory)
fi

# Pick the mmdebstrap mode. We would like to run mmdebstrap without superuser
# privileges, but
#   - fakechroot mode cannot be used because of #944929
#   - proot mode produces wrong permissions
#   - unshare mode only works if kernel.unprivileged_userns_clone is set to 1
# so unshare mode is chosen when that sysctl reads 1 and root mode otherwise.
case "$(cat /proc/sys/kernel/unprivileged_userns_clone)" in
	1)
		MODE="unshare"
		# the temporary directory might not have read permissions for
		# the unshared user
		chmod a+rx "$TMPDIR"
		;;
	*)
		MODE="root"
		;;
esac

# Create a passphrase-less rsa key pair so that we can log into the system
# running inside qemu with a public key instead of a password. The public half
# is installed as root's authorized_keys by the customize hook and the private
# half is handed to the ssh client via -i. A key left behind by an earlier run
# in the same directory is reused.
[ -e "$TMPDIR/id_rsa" ] || ssh-keygen -q -t rsa -f "$TMPDIR/id_rsa" -N ""

# Write the mmdebstrap customize hook that configures the host system. The
# hook is called with the path of the chroot as its first argument.
#
# The delimiter of the outer here-document is unquoted: "$TMPDIR" is expanded
# right now, while the hook is being generated, whereas everything written as
# "\$..." ends up as a literal "$..." in the hook and is only expanded once the
# hook itself runs.
#
# The address of the container is configured with the key
# lxc.net.0.ipv4.address, which is the spelling documented in
# lxc.container.conf(5).
cat << SCRIPT > "$TMPDIR/customize.sh"
#!/bin/sh
set -exu

rootfs="\$1"

# setup various files in /etc
echo host > "\$rootfs/etc/hostname"
echo "127.0.0.1 localhost host" > "\$rootfs/etc/hosts"
echo "/dev/vda1 / auto errors=remount-ro 0 1" > "\$rootfs/etc/fstab"
cat /etc/resolv.conf > "\$rootfs/etc/resolv.conf"
echo 'net.ipv4.ip_forward=1' > "\$rootfs/etc/sysctl.conf"

# give a trivial password to the root user for easy debugging in case something fails
echo root:abcdef | chroot "\$rootfs" /usr/sbin/chpasswd

# These iptable rules connect the main network interface of the host with the
# lxcbridge to which the container is connected
cat << END > "\$rootfs/etc/iptables/rules.v4"
*nat
:PREROUTING ACCEPT [0:0]
:INPUT ACCEPT [0:0]
:OUTPUT ACCEPT [0:0]
:POSTROUTING ACCEPT [0:0]
-A PREROUTING -i eth0 -p tcp -m tcp --dport 8002 -j DNAT --to-destination 10.0.0.2:8003
-A POSTROUTING -o eth0 -j MASQUERADE
COMMIT
*filter
:INPUT ACCEPT [0:0]
:FORWARD ACCEPT [0:0]
:OUTPUT ACCEPT [0:0]
COMMIT
END

# extlinux config to boot from /dev/vda1 with predictable network interface
# naming and a serial console for logging
cat << END > "\$rootfs/extlinux.conf"
default linux
timeout 0

label linux
kernel /vmlinuz
append initrd=/initrd.img root=/dev/vda1 net.ifnames=0 console=ttyS0
END

# network interface config
# we can use eth0 because we boot with net.ifnames=0 for predictable interface
# names
cat << END > "\$rootfs/etc/network/interfaces"
auto lo
iface lo inet loopback

auto eth0
iface eth0 inet dhcp

auto lxcbridge
iface lxcbridge inet static
  address 10.0.0.1
  netmask 255.255.255.0
  bridge_stp off
  bridge_waitport 0
  bridge_fd 0
  bridge_ports none
END

# add a user and group id range for the root user that can be used by the
# container
echo 'root:1000000:65536' >> "\$rootfs/etc/subuid"
echo 'root:1000000:65536' >> "\$rootfs/etc/subgid"

# write out a config file for the container with only the bare necessities
mkdir "\$rootfs/var/lib/lxc/container"
cat << END > "\$rootfs/var/lib/lxc/container/config"
lxc.include = /usr/share/lxc/config/common.conf
lxc.include = /usr/share/lxc/config/userns.conf
lxc.uts.name = container
lxc.rootfs.path = dir:/srv/container
# enable unprivileged operation
lxc.idmap = u 0 1000000 65536
lxc.idmap = g 0 1000000 65536
# configure networking
lxc.net.0.ipv4.address = 10.0.0.2/24
lxc.net.0.ipv4.gateway = 10.0.0.1
lxc.net.0.link = lxcbridge
lxc.net.0.type = veth
lxc.net.0.flags = up
lxc.net.0.name = eth0
# enable autostart
lxc.start.auto = 1
# disable apparmor because of Debian bug #947863
lxc.apparmor.profile = unconfined
END

# copy in the public key
mkdir "\$rootfs/root/.ssh"
cp "$TMPDIR/id_rsa.pub" "\$rootfs/root/.ssh/authorized_keys"
chroot "\$rootfs" chown 0:0 /root/.ssh/authorized_keys
SCRIPT
chmod +x "$TMPDIR/customize.sh"

# Build the root filesystem of the host system that will run inside qemu. A
# tarball left behind by an earlier run in the same directory is reused.
#
# In addition to the Essential:yes packages we require:
#   openssh-server:      to communicate with the host
#   systemd-sysv:        since init is not Essential:yes anymore, we have to pick
#                        an init system ourselves. We pick the one that is
#                        currently the default
#   ifupdown:            for /etc/network/interfaces
#   netbase:             for networking
#   isc-dhcp-client:     to acquire the IP address from qemu via dhcp
#   udev:                to populate /dev
#   policykit-1:         or otherwise systemctl will error with
#                        "Failed to connect to bus: No such file or directory"
#   linux-image-amd64:   as kernel
#   lxc:                 that's what we are testing ;)
#   bridge-utils:        required to bring lxcbridge network interface up
#   procps:              for /bin/ps
#   iptables-persistent: for /etc/iptables/rules.v4
#   dnsmasq-base:        NOTE(review): installed as well, but the reason is not
#                        documented in this script -- confirm why it is needed
host_packages="openssh-server,systemd-sysv,ifupdown,netbase,isc-dhcp-client"
host_packages="$host_packages,udev,policykit-1,linux-image-amd64,lxc"
host_packages="$host_packages,bridge-utils,procps,iptables-persistent,dnsmasq-base"
if [ -e "$TMPDIR/debian-unstable-host.tar" ]; then
	: # nothing to do, the tarball exists already
else
	mmdebstrap --mode="$MODE" --variant=apt \
		--include="$host_packages" \
		--customize-hook="$TMPDIR/customize.sh" \
		unstable "$TMPDIR/debian-unstable-host.tar"
fi

# Build the root filesystem of the container. We do this now instead of later
# inside qemu because running mmdebstrap without kvm just wastes cpu cycles.
#
# mmdebstrap writes the tarball to standard output from where it is piped into
# a small Python script that adds 1000000 to the user id and the group id of
# every tar member. A tarball left behind by an earlier run is reused.
if ! [ -e "$TMPDIR/debian-unstable-container.tar" ]; then
	mmdebstrap --mode="$MODE" --variant=apt --include=systemd-sysv \
		--customize-hook='echo container > "$1/etc/hostname"' \
		--customize-hook='cat /etc/resolv.conf > "$1/etc/resolv.conf"' \
		unstable - \
		| python3 -c '
import sys
import tarfile

SHIFT = 1000000

with tarfile.open(fileobj=sys.stdin.buffer, mode="r|*") as src, \
    tarfile.open(fileobj=sys.stdout.buffer, mode="w|") as dst:
    for entry in src:
        entry.uid += SHIFT
        entry.gid += SHIFT
        if not entry.isfile():
            dst.addfile(entry)
            continue
        with src.extractfile(entry) as payload:
            dst.addfile(entry, payload)
' > "$TMPDIR/debian-unstable-container.tar"
fi

# Use guestfish to assemble the bootable disk image of the host system. The
# individual guestfish commands are separated by " : " and run in order:
#
#  - create a 2G disk image with a single MBR partition formatted as ext2
#  - unpack the rootfs tarball of the host system into that partition
#  - unpack the tarball of the container into /srv/container
#  - put a syslinux MBR into the first 440 bytes of the drive (mbr.bin is
#    uploaded into the image temporarily and removed again afterwards)
#  - install extlinux and make the partition bootable
#
# useful stuff to debug any errors:
#   LIBGUESTFS_BACKEND_SETTINGS=force_tcg
#   libguestfs-test-tool || true
#   export LIBGUESTFS_DEBUG=1 LIBGUESTFS_TRACE=1
guestfish -N "$TMPDIR/host.img"=disk:2G -- \
	part-disk /dev/sda mbr : \
	mkfs ext2 /dev/sda1 : \
	mount /dev/sda1 / : \
	tar-in "$TMPDIR/debian-unstable-host.tar" / : \
	mkdir /srv/container : \
	tar-in "$TMPDIR/debian-unstable-container.tar" /srv/container : \
	upload /usr/lib/SYSLINUX/mbr.bin /mbr.bin : \
	copy-file-to-device /mbr.bin /dev/sda size:440 : \
	rm /mbr.bin : \
	extlinux / : \
	sync : \
	umount / : \
	part-set-bootable /dev/sda 1 true : \
	shutdown

# Start the host system with qemu and let it run in the background. The
# options, in the order in which they are passed:
#
#   -M accel=kvm:tcg     prefer using kvm but fall back to tcg if not available
#   -no-user-config      do not load any user-provided qemu configuration
#   -object rng-random / -device virtio-rng-pci
#                        avoid entropy starvation by feeding the guest with
#                        random bits from /dev/urandom
#   -m 1G                the default memory size of 128 MiB is not enough for
#                        Debian, so we go with 1G
#   -net nic,model=virtio
#                        use a virtio network card instead of emulating a real
#                        network device
#   -nographic           we don't need any graphics
#   -serial mon:stdio    multiplex the serial console and the qemu monitor to
#                        stdio
#   -net user,hostfwd=...
#                        redirect tcp connections on port 10022 localhost to
#                        the host system port 22 and tcp connections on port
#                        8001 localhost to the host system port 8002
#   -drive ...           attach the disk image created above as a raw virtio
#                        drive
#
# All output is redirected to qemu.log and standard input is connected to
# /dev/null.
qemu-system-x86_64 \
	-M accel=kvm:tcg \
	-no-user-config \
	-object rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0 \
	-m 1G \
	-net nic,model=virtio \
	-nographic \
	-serial mon:stdio \
	-net user,hostfwd=tcp:127.0.0.1:10022-:22,hostfwd=tcp:127.0.0.1:8001-:8002 \
	-drive file="$TMPDIR/host.img",format=raw,if=virtio \
	> "$TMPDIR/qemu.log" </dev/null 2>&1 &

# remember the pid of qemu so that it can be killed or waited for later
QEMUPID=$!

# Exit handler for failed runs: dumps the qemu log, tries to shut the virtual
# machine down, kills qemu and prints instructions for inspecting the leftover
# files by hand.
#
# Globals read:
#   TMPDIR   directory containing qemu.log, host.img and the other artifacts
#   QEMUPID  process id of the qemu instance running in the background
#   ssh      ssh command line used to reach the virtual machine
#   TIMEOUT  ssh connect timeout in seconds (falls back to 5 if still unset)
#
# The script runs under "set -eu", which also applies while this handler is
# executing. Commands that are expected to fail on the error path are
# therefore guarded with "|| true"; otherwise the handler would be aborted
# before qemu is killed and before the debugging hints are printed.
onerror() {
	cat --show-nonprinting "$TMPDIR/qemu.log"
	# attempt poweroff; this cannot succeed if the machine is not reachable
	# via ssh, which is one of the main reasons for ending up in here
	$ssh -o ConnectTimeout="${TIMEOUT:-5}" root@localhost systemctl poweroff || true
	# give a few seconds for poweroff
	sleep 10
	# qemu is usually gone already if the poweroff worked
	kill "$QEMUPID" || true
	# turn off verbose output
	set +x
	echo "script failed -- temporary files are stored in $TMPDIR:"
	echo
	ls -lha "$TMPDIR"
	echo
	echo "to test yourself, run qemu with:"
	echo
	echo "    $ qemu-system-x86_64 -no-user-config -m 1G -net nic,model=virtio -nographic -serial mon:stdio -net user,hostfwd=tcp:127.0.0.1:10022-:22,hostfwd=tcp:127.0.0.1:8001-:8002 -drive file=\"$TMPDIR/host.img\",format=raw,if=virtio"
	echo
	echo "and log in using:"
	echo
	echo "    user: root"
	echo "    pass: abcdef"
	echo
	echo "or connect to it via ssh:"
	echo
	echo "    $ $ssh root@localhost"
	echo
	echo "when you are done, cleanup temporary files with:"
	echo
	echo "    $ rm -r \"$TMPDIR\""
}

# The ssh command line used for every connection to the virtual machine:
#   - known hosts are not stored and host keys are not checked
#   - we identify ourselves with the rsa key generated above
#   - pseudo terminal allocation is disabled (-T) or otherwise, programs
#     executed via ssh might wait for input on stdin of the ssh process
#   - port 10022 on localhost is what qemu forwards to port 22 of the machine
# The variable is expanded without quotes wherever it is used so that it is
# split into the individual words again.
ssh="ssh -oUserKnownHostsFile=/dev/null -oStrictHostKeyChecking=no -i $TMPDIR/id_rsa -T -p 10022"

# show the log and kill qemu in case the script exits before the regular
# shutdown is reached
trap onerror EXIT

# Wait until the virtual machine accepts ssh connections.
#
# We use sleepenh to make sure that we wait the right number of seconds
# independent of how long the command took beforehand. An exit status of 1 of
# sleepenh is tolerated (NOTE(review): presumably it signals that the point in
# time to sleep until has passed already -- confirm with sleepenh(1)).
TIMESTAMP=$(sleepenh 0 || [ $? -eq 1 ])
# the timeout in seconds
TIMEOUT=5
# the maximum number of tries
NUM_TRIES=20
# the number of failed tries so far
i=0
until [ "$i" -ge "$NUM_TRIES" ]; do
	# with an exit code of zero, the ssh connection was successful and we
	# are done waiting
	if $ssh -o ConnectTimeout=$TIMEOUT root@localhost echo success; then
		break
	fi
	# if the attempt took less than $TIMEOUT seconds, wait the remaining time
	TIMESTAMP=$(sleepenh $TIMESTAMP $TIMEOUT || [ $? -eq 1 ])
	i=$((i+1))
done

# the counter only reaches the maximum if every single try failed
if [ "$i" -eq "$NUM_TRIES" ]; then
	echo "timeout reached: unable to connect to qemu via ssh"
	exit 1
fi

# write out a file with the expected output of lxc-ls
#
# Both lines intentionally end in spaces (presumably because lxc-ls pads every
# column, including the last one). diff compares the text exactly, so the
# trailing whitespace must not be stripped by an editor.
cat << END > "$TMPDIR/expected"
NAME      STATE   AUTOSTART IPV4     UNPRIVILEGED 
container RUNNING 1         10.0.0.2 true         
END

# compare the actual output of lxc-ls to the expected one; diff exits non-zero
# on any difference, which aborts the script because of "set -e"
$ssh root@localhost lxc-ls --fancy --fancy-format=NAME,STATE,AUTOSTART,IPV4,UNPRIVILEGED | diff -u "$TMPDIR/expected" -

# test restarting the container: stop it and start it again in the background
$ssh root@localhost lxc-stop --name=container
$ssh root@localhost lxc-start --name=container --daemon

# install netcat and procps inside the container; fetching them with apt tests
# that the container can reach the internet, and the tools themselves (nc and
# ps) are needed by the checks that follow
$ssh root@localhost lxc-attach --name=container -- apt-get update
$ssh root@localhost lxc-attach --name=container -- apt-get install --no-install-recommends --yes netcat-traditional procps

# start listening on port 8003 inside the container to test that services
# inside the container are reachable from the outside; the pipeline runs in
# the background and succeeds only if grep sees the string "foobar"
$ssh root@localhost lxc-attach --name=container -- nc -lp 8003 | grep foobar &
# $! refers to the last command of the background pipeline, i.e. grep
NCPID=$!
# sleep five seconds to give above command time to start
sleep 5
# port 8001 on localhost is forwarded by qemu to port 8002 of the host system,
# which in turn is redirected to port 8003 of the container by iptables
echo foobar | nc -q0 127.0.0.1 8001
# fails (and thereby aborts the script) if grep did not find the string
wait $NCPID

# check that the processes spawned by lxc are indeed not run by the root user:
# seen from the host system, the process whose pid lxc-info reports for the
# container must be /sbin/init owned by uid 1000000, while inside the container
# /sbin/init must appear to be owned by uid 0
LXCPID=$($ssh root@localhost lxc-info --name container \
	| awk '/^PID:/ { print $2; }')
$ssh root@localhost ps -jf -u 1000000 \
	| awk -v pid="$LXCPID" '$2 == pid' \
	| grep /sbin/init
$ssh root@localhost lxc-attach --name=container -- ps -jf -u 0 \
	| grep /sbin/init

# shut the system off
#
# all checks have passed at this point, so the exit handler that dumps the log
# and kills qemu is removed first
trap - EXIT
# the exit status of ssh is ignored here (presumably because the connection
# may be torn down by the poweroff itself)
$ssh root@localhost systemctl poweroff || true
# block until qemu has terminated
wait $QEMUPID

# cleanup: delete every file this script has created
for name in \
	debian-unstable-container.tar \
	debian-unstable-host.tar \
	id_rsa \
	id_rsa.pub \
	expected \
	qemu.log \
	host.img \
	customize.sh
do
	rm "$TMPDIR/$name"
done
# the directory itself is only ours to remove if we created it with mktemp,
# that is, if it was not handed to us by autopkgtest
[ -n "${AUTOPKGTEST_TMP+x}" ] || rmdir "$TMPDIR"
