vx32

Local 9vx git repository for patches.
git clone git://r-36.net/vx32
Log | Files | Refs

commit b0d887c52d091ae3c62316cfc54ff178f64ab850
parent a700164cb1d15543535ee17fbff2f91fe4b4f595
Author: John (EBo) David <ebo@users.sourceforge.net>
Date:   Mon, 21 Jun 2010 03:17:15 -0500

attempting merge

--HG--
branch : yy-int-branch

Diffstat:
M.hgignore | 2++
MCONTRIBUTORS | 6++++++
Adoc/9vx.1 | 127+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/9vx-tap | 27+++++++++++++++++++++++++++
Msrc/9vx/LICENSE | 2++
Msrc/9vx/Makefrag | 62+++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Asrc/9vx/a/aoe.h | 84+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/9vx/a/chan.c | 2+-
Asrc/9vx/a/devaoe.c | 2575+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/9vx/a/devcons.c | 1+
Asrc/9vx/a/devether.c | 542+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/9vx/a/devsd.c | 16+++++++++++++++-
Asrc/9vx/a/dosfs.h | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/etherif.h | 39+++++++++++++++++++++++++++++++++++++++
Msrc/9vx/a/fns.ed | 50++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/9vx/a/fns.h | 53+++++++++++++++++++++++++++++++++++++++++++++++++----
Asrc/9vx/a/fs.h | 38++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip.ed | 2297+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/arp.c | 684+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/chandial.c | 124+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/devip.c | 1439+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/eipconvtest.c | 152+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/esp.c | 951+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/ethermedium.c | 766+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/gre.c | 283+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/icmp.c | 490+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/icmp6.c | 946+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/igmp.c | 294+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/il.c | 1408+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/inferno.c | 46++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/ip.c | 776+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/ip.h | 677+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/ipaux.c | 368+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/ipifc.c | 1654+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/ipmux.c | 842+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/iproute.c | 854+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/ipv6.c | 718+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/ipv6.h | 185+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/loopbackmedium.c | 120+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/netdevmedium.c | 153+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/netlog.c | 261+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/nullmedium.c | 39+++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/pktmedium.c | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/ptclbsum.c | 72++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/rudp.c | 1055+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/tcp.c | 3209+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/tripmedium.c | 398+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/ip/udp.c | 619+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/kfs.h | 57+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/a/netif.c | 761+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/9vx/a/netif.h | 8++++----
Asrc/9vx/a/part.c | 341+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/9vx/a/pgrp.c | 2+-
Msrc/9vx/a/portfns.h | 20++++++++++----------
Msrc/9vx/a/qlock.c | 18++++++++++--------
Msrc/9vx/a/sd.h | 5+++++
Asrc/9vx/a/sdaoe.c | 652+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/9vx/bootcode.9 | 0
Msrc/9vx/devip.c | 2+-
Msrc/9vx/devtab.c | 9++++++---
Asrc/9vx/etherpcap.c | 189+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/ethertap.c | 185+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/fossil.9 | 0
Msrc/9vx/main.c | 243+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Msrc/9vx/mmu.c | 28++++++++++++++++++++--------
Msrc/9vx/sched.c | 8++++----
Msrc/9vx/sdloop.c | 59++++++++++++++++++++++++++++++++++++++++++++++++++---------
Msrc/9vx/u.h | 1+
Asrc/9vx/venti.9 | 0
Asrc/9vx/vether.c | 122+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/9vx/vether.h | 15+++++++++++++++
Msrc/libvx32/Makefrag | 4++++
Msrc/libvx32/freebsd.c | 162+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
Msrc/libvx32/run64.S | 4++++
74 files changed, 28454 insertions(+), 87 deletions(-)

diff --git a/.hgignore b/.hgignore @@ -25,6 +25,8 @@ src/vxa/bz2/*ebz2 src/vxlinux/vxlinux src/9vx/9vx src/9vx/bootcode.S +src/9vx/fossil.S +src/9vx/venti.S src/9vx/data2s src/9vx/a/errstr.h src/9vx/kerndate.h diff --git a/CONTRIBUTORS b/CONTRIBUTORS @@ -3,3 +3,9 @@ The following people have contributed source code to vx32. Bryan Ford <baford@pdos.csail.mit.edu> Michael Teichgräber <mt4swm@googlemail.com> Russ Cox <rsc@swtch.com> +Jesus Galan Lopez <yiyu.jgl@gmail.com> +Tuly Gray +Devon H. O'Dell +Ron Minnich +Erik Quantrom +Brian L. Stuart diff --git a/doc/9vx.1 b/doc/9vx.1 @@ -0,0 +1,127 @@ +.TH 9VX 1 +.SH NAME +9vx, 9vx-tap \- Plan9 port to the virtual execution environment vx32 +.SH SYNOPSIS +.B 9vx +[ +.I option ... +] +[ +.I -p 9vx.ini +] +[ +.I -r root +] +[ +.I -u user +] +.PP +.B 9vx-tap +[ +.I option ... +] +[ +.I -p 9vx.ini +] +[ +.I -r root +] +[ +.I -u user +] +.SH DESCRIPTION +Plan 9 VX (or +.I 9vx +for short) is a port of the Plan 9 operating system to run on top of commodity operating systems, allowing the use of both Plan 9 and the host system simultaneously. To run user programs, +.I 9vx +creates an appropriate address space in a window within its own address space and invokes vx32 to simulate user mode execution. Some hardware devices are replaced by virtual versions, depending on the options given to +.I 9vx. +.I 9vx-tap +is a shell script that sets up a tap device with tunctl(1), launches +.I 9vx, +and removes the tap device when finished. +.PP +Options can be passed to +.I 9vx +as command line arguments or in a configuration file with the +.I -p +option (see below). If no +.I root +argument is present, the current directory or +.I /usr/local/9vx +is used. +Alternatively, a file system can be specified in the 9vx.ini file. +If an +.I user +is not specified, the current user in the host operating system will be used. 
+Other options are: +.nr xx \w'\fL-m\f2name\ \ ' +.TP \n(xxu +.BI -b +Run /boot/boot instead of bootscript +.TP +.BI -f +Do not fork at init +.TP +.BI -g +Do not start the gui +.TP +.BI -i +Run rc instead of init +.TP +.BI -t +Use tty for input/output +.TP +.BI -n " [ tap ] [ device ]" +Create virtual ethernet devices. The +.I tap +option indicates that +.I device +is a tap device. Otherwise, the virtual device will use pcap(3) to intercept packets going to +.I device, +and will therefore need root privileges. If a host +.I device +is not specified, pcap will use the first one available, and tap will use the +.I tap0 +device. More than one virtual ethernet device can be used. In the absence of virtual devices, the network stack of the host system will be used. +.TP +.BI -m " macaddress" +Use the hardware address +.I macaddress +for the last given virtual network device. +.SS 9vx.ini configuration files +Configuration parameters can also be given to +.I 9vx +in the configuration file specified with the +.I -p +command line option. +The file name +.L - +means the standard input. +The file +.I 9vx.ini +has to contain a list of +.I parameter=value +pairs in a similar fashion to plan9.ini(8). Available options are +.I bootboot, +.I nofork, +.I nogui, +.I initrc, +.I usetty, +.I net, +.I macaddr, +.I localroot +and +.I user. +Other options will be passed to the boot process as environment variables. +.SH BUGS +The menu system of plan9.ini(8) is not supported in +.I 9vx.ini +files. +.P +.I 9vx +is not as stable as native Plan9 systems. 
+.SH "SEE ALSO" +.br +Bryan Ford and Russ Cox, +``Vx32: Lightweight User-level Sandboxing on the x86'' diff --git a/src/9vx/9vx-tap b/src/9vx/9vx-tap @@ -0,0 +1,27 @@ +#!/bin/sh + +USERID=`whoami` + +# Create the tap device with tunctl +IFACE=`sudo tunctl -b -u $USERID` +# or openvpn +#IFACE=tap0 +#sudo openvpn --mktun --dev $IFACE --user $USERID + +# Bring the tap device up +sudo /sbin/ifconfig $IFACE 0.0.0.0 up + +# Add it to the bridge +sudo /usr/sbin/brctl addif br0 $IFACE + +# Launch 9vx (use -f to not fork) +9vx -f -n tap $IFACE $* + +# Bring the tap device down and disconnect from br0 +sudo /sbin/ifconfig $IFACE down +sudo /usr/sbin/brctl delif br0 $IFACE + +# Remove the tap device with tunctl +sudo tunctl -d $IFACE &> /dev/null +# or openvpn +#sudo openvpn --rmtun --dev $1 diff --git a/src/9vx/LICENSE b/src/9vx/LICENSE @@ -4,6 +4,8 @@ Plan 9 from Bell Labs distribution, which carries this license. The local changes are Copyright (c) 2006-2008 Russ Cox and are distributed as contributions under the terms of this license. +Other contributors are listed in the AUTHORS file. 
+ =================================================================== diff --git a/src/9vx/Makefrag b/src/9vx/Makefrag @@ -29,13 +29,12 @@ PLAN9_OBJS = \ devaudio.o \ devaudio-$(PLAN9AUDIO).o \ devfs-posix.o \ - devip.o \ - devip-posix.o \ devmntloop.o \ devmouse.o \ devram.o \ devtab.o \ factotum.o \ + fossil.o \ kprocdev.o \ label.o \ main.o \ @@ -47,6 +46,7 @@ PLAN9_OBJS = \ time.o \ trap.o \ tty.o \ + venti.o \ vx32.o \ ) @@ -58,7 +58,6 @@ PLAN9_A_OBJS = \ $(addprefix 9vx/a/, \ allocb.o \ auth.o \ - bo.o \ chan.o \ classmask.o \ cleanname.o \ @@ -91,6 +90,7 @@ PLAN9_A_OBJS = \ page.o \ parse.o \ parseip.o \ + part.o \ pgrp.o \ print.o \ proc.o \ @@ -111,6 +111,48 @@ PLAN9_A_OBJS = \ utf.o \ ) +PLAN9_IP_OBJS = \ + $(addprefix 9vx/,\ + devip.o \ + devip-posix.o \ + etherpcap.o \ + ethertap.o \ + vether.o \ + ) \ + $(addprefix 9vx/a/,\ + devaoe.o \ + devether.o \ + netif.o \ + sdaoe.o \ + ) \ + $(addprefix 9vx/a/ip/,\ + arp.o \ + chandial.o \ + devip.o \ + esp.o \ + ethermedium.o \ + gre.o \ + icmp.o \ + icmp6.o \ + il.o \ + inferno.o \ + ip.o \ + ipaux.o \ + ipifc.o \ + ipmux.o \ + iproute.o \ + ipv6.o \ + loopbackmedium.o \ + netdevmedium.o \ + netlog.o \ + nullmedium.o \ + pktmedium.o \ + ptclbsum.o \ + tcp.o \ + udp.o \ + ) +PLAN9_IP_LIBS = -lpcap + PLAN9_nogui_OBJS = \ $(addprefix 9vx/,\ nogui.o \ @@ -142,6 +184,7 @@ PLAN9_GUI_LIBS = $(PLAN9_$(PLAN9GUI)_LIBS) PLAN9_DEPS = \ $(PLAN9_OBJS) \ $(PLAN9_A_OBJS) \ + $(PLAN9_IP_OBJS) \ $(PLAN9_GUI_OBJS) \ 9vx/libsec/libsec.a \ 9vx/libmemlayer/libmemlayer.a \ @@ -150,7 +193,7 @@ PLAN9_DEPS = \ libvx32/libvx32.a \ 9vx/9vx: $(PLAN9_DEPS) - $(HOST_CC) -o $@ $(PLAN9_DEPS) $(PLAN9_GUI_LIBS) -lpthread + $(HOST_CC) -o $@ $(PLAN9_DEPS) $(PLAN9_GUI_LIBS) $(PLAN9_IP_LIBS) -lpthread 9vx/a/%.o: 9vx/a/%.c $(HOST_CC) $(HOST_CFLAGS) -I. 
-I9vx -I9vx/a -Wall -Wno-missing-braces -c -o $@ $< @@ -176,6 +219,12 @@ PLAN9_DEPS = \ 9vx/factotum.S: 9vx/data2s 9vx/factotum.9 ./9vx/data2s factotum < 9vx/factotum.9 >$@_ && mv $@_ $@ +9vx/fossil.S: 9vx/data2s 9vx/fossil.9 + ./9vx/data2s fossil < 9vx/fossil.9 >$@_ && mv $@_ $@ + +9vx/venti.S: 9vx/data2s 9vx/venti.9 + ./9vx/data2s venti < 9vx/venti.9 > $@_ && mv $@_ $@ + 9vx/a/errstr.h: 9vx/a/error.h sed 's/extern //; s!;.*/\* ! = "!; s! \*\/!";!' 9vx/a/error.h >9vx/a/errstr.h @@ -199,7 +248,10 @@ CLEAN_FILES += \ 9vx/a/errstr.h \ 9vx/9vx \ 9vx/data2s \ - 9vx/bootcode.S + 9vx/bootcode.S \ + 9vx/factotum.S \ + 9vx/fossil.S \ + 9vx/venti.S include 9vx/libdraw/Makefrag include 9vx/libmemlayer/Makefrag diff --git a/src/9vx/a/aoe.h b/src/9vx/a/aoe.h @@ -0,0 +1,84 @@ +enum { + ACata, + ACconfig, +}; + +enum { + AQCread, + AQCtest, + AQCprefix, + AQCset, + AQCfset, +}; + +enum { + AEcmd = 1, + AEarg, + AEdev, + AEcfg, + AEver, +}; + +enum { + Aoetype = 0x88a2, + Aoesectsz = 512, + Szaoeata = 24+12, + Szaoeqc = 24+8, + Aoever = 1, + + AFerr = 1<<2, + AFrsp = 1<<3, + + AAFwrite= 1, + AAFext = 1<<6, +}; + +typedef struct { + uchar dst[Eaddrlen]; + uchar src[Eaddrlen]; + uchar type[2]; + uchar verflag; + uchar error; + uchar major[2]; + uchar minor; + uchar cmd; + uchar tag[4]; +} Aoehdr; + +typedef struct { + uchar dst[Eaddrlen]; + uchar src[Eaddrlen]; + uchar type[2]; + uchar verflag; + uchar error; + uchar major[2]; + uchar minor; + uchar cmd; + uchar tag[4]; + uchar aflag; + uchar errfeat; + uchar scnt; + uchar cmdstat; + uchar lba[6]; + uchar res[2]; +} Aoeata; + +typedef struct { + uchar dst[Eaddrlen]; + uchar src[Eaddrlen]; + uchar type[2]; + uchar verflag; + uchar error; + uchar major[2]; + uchar minor; + uchar cmd; + uchar tag[4]; + uchar bufcnt[2]; + uchar fwver[2]; + uchar scnt; + uchar verccmd; + uchar cslen[2]; +} Aoeqc; + +extern char Echange[]; +extern char Enotup[]; diff --git a/src/9vx/a/chan.c b/src/9vx/a/chan.c @@ -28,7 +28,7 @@ struct Elemlist { char 
*aname; /* original name */ char *name; /* copy of name, so '/' can be overwritten */ - int nelems; + uint nelems; char **elems; int *off; int mustbedir; diff --git a/src/9vx/a/devaoe.c b/src/9vx/a/devaoe.c @@ -0,0 +1,2575 @@ +/* + * © 2005-8 coraid + * aoe storage initiator + */ + +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "io.h" +#include "ureg.h" +#include "error.h" +#include "netif.h" +#include "etherif.h" +#include "ip/ip.h" +#include "aoe.h" + +#define WAKEUP(x) wakeup(&((x)->rend)) +#define SLEEP(a,b,c) sleep(&(a->rend), b, c) + +//#pragma varargck argpos eventlog 1 + +#define dprint(...) if(debug) eventlog(__VA_ARGS__); else USED(debug); +#define uprint(...) snprint(up->genbuf, sizeof up->genbuf, __VA_ARGS__); + +enum { + Maxunits = 0xff, + Maxframes = 128, + Maxmtu = 100000, + Ndevlink = 6, + Nea = 6, + Nnetlink = 6, +}; + +#define TYPE(q) ((ulong)(q).path & 0xf) +#define UNIT(q) (((ulong)(q).path>>4) & 0xff) +#define L(q) (((ulong)(q).path>>12) & 0xf) +#define QID(u, t) ((u)<<4 | (t)) +#define Q3(l, u, t) ((l)<<8 | QID(u, t)) +#define UP(d) ((d)->flag & Dup) + +#define Ticks msec() +#define Ms2tk(t) (((t)*HZ)/1000) +#define Tk2ms(t) (((t)*1000)/HZ) + +enum { + Qzero, + Qtopdir = 1, + Qtopbase, + Qtopctl = Qtopbase, + Qtoplog, + Qtopend, + + Qunitdir, + Qunitbase, + Qctl = Qunitbase, + Qdata, + Qconfig, + Qident, + + Qdevlinkdir, + Qdevlinkbase, + Qdevlink = Qdevlinkbase, + Qdevlinkend, + + Qtopfiles = Qtopend-Qtopbase, + Qdevlinkfiles = Qdevlinkend-Qdevlinkbase, + + Eventlen = 256, + Nevents = 64, + + Fread = 0, + Fwrite, + Tfree = -1, + Tmgmt, + + /* round trip bounds, timeouts, in ticks */ + Rtmax = Ms2tk(320), + Rtmin = Ms2tk(20), + Srbtimeout = 45*HZ, + + Dbcnt = 1024, + + Crd = 0x20, + Crdext = 0x24, + Cwr = 0x30, + Cwrext = 0x34, + Cid = 0xec, +}; + +enum { + Read, + Write, +}; + +/* + * unified set of flags + * a Netlink + Aoedev most both be jumbo capable + * to send jumbograms to that 
interface. + */ +enum { + /* sync with ahci.h */ + Dllba = 1<<0, + Dsmart = 1<<1, + Dpower = 1<<2, + Dnop = 1<<3, + Datapi = 1<<4, + Datapi16= 1<<5, + + /* aoe specific */ + Dup = 1<<6, + Djumbo = 1<<7, +}; + +static char *flagname[] = { + "llba", + "smart", + "power", + "nop", + "atapi", + "atapi16", + + "up", + "jumbo", +}; + +typedef struct { + uchar flag; + uchar lostjumbo; + int datamtu; + + Chan *cc; + Chan *dc; + Chan *mtu; /* open early to prevent bind issues. */ + char path[Maxpath]; + uchar ea[Eaddrlen]; +} Netlink; + +typedef struct { + Netlink *nl; + int nea; + ulong eaidx; + uchar eatab[Nea][Eaddrlen]; + int datamtu; + ulong npkt; + ulong resent; + uchar flag; + + ulong rttavg; + ulong mintimer; +} Devlink; + +typedef struct Srb Srb; +struct Srb { + Rendez rend; + Srb *next; + ulong ticksent; + ulong len; + vlong sector; + short write; + short nout; + char *error; + void *dp; + void *data; +}; + +typedef struct { + int tag; + ulong bcnt; + ulong dlen; + vlong lba; + ulong ticksent; + int nhdr; + uchar hdr[ETHERMINTU]; + void *dp; + Devlink *dl; + Netlink *nl; + int eaidx; + Srb *srb; +} Frame; + +typedef struct Aoedev Aoedev; +struct Aoedev { + QLock qlock; + Aoedev *next; + + ulong vers; + + int ndl; + ulong dlidx; + Devlink *dl; + Devlink dltab[Ndevlink]; + + ushort fwver; + uchar flag; + int nopen; + int major; + int minor; + int unit; + int lasttag; + int nframes; + Frame *frames; + vlong bsize; + vlong realbsize; + + uint maxbcnt; + uint maxmtu; + ulong lostjumbo; + ushort nout; + ushort maxout; + ulong lastwadj; + Srb *head; + Srb *tail; + Srb *inprocess; + + char serial[20+1]; + char firmware[8+1]; + char model[40+1]; + int nconfig; + uchar config[1024]; + uchar ident[512]; +}; + +//#pragma varargck type "æ" Aoedev* + +static struct { + Lock lk; + QLock qlock; + Rendez rend; + char buf[Eventlen*Nevents]; + char *rp; + char *wp; +} events; + +static struct { + RWlock rwlock; + int nd; + Aoedev *d; +} devs; + +static struct { + Lock lk; + int 
reader[Nnetlink]; /* reader is running. */ + Rendez rendez[Nnetlink]; /* confirm exit. */ + Netlink nl[Nnetlink]; +} netlinks; + +extern Dev aoedevtab; +static Ref units; +static Ref drivevers; +static int debug; +static int autodiscover = 1; +static int rediscover; + char Enotup[] = "aoe device is down"; + char Echange[] = "media or partition has changed"; + +static Srb* +srballoc(ulong sz) +{ + Srb *srb; + + srb = malloc(sizeof *srb+sz); + srb->dp = srb->data = srb+1; + srb->ticksent = Ticks; + return srb; +} + +static Srb* +srbkalloc(void *db, ulong dummy) +{ + Srb *srb; + + srb = malloc(sizeof *srb); + srb->dp = srb->data = db; + srb->ticksent = Ticks; + return srb; +} + +#define srbfree(srb) free(srb) + +static void +srberror(Srb *srb, char *s) +{ + srb->error = s; + srb->nout--; + WAKEUP(srb); +} + +static void +frameerror(Aoedev *d, Frame *f, char *s) +{ + Srb *srb; + + srb = f->srb; + if(f->tag == Tfree) + return; + f->srb = nil; + f->tag = Tfree; /* don't get fooled by way-slow responses */ + if(!srb) + return; + srberror(srb, s); + d->nout--; +} + +static char* +unitname(Aoedev *d) +{ + uprint("%d.%d", d->major, d->minor); + return up->genbuf; +} + +static long +eventlogread(void *a, long n) +{ + int len; + char *p, *buf; + + buf = smalloc(Eventlen); + QLOCK(&events); + LOCK(&events); + p = events.rp; + len = *p; + if(len == 0){ + n = 0; + UNLOCK(&events); + } else { + if(n > len) + n = len; + /* can't move directly into pageable space with events lock held */ + memmove(buf, p+1, n); + *p = 0; + events.rp = p += Eventlen; + if(p >= events.buf + sizeof events.buf) + events.rp = events.buf; + UNLOCK(&events); + + /* the concern here is page faults in memmove below */ + if(waserror()){ + free(buf); + QUNLOCK(&events); + nexterror(); + } + memmove(a, buf, n); + poperror(); + } + free(buf); + QUNLOCK(&events); + return n; +} + +static int +eventlog(char *fmt, ...) 
+{ + int dragrp, n; + char *p; + va_list arg; + + LOCK(&events); + p = events.wp; + dragrp = *p++; + va_start(arg, fmt); + n = vsnprint(p, Eventlen-1, fmt, arg); + *--p = n; + p = events.wp += Eventlen; + if(p >= events.buf + sizeof events.buf) + p = events.wp = events.buf; + if(dragrp) + events.rp = p; + UNLOCK(&events); + WAKEUP(&events); + return n; +} + +static int +eventcount(void) +{ + int n; + + LOCK(&events); + if(*events.rp == 0) + n = 0; + else if(events.wp < events.rp) + n = Nevents - (events.rp - events.wp); + else + n = events.wp - events.rp; + UNLOCK(&events); + return n/Eventlen; +} + +static int +tsince(int tag) +{ + int n; + + n = Ticks & 0xffff; + n -= tag & 0xffff; + if(n < 0) + n += 1<<16; + return n; +} + +static int +newtag(Aoedev *d) +{ + int t; + + do { + t = ++d->lasttag << 16; + t |= Ticks & 0xffff; + } while (t == Tfree || t == Tmgmt); + return t; +} + +static void +downdev(Aoedev *d, char *err) +{ + Frame *f, *e; + + d->flag &= ~Dup; + f = d->frames; + e = f + d->nframes; + for(; f < e; f->tag = Tfree, f->srb = nil, f++) + frameerror(d, f, Enotup); + d->inprocess = nil; + eventlog("%æ: removed; %s\n", d, err); +} + +static Block* +allocfb(Frame *f) +{ + int len; + Block *b; + + len = f->nhdr + f->dlen; + if(len < ETHERMINTU) + len = ETHERMINTU; + b = allocb(len); + memmove(b->wp, f->hdr, f->nhdr); + if(f->dlen) + memmove(b->wp + f->nhdr, f->dp, f->dlen); + b->wp += len; + return b; +} + +static void +putlba(Aoeata *a, vlong lba) +{ + uchar *c; + + c = a->lba; + c[0] = lba; + c[1] = lba >> 8; + c[2] = lba >> 16; + c[3] = lba >> 24; + c[4] = lba >> 32; + c[5] = lba >> 40; +} + +static Devlink* +pickdevlink(Aoedev *d) +{ + ulong i, n; + Devlink *l; + + for(i = 0; i < d->ndl; i++){ + n = d->dlidx++ % d->ndl; + l = d->dl + n; + if(l && l->flag & Dup) + return l; + } + return 0; +} + +static int +pickea(Devlink *l) +{ + if(l == 0) + return -1; + if(l->nea == 0) + return -1; + return l->eaidx++ % l->nea; +} + +static int +hset(Aoedev *d, Frame 
*f, Aoehdr *h, int cmd) +{ + int i; + Devlink *l; + + if(f->srb) + if((long)(Ticks-f->srb->ticksent) > Srbtimeout){ + eventlog("%æ: srb timeout\n", d); + frameerror(d, f, Etimedout); + return -1; + } + l = pickdevlink(d); + i = pickea(l); + if(i == -1){ + downdev(d, "resend fails; no netlink/ea"); + return -1; + } + memmove(h->dst, l->eatab[i], Eaddrlen); + memmove(h->src, l->nl->ea, sizeof h->src); + hnputs(h->type, Aoetype); + h->verflag = Aoever << 4; + h->error = 0; + hnputs(h->major, d->major); + h->minor = d->minor; + h->cmd = cmd; + + hnputl(h->tag, f->tag = newtag(d)); + f->dl = l; + f->nl = l->nl; + f->eaidx = i; + f->ticksent = Ticks; + + return f->tag; +} + +static int +resend(Aoedev *d, Frame *f) +{ + ulong n; + Aoeata *a; + + a = (Aoeata*)f->hdr; + if(hset(d, f, (Aoehdr*)a, a->cmd) == -1) + return -1; + n = f->bcnt; + if(n > d->maxbcnt){ + n = d->maxbcnt; /* mtu mismatch (jumbo fail?) */ + if(f->dlen > n) + f->dlen = n; + } + a->scnt = n / Aoesectsz; + f->dl->resent++; + f->dl->npkt++; + if(waserror()) + /* should remove the netlink */ + return -1; + devtab[f->nl->dc->type]->bwrite(f->nl->dc, allocfb(f), 0); + poperror(); + return 0; +} + +static void +discover(int major, int minor) +{ + Aoehdr *h; + Block *b; + Netlink *nl, *e; + + nl = netlinks.nl; + e = nl + nelem(netlinks.nl); + for(; nl < e; nl++){ + if(nl->cc == nil) + continue; + b = allocb(ETHERMINTU); + if(waserror()){ + freeb(b); + nexterror(); + } + b->wp = b->rp + ETHERMINTU; + memset(b->rp, 0, ETHERMINTU); + h = (Aoehdr*)b->rp; + memset(h->dst, 0xff, sizeof h->dst); + memmove(h->src, nl->ea, sizeof h->src); + hnputs(h->type, Aoetype); + h->verflag = Aoever << 4; + hnputs(h->major, major); + h->minor = minor; + h->cmd = ACconfig; + poperror(); + devtab[nl->dc->type]->bwrite(nl->dc, b, 0); + } +} + +/* + * Check all frames on device and resend any frames that have been + * outstanding for 200% of the device round trip time average. 
+ */ +static void +aoesweepproc(void *dummy) +{ + ulong i, tx, timeout, nbc; + vlong starttick; + enum { Nms = 100, Nbcms = 30*1000, }; + uchar *ea; + Aoeata *a; + Aoedev *d; + Devlink *l; + Frame *f, *e; + + nbc = Nbcms/Nms; +loop: + if(nbc-- == 0){ + if(rediscover && !waserror()){ + discover(0xffff, 0xff); + poperror(); + } + nbc = Nbcms/Nms; + } + starttick = Ticks; + RLOCK(&devs); + for(d = devs.d; d; d = d->next){ + if(!CANQLOCK(d)) + continue; + if(!UP(d)){ + QUNLOCK(d); + continue; + } + tx = 0; + f = d->frames; + e = f + d->nframes; + for (; f < e; f++){ + if(f->tag == Tfree) + continue; + l = f->dl; + timeout = l->rttavg << 1; + i = tsince(f->tag); + if(i < timeout) + continue; + if(d->nout == d->maxout){ + if(d->maxout > 1) + d->maxout--; + d->lastwadj = Ticks; + } + a = (Aoeata*)f->hdr; + if(a->scnt > Dbcnt / Aoesectsz && + ++f->nl->lostjumbo > (d->nframes << 1)){ + ea = f->dl->eatab[f->eaidx]; + eventlog("%æ: jumbo failure on %s:%E; lba%lld\n", + d, f->nl->path, ea, f->lba); + d->maxbcnt = Dbcnt; + d->flag &= ~Djumbo; + } + resend(d, f); + if(tx++ == 0){ + if((l->rttavg <<= 1) > Rtmax) + l->rttavg = Rtmax; + eventlog("%æ: rtt %ldms\n", d, Tk2ms(l->rttavg)); + } + } + if(d->nout == d->maxout && d->maxout < d->nframes && + TK2MS(Ticks-d->lastwadj) > 10*1000){ + d->maxout++; + d->lastwadj = Ticks; + } + QUNLOCK(d); + } + RUNLOCK(&devs); + i = Nms - TK2MS(Ticks - starttick); + if(i > 0) + tsleep(&up->sleep, return0, 0, i); + goto loop; +} + +static int +fmtaoe(Fmt *f) +{ + char buf[16]; + Aoedev *d; + + d = va_arg(f->args, Aoedev*); + snprint(buf, sizeof buf, "aoe%d.%d", d->major, d->minor); + return fmtstrcpy(f, buf); +} + +static void netbind(char *path); + +static void +aoecfg(void) +{ + int n, i; + char *p, *f[32], buf[24]; + + if(1) +// if((p = getconf("aoeif")) == nil || (n = tokenize(p, f, nelem(f))) < 1) + return; + /* goo! 
*/ + for(i = 0; i < n; i++){ + p = f[i]; + if(strncmp(p, "ether", 5) == 0) + snprint(buf, sizeof buf, "#l%c/ether%c", p[5], p[5]); + else if(strncmp(p, "#l", 2) == 0) + snprint(buf, sizeof buf, "#l%c/ether%c", p[2], p[2]); + else + continue; + if(!waserror()){ + netbind(buf); + poperror(); + } + } +} + +static void +aoeinit(void) +{ + static int init; + static QLock l; + + if(!canqlock(&l)) + return; + if(init == 0){ + fmtinstall(L'æ', fmtaoe); + events.rp = events.wp = events.buf; + kproc("aoesweep", aoesweepproc, nil); + aoecfg(); + init = 1; + } + qunlock(&l); +} + +static Chan* +aoeattach(char *spec) +{ + Chan *c; + + if(*spec) + error(Enonexist); + aoeinit(); + c = devattach(L'æ', spec); + mkqid(&c->qid, Qzero, 0, QTDIR); + return c; +} + +static Aoedev* +unitseq(ulong unit) +{ + int i; + Aoedev *d; + + i = 0; + RLOCK(&devs); + for(d = devs.d; d; d = d->next) + if(i++ == unit) + break; + RUNLOCK(&devs); + return d; +} + +static Aoedev* +unit2dev(ulong unit) +{ + Aoedev *d; + + RLOCK(&devs); + for(d = devs.d; d; d = d->next) + if(d->unit == unit){ + RUNLOCK(&devs); + return d; + } + RUNLOCK(&devs); + error("unit lookup failure"); + return nil; +} + +static int +unitgen(Chan *c, ulong type, Dir *dp) +{ + int perm, t; + ulong vers; + vlong size; + char *p; + Aoedev *d; + Qid q; + + d = unit2dev(UNIT(c->qid)); + perm = 0644; + size = 0; + vers = d->vers; + t = QTFILE; + + switch(type){ + default: + return -1; + case Qctl: + p = "ctl"; + break; + case Qdata: + p = "data"; + perm = 0640; + if(UP(d)) + size = d->bsize; + break; + case Qconfig: + p = "config"; + if(UP(d)) + size = d->nconfig; + break; + case Qident: + p = "ident"; + if(UP(d)) + size = sizeof d->ident; + break; + case Qdevlinkdir: + p = "devlink"; + t = QTDIR; + perm = 0555; + break; + } + mkqid(&q, QID(UNIT(c->qid), type), vers, t); + devdir(c, q, p, size, eve, perm, dp); + return 1; +} + +static int +topgen(Chan *c, ulong type, Dir *d) +{ + int perm; + vlong size; + char *p; + Qid q; + + perm = 0444; 
+ size = 0; + switch(type){ + default: + return -1; + case Qtopctl: + p = "ctl"; + perm = 0644; + break; + case Qtoplog: + p = "log"; + size = eventcount(); + break; + } + mkqid(&q, type, 0, QTFILE); + devdir(c, q, p, size, eve, perm, d); + return 1; +} + +static int +aoegen(Chan *c, char *d0, Dirtab *d1, int d2, int s, Dir *dp) +{ + int i; + Aoedev *d; + Qid q; + + if(c->qid.path == 0){ + switch(s){ + case DEVDOTDOT: + q.path = 0; + q.type = QTDIR; + devdir(c, q, "#æ", 0, eve, 0555, dp); + break; + case 0: + q.path = Qtopdir; + q.type = QTDIR; + devdir(c, q, "aoe", 0, eve, 0555, dp); + break; + default: + return -1; + } + return 1; + } + + switch(TYPE(c->qid)){ + default: + return -1; + case Qtopdir: + if(s == DEVDOTDOT){ + mkqid(&q, Qzero, 0, QTDIR); + devdir(c, q, "aoe", 0, eve, 0555, dp); + return 1; + } + if(s < Qtopfiles) + return topgen(c, Qtopbase + s, dp); + s -= Qtopfiles; + if((d = unitseq(s)) == 0) + return -1; + mkqid(&q, QID(d->unit, Qunitdir), 0, QTDIR); + devdir(c, q, unitname(d), 0, eve, 0555, dp); + return 1; + case Qtopctl: + case Qtoplog: + return topgen(c, TYPE(c->qid), dp); + case Qunitdir: + if(s == DEVDOTDOT){ + mkqid(&q, QID(0, Qtopdir), 0, QTDIR); + uprint("%uld", UNIT(c->qid)); + devdir(c, q, up->genbuf, 0, eve, 0555, dp); + return 1; + } + return unitgen(c, Qunitbase+s, dp); + case Qctl: + case Qdata: + case Qconfig: + case Qident: + return unitgen(c, TYPE(c->qid), dp); + case Qdevlinkdir: + i = UNIT(c->qid); + if(s == DEVDOTDOT){ + mkqid(&q, QID(i, Qunitdir), 0, QTDIR); + devdir(c, q, "devlink", 0, eve, 0555, dp); + return 1; + } + if(i >= units.ref) + return -1; + d = unit2dev(i); + if(s >= d->ndl) + return -1; + uprint("%d", s); + mkqid(&q, Q3(s, i, Qdevlink), 0, QTFILE); + devdir(c, q, up->genbuf, 0, eve, 0755, dp); + return 1; + case Qdevlink: + uprint("%d", s); + mkqid(&q, Q3(s, UNIT(c->qid), Qdevlink), 0, QTFILE); + devdir(c, q, up->genbuf, 0, eve, 0755, dp); + return 1; + } +} + +static Walkqid* +aoewalk(Chan *c, Chan *nc, char 
**name, int nname) +{ + return devwalk(c, nc, name, nname, nil, 0, aoegen); +} + +static int +aoestat(Chan *c, uchar *db, int n) +{ + return devstat(c, db, n, nil, 0, aoegen); +} + +static Chan* +aoeopen(Chan *c, int omode) +{ + Aoedev *d; + + if(TYPE(c->qid) != Qdata) + return devopen(c, omode, 0, 0, aoegen); + + d = unit2dev(UNIT(c->qid)); + QLOCK(d); + if(waserror()){ + QUNLOCK(d); + nexterror(); + } + if(!UP(d)) + error(Enotup); + c = devopen(c, omode, 0, 0, aoegen); + d->nopen++; + poperror(); + QUNLOCK(d); + return c; +} + +static void +aoeclose(Chan *c) +{ + Aoedev *d; + + if(TYPE(c->qid) != Qdata || (c->flag&COPEN) == 0) + return; + + d = unit2dev(UNIT(c->qid)); + QLOCK(d); + if(--d->nopen == 0 && !waserror()){ + discover(d->major, d->minor); + poperror(); + } + QUNLOCK(d); +} + +static void +atarw(Aoedev *d, Frame *f) +{ + ulong bcnt; + char extbit, writebit; + Aoeata *ah; + Srb *srb; + + extbit = 0x4; + writebit = 0x10; + + srb = d->inprocess; + bcnt = d->maxbcnt; + if(bcnt > srb->len) + bcnt = srb->len; + f->nhdr = Szaoeata; + memset(f->hdr, 0, f->nhdr); + ah = (Aoeata*)f->hdr; + if(hset(d, f, (Aoehdr*)ah, ACata) == -1) + return; + f->dp = srb->dp; + f->bcnt = bcnt; + f->lba = srb->sector; + f->srb = srb; + + ah->scnt = bcnt / Aoesectsz; + putlba(ah, f->lba); + if(d->flag & Dllba) + ah->aflag |= AAFext; + else { + extbit = 0; + ah->lba[3] &= 0x0f; + ah->lba[3] |= 0xe0; /* LBA bit+obsolete 0xa0 */ + } + if(srb->write){ + ah->aflag |= AAFwrite; + f->dlen = bcnt; + }else{ + writebit = 0; + f->dlen = 0; + } + ah->cmdstat = 0x20 | writebit | extbit; + + /* mark tracking fields and load out */ + srb->nout++; + srb->dp = (uchar*)srb->dp + bcnt; + srb->len -= bcnt; + srb->sector += bcnt / Aoesectsz; + if(srb->len == 0) + d->inprocess = nil; + d->nout++; + f->dl->npkt++; + if(waserror()){ + f->tag = Tfree; + d->inprocess = nil; + nexterror(); + } + devtab[f->nl->dc->type]->bwrite(f->nl->dc, allocfb(f), 0); + poperror(); +} + +static char* +aoeerror(Aoehdr *h) +{ 
+ int n; + static char *errs[] = { + "aoe protocol error: unknown", + "aoe protocol error: bad command code", + "aoe protocol error: bad argument param", + "aoe protocol error: device unavailable", + "aoe protocol error: config string present", + "aoe protocol error: unsupported version", + }; + + if((h->verflag & AFerr) == 0) + return 0; + n = h->error; + if(n > nelem(errs)) + n = 0; + return errs[n]; +} + +static void +rtupdate(Devlink *l, int rtt) +{ + int n; + + n = rtt; + if(rtt < 0){ + n = -rtt; + if(n < Rtmin) + n = Rtmin; + else if(n > Rtmax) + n = Rtmax; + l->mintimer += (n - l->mintimer) >> 1; + } else if(n < l->mintimer) + n = l->mintimer; + else if(n > Rtmax) + n = Rtmax; + + /* g == .25; cf. Congestion Avoidance and Control, Jacobson&Karels; 1988 */ + n -= l->rttavg; + l->rttavg += n >> 2; +} + +static int +srbready(void *v) +{ + Srb *s; + + s = v; + return s->error || (!s->nout && !s->len); +} + +static Frame* +getframe(Aoedev *d, int tag) +{ + Frame *f, *e; + + f = d->frames; + e = f + d->nframes; + for(; f < e; f++) + if(f->tag == tag) + return f; + return nil; +} + +static Frame* +freeframe(Aoedev *d) +{ + if(d->nout < d->maxout) + return getframe(d, Tfree); + return nil; +} + +static void +work(Aoedev *d) +{ + Frame *f; + + while(f = freeframe(d)) { + if(d->inprocess == nil){ + if(d->head == nil) + return; + d->inprocess = d->head; + d->head = d->head->next; + if(d->head == nil) + d->tail = nil; + } + atarw(d, f); + } +} + +static void +strategy(Aoedev *d, Srb *srb) +{ + QLOCK(d); + if(waserror()){ + QUNLOCK(d); + nexterror(); + } + srb->next = nil; + if(d->tail) + d->tail->next = srb; + d->tail = srb; + if(d->head == nil) + d->head = srb; + work(d); + poperror(); + QUNLOCK(d); + + while(waserror()) + ; + SLEEP(srb, srbready, srb); + poperror(); +} + +#define iskaddr(a) (!up || (uintptr)(a) > up->pmmu.uzero+USTKTOP) + +static long +rw(Aoedev *d, int write, uchar *db, long len, uvlong off) +{ + long n, nlen, copy; + enum { Srbsz = 1<<19, }; + Srb 
*srb; + + if((off|len) & (Aoesectsz-1)) + error("offset and length must be sector multiple.\n"); + if(off >= d->bsize) + return 0; + if(off + len > d->bsize) + len = d->bsize - off; + copy = 0; + if(iskaddr(db)){ +panic("iskaddr %p %p\n", db); + srb = srbkalloc(db, len); + copy = 1; + }else + srb = srballoc(Srbsz <= len? Srbsz: len); + if(waserror()){ + srbfree(srb); + nexterror(); + } + srb->write = write; + for(nlen = len; nlen; nlen -= n){ + if(!UP(d)) + error(Eio); + srb->sector = off / Aoesectsz; + srb->dp = srb->data; + n = nlen; + if(n > Srbsz) + n = Srbsz; + srb->len = n; + if(write && !copy) + memmove(srb->data, db, n); + strategy(d, srb); + if(srb->error) + error(srb->error); + if(!write && !copy) + memmove(db, srb->data, n); + db += n; + off += n; + } + poperror(); + srbfree(srb); + return len; +} + +static long +readmem(ulong off, void *dst, long n, void *src, long size) +{ + if(off >= size) + return 0; + if(off + n > size) + n = size - off; + memmove(dst, (uchar*)src + off, n); + return n; +} + +static char* +pflag(char *s, char *e, uchar f) +{ + uchar i; + + for(i = 0; i < nelem(flagname); i++) + if(f & 1 << i) + s = seprint(s, e, "%s ", flagname[i]); + return seprint(s, e, "\n"); +} + +static int +pstat(Aoedev *d, char *db, int len, int off) +{ + int i; + char *state, *s, *p, *e; + + s = p = malloc(1024); + e = p + 1024; + + state = "down"; + if(UP(d)) + state = "up"; + + p = seprint(p, e, + "state: %s\n" "nopen: %d\n" "nout: %d\n" + "nmaxout: %d\n" "nframes: %d\n" "maxbcnt: %d [maxmtu %d]\n" + "fw: %.4ux\n" + "model: %s\n" "serial: %s\n" "firmware: %s\n", + state, d->nopen, d->nout, + d->maxout, d->nframes, d->maxbcnt, d->maxmtu, + d->fwver, + d->model, d->serial, d->firmware); + p = seprint(p, e, "flag: "); + p = pflag(p, e, d->flag); + + if(p - s < len) + len = p - s; + i = readstr(off, db, len, s); + free(s); + return i; +} + +static long +unitread(Chan *c, void *db, long len, vlong off) +{ + Aoedev *d; + + d = unit2dev(UNIT(c->qid)); + 
if(d->vers != c->qid.vers) + error(Echange); + switch(TYPE(c->qid)){ + default: + error(Ebadarg); + case Qctl: + return pstat(d, db, len, off); + case Qdata: + return rw(d, Read, db, len, off); + case Qconfig: + if(!UP(d)) + error(Enotup); + return readmem(off, db, len, d->config, d->nconfig); + case Qident: + if(!UP(d)) + error(Enotup); + return readmem(off, db, len, d->ident, sizeof d->ident); + } +} + +static int +devlinkread(Chan *c, void *db, int len, int off) +{ + int i; + char *s, *p, *e; + Aoedev *d; + Devlink *l; + + d = unit2dev(UNIT(c->qid)); + i = L(c->qid); + if(i >= d->ndl) + return 0; + l = d->dl + i; + + s = p = malloc(1024); + e = s + 1024; + + p = seprint(p, e, "addr: "); + for(i = 0; i < l->nea; i++) + p = seprint(p, e, "%E ", l->eatab[i]); + p = seprint(p, e, "\n"); + p = seprint(p, e, "npkt: %uld\n", l->npkt); + p = seprint(p, e, "resent: %uld\n", l->resent); + p = seprint(p, e, "flag: "); p = pflag(p, e, l->flag); + p = seprint(p, e, "rttavg: %uld\n", Tk2ms(l->rttavg)); + p = seprint(p, e, "mintimer: %uld\n", Tk2ms(l->mintimer)); + + p = seprint(p, e, "nl path: %s\n", l->nl->path); + p = seprint(p, e, "nl ea: %E\n", l->nl->ea); + p = seprint(p, e, "nl flag: "); p = pflag(p, e, l->flag); + p = seprint(p, e, "nl lostjumbo: %d\n", l->nl->lostjumbo); + p = seprint(p, e, "nl datamtu: %d\n", l->nl->datamtu); + + if(p - s < len) + len = p - s; + i = readstr(off, db, len, s); + free(s); + return i; +} + +static long +topctlread(Chan *d0, void *db, int len, int off) +{ + int i; + char *s, *p, *e; + Netlink *n; + + s = p = malloc(1024); + e = s + 1024; + + p = seprint(p, e, "debug: %d\n", debug); + p = seprint(p, e, "autodiscover: %d\n", autodiscover); + p = seprint(p, e, "rediscover: %d\n", rediscover); + + for(i = 0; i < Nnetlink; i++){ + n = netlinks.nl+i; + if(n->cc == 0) + continue; + p = seprint(p, e, "if%d path: %s\n", i, n->path); + p = seprint(p, e, "if%d ea: %E\n", i, n->ea); + p = seprint(p, e, "if%d flag: ", i); p = pflag(p, e, n->flag); + p 
= seprint(p, e, "if%d lostjumbo: %d\n", i, n->lostjumbo); + p = seprint(p, e, "if%d datamtu: %d\n", i, n->datamtu); + } + + if(p - s < len) + len = p - s; + i = readstr(off, db, len, s); + free(s); + return i; +} + +static long +aoeread(Chan *c, void *db, long n, vlong off) +{ + switch(TYPE(c->qid)){ + default: + error(Eperm); + case Qzero: + case Qtopdir: + case Qunitdir: + case Qdevlinkdir: + return devdirread(c, db, n, 0, 0, aoegen); + case Qtopctl: + return topctlread(c, db, n, off); + case Qtoplog: + return eventlogread(db, n); + case Qctl: + case Qdata: + case Qconfig: + case Qident: + return unitread(c, db, n, off); + case Qdevlink: + return devlinkread(c, db, n, off); + } +} + +static long +configwrite(Aoedev *d, void *db, long len) +{ + char *s; + Aoeqc *ch; + Frame *f; + Srb *srb; + + if(!UP(d)) + error(Enotup); + if(len > sizeof d->config) + error(Etoobig); + srb = srballoc(len); + s = malloc(len); + memmove(s, db, len); + if(waserror()){ + srbfree(srb); + free(s); + nexterror(); + } + for (;;) { + QLOCK(d); + if(waserror()){ + QUNLOCK(d); + nexterror(); + } + f = freeframe(d); + if(f != nil) + break; + poperror(); + QUNLOCK(d); + if(waserror()) + nexterror(); + tsleep(&up->sleep, return0, 0, 100); + poperror(); + } + f->nhdr = Szaoeqc; + memset(f->hdr, 0, f->nhdr); + ch = (Aoeqc*)f->hdr; + if(hset(d, f, (Aoehdr*)ch, ACconfig) == -1) + return 0; + f->srb = srb; + f->dp = s; + ch->verccmd = AQCfset; + hnputs(ch->cslen, len); + d->nout++; + srb->nout++; + f->dl->npkt++; + f->dlen = len; + /* + * these refer to qlock & waserror in the above for loop. + * there's still the first waserror outstanding. 
+ */ + poperror(); + QUNLOCK(d); + + devtab[f->nl->dc->type]->bwrite(f->nl->dc, allocfb(f), 0); + SLEEP(srb, srbready, srb); + if(srb->error) + error(srb->error); + + QLOCK(d); + if(waserror()){ + QUNLOCK(d); + nexterror(); + } + memmove(d->config, s, len); + d->nconfig = len; + poperror(); + QUNLOCK(d); + + poperror(); /* pop first waserror */ + + srbfree(srb); + memmove(db, s, len); + free(s); + return len; +} + +static int +getmtu(Chan *m) +{ + int n, mtu; + char buf[36]; + + mtu = 1514; + if(m == nil || waserror()) + return mtu; + n = devtab[m->type]->read(m, buf, sizeof buf - 1, 0); + poperror(); + if(n > 12){ + buf[n] = 0; + mtu = strtoul(buf + 12, 0, 0); + } + return mtu; +} + +static int +devmaxdata(Aoedev *d) +{ + int i, m, mtu; + Devlink *l; + Netlink *n; + + mtu = 100000; + for(i = 0; i < d->ndl; i++){ + l = d->dl + i; + n = l->nl; + if((l->flag & Dup) == 0 || (n->flag & Dup) == 0) + continue; + m = getmtu(n->mtu); + if(m > l->datamtu) + m = l->datamtu; + if(m < mtu) + mtu = m; + } + if(mtu == 100000) + mtu = 1514; + mtu -= Szaoeata; + mtu -= mtu % Aoesectsz; + return mtu; +} + +static int +toggle(char *s, int init) +{ + if(s == nil) + return init ^ 1; + return strcmp(s, "on") == 0; +} + +static void ataident(Aoedev*); + +static long +unitctlwrite(Aoedev *d, void *db, long n) +{ + uint maxbcnt, m; + uvlong bsize; + enum { + Failio, + Ident, + Jumbo, + Maxbno, + Mtu, + Setsize, + }; + Cmdbuf *cb; + Cmdtab *ct; + static Cmdtab cmds[] = { + {Failio, "failio", 1 }, + {Ident, "identify", 1 }, + {Jumbo, "jumbo", 0 }, + {Maxbno, "maxbno", 0 }, + {Mtu, "mtu", 0 }, + {Setsize, "setsize", 0 }, + }; + + cb = parsecmd(db, n); + QLOCK(d); + if(waserror()){ + QUNLOCK(d); + free(cb); + nexterror(); + } + ct = lookupcmd(cb, cmds, nelem(cmds)); + switch(ct->index){ + case Failio: + downdev(d, "i/o failure"); + break; + case Ident: + ataident(d); + break; + case Jumbo: + m = 0; + if(d->flag & Djumbo) + m = 1; + toggle(cb->f[1], m); + if(m) + d->flag |= Djumbo; + else + 
d->flag &= ~Djumbo; + break; + case Maxbno: + case Mtu: + maxbcnt = devmaxdata(d); + if(cb->nf > 2) + error(Ecmdargs); + if(cb->nf == 2){ + m = strtoul(cb->f[1], 0, 0); + if(ct->index == Maxbno) + m *= Aoesectsz; + else{ + m -= Szaoeata; + m &= ~(Aoesectsz-1); + } + if(m == 0 || m > maxbcnt) + cmderror(cb, "invalid mtu"); + maxbcnt = m; + d->maxmtu = m; + } else + d->maxmtu = Maxmtu; + d->maxbcnt = maxbcnt; + break; + case Setsize: + bsize = d->realbsize; + if(cb->nf > 2) + error(Ecmdargs); + if(cb->nf == 2){ + bsize = strtoull(cb->f[1], 0, 0); + if(bsize % Aoesectsz) + cmderror(cb, "disk size must be sector aligned"); + } + d->bsize = bsize; + break; + default: + cmderror(cb, "unknown aoe control message"); + } + poperror(); + QUNLOCK(d); + free(cb); + return n; +} + +static long +unitwrite(Chan *c, void *db, long n, vlong off) +{ + long rv; + char *buf; + Aoedev *d; + + d = unit2dev(UNIT(c->qid)); + switch(TYPE(c->qid)){ + default: + error(Ebadarg); + case Qctl: + return unitctlwrite(d, db, n); + case Qident: + error(Eperm); + case Qdata: + return rw(d, Write, db, n, off); + case Qconfig: + if(off + n > sizeof d->config) + error(Etoobig); + buf = malloc(sizeof d->config); + if(waserror()){ + free(buf); + nexterror(); + } + memmove(buf, d->config, d->nconfig); + memmove(buf + off, db, n); + rv = configwrite(d, buf, n + off); + poperror(); + free(buf); + return rv; + } +} + +static Netlink* +addnet(char *path, Chan *cc, Chan *dc, Chan *mtu, uchar *ea) +{ + Netlink *nl, *e; + + LOCK(&netlinks); + if(waserror()){ + UNLOCK(&netlinks); + nexterror(); + } + nl = netlinks.nl; + e = nl + nelem(netlinks.nl); + for(; nl < e && nl->cc; nl++) + continue; + if(nl == e) + error("out of netlink structures"); + nl->cc = cc; + nl->dc = dc; + nl->mtu = mtu; + strncpy(nl->path, path, sizeof nl->path); + memmove(nl->ea, ea, sizeof nl->ea); + poperror(); + nl->flag |= Dup; + UNLOCK(&netlinks); + return nl; +} + +static int +newunit(void) +{ + int x; + + LOCK(&units); + if(units.ref == 
Maxunits) + x = -1; + else + x = units.ref++; + UNLOCK(&units); + return x; +} + +static int +dropunit(void) +{ + int x; + + LOCK(&units); + x = --units.ref; + UNLOCK(&units); + return x; +} + +/* + * always allocate max frames. maxout may change. + */ +static Aoedev* +newdev(long major, long minor, int n) +{ + Aoedev *d; + Frame *f, *e; + + d = malloc(sizeof *d); + f = malloc(sizeof *f*Maxframes); + if(!d || !f) { + free(d); + free(f); + error("aoe device allocation failure"); + } + d->nframes = n; + d->frames = f; + for (e = f + Maxframes; f < e; f++) + f->tag = Tfree; + d->maxout = n; + d->major = major; + d->minor = minor; + d->maxbcnt = Dbcnt; + d->flag = Djumbo; + d->maxmtu = Maxmtu; + d->unit = newunit(); /* bzzt. inaccurate if units removed */ + if(d->unit == -1){ + free(d); + free(d->frames); + error("too many units"); + } + d->dl = d->dltab; + return d; +} + +static Aoedev* +mm2dev(int major, int minor) +{ + Aoedev *d; + + RLOCK(&devs); + for(d = devs.d; d; d = d->next) + if(d->major == major && d->minor == minor){ + RUNLOCK(&devs); + return d; + } + RUNLOCK(&devs); + eventlog("mm2dev: %d.%d not found\n", major, minor); + return nil; +} + +/* Find the device in our list. 
If not known, add it */ +static Aoedev* +getdev(long major, long minor, int n) +{ + Aoedev *d; + + if(major == 0xffff || minor == 0xff) + return 0; + WLOCK(&devs); + if(waserror()){ + WUNLOCK(&devs); + nexterror(); + } + for(d = devs.d; d; d = d->next) + if(d->major == major && d->minor == minor) + break; + if(d == nil) { + d = newdev(major, minor, n); + d->next = devs.d; + devs.d = d; + } + poperror(); + WUNLOCK(&devs); + return d; +} + +static ushort +gbit16(void *a) +{ + uchar *i; + + i = a; + return i[1] << 8 | i[0]; +} + +static ulong +gbit32(void *a) +{ + ulong j; + uchar *i; + + i = a; + j = i[3] << 24; + j |= i[2] << 16; + j |= i[1] << 8; + j |= i[0]; + return j; +} + +static uvlong +gbit64(void *a) +{ + uchar *i; + + i = a; + return (uvlong)gbit32(i+4) << 32 | gbit32(a); +} + +static void +ataident(Aoedev *d) +{ + Aoeata *a; + Block *b; + Frame *f; + + f = freeframe(d); + if(f == nil) + return; + f->nhdr = Szaoeata; + memset(f->hdr, 0, f->nhdr); + a = (Aoeata*)f->hdr; + if(hset(d, f, (Aoehdr*)a, ACata) == -1) + return; + f->srb = srbkalloc(0, 0); + a->cmdstat = Cid; /* ata 6, page 110 */ + a->scnt = 1; + a->lba[3] = 0xa0; + d->nout++; + f->dl->npkt++; + f->bcnt = 512; + f->dlen = 0; + b = allocfb(f); + devtab[f->nl->dc->type]->bwrite(f->nl->dc, b, 0); +} + +static int +newdlea(Devlink *l, uchar *ea) +{ + int i; + uchar *t; + + for(i = 0; i < Nea; i++){ + t = l->eatab[i]; + if(i == l->nea){ + memmove(t, ea, Eaddrlen); + return l->nea++; + } + if(memcmp(t, ea, Eaddrlen) == 0) + return i; + } + return -1; +} + +static Devlink* +newdevlink(Aoedev *d, Netlink *n, Aoeqc *c) +{ + int i; + Devlink *l; + + for(i = 0; i < Ndevlink; i++){ + l = d->dl + i; + if(i == d->ndl){ + d->ndl++; + newdlea(l, c->src); + l->datamtu = c->scnt*Aoesectsz; + l->nl = n; + l->flag |= Dup; + l->mintimer = Rtmin; + l->rttavg = Rtmax; + return l; + } + if(l->nl == n){ + newdlea(l, c->src); + l->datamtu = c->scnt*Aoesectsz; + l->flag |= Dup; + return l; + } + } + eventlog("%æ: out of 
links: %s:%E to %E\n", d, n->path, n->ea, c->src); + return 0; +} + +static void +errrsp(Block *b, char *s) +{ + int n; + Aoedev *d; + Aoehdr *h; + Frame *f; + + h = (Aoehdr*)b->rp; + n = nhgetl(h->tag); + if(n == Tmgmt || n == Tfree) + return; + d = mm2dev(nhgets(h->major), h->minor); + if(d == 0) + return; + if(f = getframe(d, n)) + frameerror(d, f, s); +} + +static void +qcfgrsp(Block *b, Netlink *nl) +{ + int major, cmd, cslen, blen; + unsigned n; + Aoedev *d; + Aoeqc *ch; + Devlink *l; + Frame *f; + + ch = (Aoeqc*)b->rp; + major = nhgets(ch->major); + n = nhgetl(ch->tag); + if(n != Tmgmt){ + d = mm2dev(major, ch->minor); + if(d == nil) + return; + QLOCK(d); + f = getframe(d, n); + if(f == nil){ + QUNLOCK(d); + eventlog("%æ: unknown response tag %ux\n", d, n); + return; + } + cslen = nhgets(ch->cslen); + blen = BLEN(b) - Szaoeqc; + if(cslen < blen) + eventlog("%æ: cfgrsp: tag %.8ux oversized %d %d\n", + d, n, cslen, blen); + if(cslen > blen){ + eventlog("%æ: cfgrsp: tag %.8ux runt %d %d\n", + d, n, cslen, blen); + cslen = blen; + } + memmove(f->dp, ch + 1, cslen); + f->srb->nout--; + WAKEUP(f->srb); + d->nout--; + f->srb = nil; + f->tag = Tfree; + QUNLOCK(d); + return; + } + + cmd = ch->verccmd & 0xf; + if(cmd != 0){ + eventlog("aoe%d.%d: cfgrsp: bad command %d\n", major, ch->minor, cmd); + return; + } + n = nhgets(ch->bufcnt); + if(n > Maxframes) + n = Maxframes; + + if(waserror()){ + eventlog("getdev: %d.%d ignored: %s\n", major, ch->minor, up->errstr); + return; + } + d = getdev(major, ch->minor, n); + poperror(); + if(d == 0) + return; + + QLOCK(d); + *up->errstr = 0; + if(waserror()){ + QUNLOCK(d); + eventlog("%æ: %s\n", d, up->errstr); + nexterror(); + } + + l = newdevlink(d, nl, ch); /* add this interface. 
*/ + + d->fwver = nhgets(ch->fwver); + n = nhgets(ch->cslen); + if(n > sizeof d->config) + n = sizeof d->config; + d->nconfig = n; + memmove(d->config, ch + 1, n); + + /* manually set mtu may be reset lower if conditions warrant */ + if(l){ + n = devmaxdata(d); + if(!(d->flag & Djumbo)) + n = Dbcnt; + if(n > d->maxmtu) + n = d->maxmtu; + if(n != d->maxbcnt){ + eventlog("%æ: setting %d byte mtu on %s:%E\n", + d, n, nl->path, nl->ea); + d->maxbcnt = n; + } + } + if(d->nopen == 0) + ataident(d); + poperror(); + QUNLOCK(d); +} + +static void +idmove(char *p, ushort *a, unsigned n) +{ + int i; + char *op, *e; + + op = p; + for(i = 0; i < n / 2; i++){ + *p++ = a[i] >> 8; + *p++ = a[i]; + } + *p = 0; + while(p > op && *--p == ' ') + *p = 0; + e = p; + p = op; + while(*p == ' ') + p++; + memmove(op, p, n - (e - p)); +} + +static vlong +aoeidentify(Aoedev *d, ushort *id) +{ + int i; + vlong s; + + d->flag &= ~(Dllba|Dpower|Dsmart|Dnop|Dup); + + i = gbit16(id+83) | gbit16(id+86); + if(i & (1<<10)){ + d->flag |= Dllba; + s = gbit64(id+100); + }else + s = gbit32(id+60); + + i = gbit16(id+83); + if((i>>14) == 1) { + if(i & (1<<3)) + d->flag |= Dpower; + i = gbit16(id+82); + if(i & 1) + d->flag |= Dsmart; + if(i & (1<<14)) + d->flag |= Dnop; + } +// eventlog("%æ up\n", d); + d->flag |= Dup; + memmove(d->ident, id, sizeof d->ident); + return s; +} + +static void +newvers(Aoedev *d) +{ + LOCK(&drivevers); + d->vers = drivevers.ref++; + UNLOCK(&drivevers); +} + +static int +identify(Aoedev *d, ushort *id) +{ + vlong osectors, s; + uchar oserial[21]; + + s = aoeidentify(d, id); + if(s == -1) + return -1; + osectors = d->realbsize; + memmove(oserial, d->serial, sizeof d->serial); + + idmove(d->serial, id+10, 20); + idmove(d->firmware, id+23, 8); + idmove(d->model, id+27, 40); + + s *= Aoesectsz; + if(osectors != s || memcmp(oserial, d->serial, sizeof oserial)){ + d->bsize = s; + d->realbsize = s; +// d->mediachange = 1; + newvers(d); + } + return 0; +} + +static void +atarsp(Block 
*b) +{ + unsigned n; + short major; + Aoeata *ahin, *ahout; + Aoedev *d; + Frame *f; + Srb *srb; + + ahin = (Aoeata*)b->rp; + major = nhgets(ahin->major); + d = mm2dev(major, ahin->minor); + if(d == nil) + return; + QLOCK(d); + if(waserror()){ + QUNLOCK(d); + nexterror(); + } + n = nhgetl(ahin->tag); + f = getframe(d, n); + if(f == nil){ + dprint("%æ: unexpected response; tag %ux\n", d, n); + goto bail; + } + rtupdate(f->dl, tsince(f->tag)); + ahout = (Aoeata*)f->hdr; + srb = f->srb; + + if(ahin->cmdstat & 0xa9){ + eventlog("%æ: ata error cmd %.2ux stat %.2ux\n", + d, ahout->cmdstat, ahin->cmdstat); + if(srb) + srb->error = Eio; + } else { + n = ahout->scnt * Aoesectsz; + switch(ahout->cmdstat){ + case Crd: + case Crdext: + if(BLEN(b) - Szaoeata < n){ + eventlog("%æ: runt read blen %ld expect %d\n", + d, BLEN(b), n); + goto bail; + } + memmove(f->dp, b->rp + Szaoeata, n); + case Cwr: + case Cwrext: + if(n > Dbcnt) + f->nl->lostjumbo = 0; + if(f->bcnt -= n){ + f->lba += n / Aoesectsz; + f->dp = (uchar*)f->dp + n; + resend(d, f); + goto bail; + } + break; + case Cid: + if(BLEN(b) - Szaoeata < 512){ + eventlog("%æ: runt identify blen %ld expect %d\n", + d, BLEN(b), n); + goto bail; + } + identify(d, (ushort*)(b->rp + Szaoeata)); + break; + default: + eventlog("%æ: unknown ata command %.2ux \n", + d, ahout->cmdstat); + } + } + + if(srb && --srb->nout == 0 && srb->len == 0) + WAKEUP(srb); + f->srb = nil; + f->tag = Tfree; + d->nout--; + + work(d); +bail: + poperror(); + QUNLOCK(d); +} + +static void +netrdaoeproc(void *v) +{ + int idx; + char name[Maxpath+1], *s; + Aoehdr *h; + Block *b; + Netlink *nl; + + nl = (Netlink*)v; + idx = nl - netlinks.nl; + netlinks.reader[idx] = 1; + kstrcpy(name, nl->path, Maxpath); + + if(waserror()){ + eventlog("netrdaoe@%s: exiting: %s\n", name, up->errstr); + netlinks.reader[idx] = 0; + wakeup(netlinks.rendez + idx); + pexit(up->errstr, 1); + } + if(autodiscover) + discover(0xffff, 0xff); + for (;;) { + if(!(nl->flag & Dup)) + 
error("netlink is down"); + if(nl->dc == nil) + panic("netrdaoe: nl->dc == nil"); + b = devtab[nl->dc->type]->bread(nl->dc, 1<<16, 0); + if(b == nil) + error("network read"); + h = (Aoehdr*)b->rp; + if(h->verflag & AFrsp) + if(s = aoeerror(h)){ + eventlog("%s: %s\n", nl->path, s); + errrsp(b, s); + }else if(h->cmd == ACata) + atarsp(b); + else if(h->cmd == ACconfig) + qcfgrsp(b, nl); + else if((h->cmd & 0xf0) == 0){ + eventlog("%s: unknown cmd %d\n", + nl->path, h->cmd); + errrsp(b, "unknown command"); + } + freeb(b); + } +} + +static void +getaddr(char *path, uchar *ea) +{ + int n; + char buf[2*Eaddrlen+1]; + Chan *c; + + uprint("%s/addr", path); + c = namec(up->genbuf, Aopen, OREAD, 0); + if(waserror()) { + cclose(c); + nexterror(); + } + if(c == nil) + panic("æ: getaddr: c == nil"); + n = devtab[c->type]->read(c, buf, sizeof buf-1, 0); + poperror(); + cclose(c); + buf[n] = 0; + if(parseether(ea, buf) < 0) + error("parseether failure"); +} + +static void +netbind(char *path) +{ + char addr[Maxpath]; + uchar ea[2*Eaddrlen+1]; + Chan *dc, *cc, *mtu; + Netlink *nl; + + snprint(addr, sizeof addr, "%s!0x%x", path, Aoetype); + dc = chandial(addr, nil, nil, &cc); + snprint(addr, sizeof addr, "%s/mtu", path); + if(waserror()) + mtu = nil; + else { + mtu = namec(addr, Aopen, OREAD, 0); + poperror(); + } + + if(waserror()){ + cclose(dc); + cclose(cc); + if(mtu) + cclose(mtu); + nexterror(); + } + if(dc == nil || cc == nil) + error(Enonexist); + getaddr(path, ea); + nl = addnet(path, cc, dc, mtu, ea); + snprint(addr, sizeof addr, "netrdaoe@%s", path); + kproc(addr, netrdaoeproc, nl); + poperror(); +} + +static int +unbound(void *v) +{ + return *(int*)v != 0; +} + +static void +netunbind(char *path) +{ + int i, idx; + Aoedev *d, *p, *next; + Chan *dc, *cc; + Devlink *l; + Frame *f; + Netlink *n, *e; + + n = netlinks.nl; + e = n + nelem(netlinks.nl); + + LOCK(&netlinks); + for(; n < e; n++) + if(n->dc && strcmp(n->path, path) == 0) + break; + UNLOCK(&netlinks); + if(n == e) + 
error("device not bound"); + + /* + * hunt down devices using this interface; disable + * this also terminates the reader. + */ + idx = n - netlinks.nl; + WLOCK(&devs); + for(d = devs.d; d; d = d->next){ + QLOCK(d); + for(i = 0; i < d->ndl; i++){ + l = d->dl + i; + if(l->nl == n) + l->flag &= ~Dup; + } + QUNLOCK(d); + } + n->flag &= ~Dup; + WUNLOCK(&devs); + + /* confirm reader is down. */ + while(waserror()) + ; + sleep(netlinks.rendez + idx, unbound, netlinks.reader + idx); + poperror(); + + /* reschedule packets. */ + WLOCK(&devs); + for(d = devs.d; d; d = d->next){ + QLOCK(d); + for(i = 0; i < d->nframes; i++){ + f = d->frames + i; + if(f->tag != Tfree && f->nl == n) + resend(d, f); + } + QUNLOCK(d); + } + WUNLOCK(&devs); + + /* squeeze devlink pool. (we assert nobody is using them now) */ + WLOCK(&devs); + for(d = devs.d; d; d = d->next){ + QLOCK(d); + for(i = 0; i < d->ndl; i++){ + l = d->dl + i; + if(l->nl == n) + memmove(l, l + 1, sizeof *l * (--d->ndl - i)); + } + QUNLOCK(d); + } + WUNLOCK(&devs); + + /* close device link. 
*/ + LOCK(&netlinks); + dc = n->dc; + cc = n->cc; + if(n->mtu) + cclose(n->mtu); + memset(n, 0, sizeof *n); + UNLOCK(&netlinks); + + cclose(dc); + cclose(cc); + + /* squeeze orphan devices */ + WLOCK(&devs); + for(p = d = devs.d; d; d = next){ + next = d->next; + if(d->ndl > 0){ + p = d; + continue; + } + QLOCK(d); + downdev(d, "orphan"); + QUNLOCK(d); + if(p != devs.d) + p->next = next; + else{ + devs.d = next; + p = devs.d; + } + free(d->frames); + free(d); + dropunit(); + } + WUNLOCK(&devs); +} + +static void +strtoss(char *f, ushort *shelf, ushort *slot) +{ + ulong sh; + char *s; + + *shelf = 0xffff; + *slot = 0xff; + if(!f) + return; + *shelf = sh = strtol(f, &s, 0); + if(s == f || sh > 0xffff) + error("bad shelf"); + f = s; + if(*f++ == '.'){ + *slot = strtol(f, &s, 0); + if(s == f || *slot > 0xff) + error("bad shelf"); + }else + *slot = 0xff; +} + +static void +discoverstr(char *f) +{ + ushort shelf, slot; + + strtoss(f, &shelf, &slot); + discover(shelf, slot); +} + +static void +removedev(Aoedev *d) +{ + int i; + Aoedev *p; + + WLOCK(&devs); + p = 0; + if(d != devs.d) + for(p = devs.d; p; p = p->next) + if(p->next == d) + break; + QLOCK(d); + d->flag &= ~Dup; + newvers(d); + d->ndl = 0; + QUNLOCK(d); + for(i = 0; i < d->nframes; i++) + frameerror(d, d->frames+i, Enotup); + + if(p) + p->next = d->next; + else + devs.d = d->next; + free(d->frames); + free(d); + dropunit(); + WUNLOCK(&devs); +} + + +static void +aoeremove(Chan *c) +{ + switch(TYPE(c->qid)){ + default: + case Qzero: + case Qtopdir: + case Qtoplog: + case Qtopctl: + case Qctl: + case Qdata: + case Qconfig: + case Qident: + error(Eperm); + case Qunitdir: + removedev(unit2dev(UNIT(c->qid))); + break; + } +} + +static void +removestr(char *f) +{ + ushort shelf, slot; + Aoedev *d; + + strtoss(f, &shelf, &slot); + WLOCK(&devs); + for(d = devs.d; d; d = d->next) + if(shelf == d->major && slot == d->minor){ + WUNLOCK(&devs); /* BOTCH */ + removedev(d); + return; + } + WUNLOCK(&devs); + error("device 
not bound"); +} + +static long +topctlwrite(void *db, long n) +{ + enum { + Autodiscover, + Bind, + Debug, + Discover, + Closewait, + Rediscover, + Remove, + Unbind, + }; + char *f; + Cmdbuf *cb; + Cmdtab *ct; + static Cmdtab cmds[] = { + { Autodiscover, "autodiscover", 0 }, + { Bind, "bind", 2 }, + { Debug, "debug", 0 }, + { Discover, "discover", 0 }, + { Rediscover, "rediscover", 0 }, + { Remove, "remove", 2 }, + { Unbind, "unbind", 2 }, + }; + + cb = parsecmd(db, n); + if(waserror()){ + free(cb); + nexterror(); + } + ct = lookupcmd(cb, cmds, nelem(cmds)); + f = cb->f[1]; + switch(ct->index){ + case Autodiscover: + autodiscover = toggle(f, autodiscover); + break; + case Bind: + netbind(f); + break; + case Debug: + debug = toggle(f, debug); + break; + case Discover: + discoverstr(f); + break; + case Rediscover: + rediscover = toggle(f, rediscover); + break; + case Remove: + removestr(f); /* depricated */ + break; + case Unbind: + netunbind(f); + break; + default: + cmderror(cb, "unknown aoe control message"); + } + poperror(); + free(cb); + return n; +} + +static long +aoewrite(Chan *c, void *db, long n, vlong off) +{ + switch(TYPE(c->qid)){ + default: + case Qzero: + case Qtopdir: + case Qunitdir: + case Qtoplog: + error(Eperm); + case Qtopctl: + return topctlwrite(db, n); + case Qctl: + case Qdata: + case Qconfig: + case Qident: + return unitwrite(c, db, n, off); + } +} + +Dev aoedevtab = { + L'æ', + "aoe", + + devreset, + devinit, + devshutdown, + aoeattach, + aoewalk, + aoestat, + aoeopen, + devcreate, + aoeclose, + aoeread, + devbread, + aoewrite, + devbwrite, + aoeremove, + devwstat, + devpower, + devconfig, +}; diff --git a/src/9vx/a/devcons.c b/src/9vx/a/devcons.c @@ -784,6 +784,7 @@ consread(Chan *c, void *buf, long n, vlong off) while(!qcanread(lineq)){ if(qread(kbdq, &ch, 1) == 0) continue; + //XXX TODO: startup blocks here send = 0; if(ch == 0){ /* flush output on rawoff -> rawon */ diff --git a/src/9vx/a/devether.c b/src/9vx/a/devether.c @@ -0,0 
+1,542 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "io.h" +#include "ureg.h" +#include "error.h" +#include "netif.h" + +#include "etherif.h" + +#define MEMSIZE (256<<20) // same as ../mmu.c:29 (TODO: var) + +static Ether *etherxx[MaxEther]; + +Chan* +etherattach(char* spec) +{ + ulong ctlrno; + char *p; + Chan *chan; + + ctlrno = 0; + if(spec && *spec){ + ctlrno = strtoul(spec, &p, 0); + if((ctlrno == 0 && p == spec) || *p || (ctlrno >= MaxEther)) + error(Ebadarg); + } + if(etherxx[ctlrno] == 0) + error(Enodev); + + chan = devattach('l', spec); + if(waserror()){ + chanfree(chan); + nexterror(); + } + chan->dev = ctlrno; + if(etherxx[ctlrno]->attach) + etherxx[ctlrno]->attach(etherxx[ctlrno]); + poperror(); + return chan; +} + +static Walkqid* +etherwalk(Chan* chan, Chan* nchan, char** name, int nname) +{ + return netifwalk(&etherxx[chan->dev]->ni, chan, nchan, name, nname); +} + +static int +etherstat(Chan* chan, uchar* dp, int n) +{ + return netifstat(&etherxx[chan->dev]->ni, chan, dp, n); +} + +static Chan* +etheropen(Chan* chan, int omode) +{ + return netifopen(&etherxx[chan->dev]->ni, chan, omode); +} + +static void +ethercreate(Chan* ch, char* c, int i, ulong ul) +{ +} + +static void +etherclose(Chan* chan) +{ + netifclose(&etherxx[chan->dev]->ni, chan); +} + +static long +etherread(Chan* chan, void* buf, long n, vlong off) +{ + Ether *ether; + ulong offset = off; + + ether = etherxx[chan->dev]; + if((chan->qid.type & QTDIR) == 0 && ether->ifstat){ + /* + * With some controllers it is necessary to reach + * into the chip to extract statistics. 
+ */ + if(NETTYPE(chan->qid.path) == Nifstatqid) + return ether->ifstat(ether, buf, n, offset); + else if(NETTYPE(chan->qid.path) == Nstatqid) + ether->ifstat(ether, buf, 0, offset); + } + + return netifread(&ether->ni, chan, buf, n, offset); +} + +static Block* +etherbread(Chan* chan, long n, ulong offset) +{ + return netifbread(&etherxx[chan->dev]->ni, chan, n, offset); +} + +static int +etherwstat(Chan* chan, uchar* dp, int n) +{ + return netifwstat(&etherxx[chan->dev]->ni, chan, dp, n); +} + +static void +etherrtrace(Netfile* f, Etherpkt* pkt, int len) +{ + int i, n; + Block *bp; + + if(qwindow(f->in) <= 0) + return; + if(len > 58) + n = 58; + else + n = len; + bp = iallocb(64); + if(bp == nil) + return; + memmove(bp->wp, pkt->d, n); + i = TK2MS(MACHP(0)->tscticks); + bp->wp[58] = len>>8; + bp->wp[59] = len; + bp->wp[60] = i>>24; + bp->wp[61] = i>>16; + bp->wp[62] = i>>8; + bp->wp[63] = i; + bp->wp += 64; + qpass(f->in, bp); +} + +Block* +etheriq(Ether* ether, Block* bp, int fromwire) +{ + Etherpkt *pkt; + ushort type; + int len, multi, tome, fromme; + Netfile **ep, *f, **fp, *fx; + Block *xbp; + + ether->ni.inpackets++; + + pkt = (Etherpkt*)bp->rp; + len = BLEN(bp); + type = (pkt->type[0]<<8)|pkt->type[1]; + fx = 0; + ep = &ether->ni.f[Ntypes]; + + multi = pkt->d[0] & 1; + /* check for valid multicast addresses */ + if(multi && memcmp(pkt->d, ether->ni.bcast, sizeof(pkt->d)) != 0 && ether->ni.prom == 0){ + if(!activemulti(&ether->ni, pkt->d, sizeof(pkt->d))){ + if(fromwire){ + freeb(bp); + bp = 0; + } + return bp; + } + } + + /* is it for me? */ + tome = memcmp(pkt->d, ether->ea, sizeof(pkt->d)) == 0; + fromme = memcmp(pkt->s, ether->ea, sizeof(pkt->s)) == 0; + // if(tome||fromme) + // iprint("XXX PACK: %2.2ux:%2.2ux:%2.2ux:%2.2ux:%2.2ux:%2.2ux -> %2.2ux:%2.2ux:%2.2ux:%2.2ux:%2.2ux:%2.2ux%s[%d]%s\n", + // pkt->s[0], pkt->s[1], pkt->s[2],pkt->s[3], pkt->s[4], pkt->s[5], + // pkt->d[0], pkt->d[1], pkt->d[2],pkt->d[3], pkt->d[4], pkt->d[5], + // (tome ? 
" <<--" : ""), len, (fromme ? " -->>" : "")); + /* + * Multiplex the packet to all the connections which want it. + * If the packet is not to be used subsequently (fromwire != 0), + * attempt to simply pass it into one of the connections, thereby + * saving a copy of the data (usual case hopefully). + */ + for(fp = ether->ni.f; fp < ep; fp++){ + if((f = *fp) != nil) + if(f->type == type || f->type < 0) + if(tome || multi || f->prom){ + /* Don't want to hear bridged packets */ + if(f->bridge && !fromwire && !fromme) + continue; + if(!f->headersonly){ + if(fromwire && fx == 0) + fx = f; + else if((xbp = iallocb(len)) != nil){ + memmove(xbp->wp, pkt, len); + xbp->wp += len; + if(qpass(f->in, xbp) < 0) + ether->ni.soverflows++; + } + else + ether->ni.soverflows++; + } + else + etherrtrace(f, pkt, len); + } + } + + if(fx){ + if(qpass(fx->in, bp) < 0) + ether->ni.soverflows++; + return 0; + } + if(fromwire){ + freeb(bp); + return 0; + } + + return bp; +} + +static int +etheroq(Ether* ether, Block* bp) +{ + int len, loopback, s; + Etherpkt *pkt; + + ether->ni.outpackets++; + + /* + * Check if the packet has to be placed back onto the input queue, + * i.e. if it's a loopback or broadcast packet or the interface is + * in promiscuous mode. + * If it's a loopback packet indicate to etheriq that the data isn't + * needed and return, etheriq will pass-on or free the block. + * To enable bridging to work, only packets that were originated + * by this interface are fed back. 
+ */ + pkt = (Etherpkt*)bp->rp; + len = BLEN(bp); + loopback = memcmp(pkt->d, ether->ea, sizeof(pkt->d)) == 0; + if(loopback || memcmp(pkt->d, ether->ni.bcast, sizeof(pkt->d)) == 0 || ether->ni.prom){ + s = splhi(); + etheriq(ether, bp, 0); + splx(s); + } + + if(!loopback){ + qbwrite(ether->oq, bp); + if(ether->transmit != nil) + ether->transmit(ether); + } else + freeb(bp); + + return len; +} + +static long +etherwrite(Chan* chan, void* buf, long n, vlong v) +{ + Ether *ether; + Block *bp; + int nn, onoff; + Cmdbuf *cb; + + ether = etherxx[chan->dev]; + if(NETTYPE(chan->qid.path) != Ndataqid) { + nn = netifwrite(&ether->ni, chan, buf, n); + if(nn >= 0) + return nn; + cb = parsecmd(buf, n); + if(cb->f[0] && strcmp(cb->f[0], "nonblocking") == 0){ + if(cb->nf <= 1) + onoff = 1; + else + onoff = atoi(cb->f[1]); + qnoblock(ether->oq, onoff); + free(cb); + return n; + } + free(cb); + if(ether->ctl!=nil) + return ether->ctl(ether,buf,n); + + error(Ebadctl); + } + + if(n > ether->maxmtu) + error(Etoobig); + if(n < ether->minmtu) + error(Etoosmall); + + bp = allocb(n); + if(waserror()){ + freeb(bp); + nexterror(); + } + memmove(bp->rp, buf, n); + memmove(bp->rp+Eaddrlen, ether->ea, Eaddrlen); + poperror(); + bp->wp += n; + + return etheroq(ether, bp); +} + +static long +etherbwrite(Chan* chan, Block* bp, ulong u) +{ + Ether *ether; + long n; + + n = BLEN(bp); + if(NETTYPE(chan->qid.path) != Ndataqid){ + if(waserror()) { + freeb(bp); + nexterror(); + } + n = etherwrite(chan, bp->rp, n, 0); + poperror(); + freeb(bp); + return n; + } + ether = etherxx[chan->dev]; + + if(n > ether->maxmtu){ + freeb(bp); + error(Etoobig); + } + if(n < ether->minmtu){ + freeb(bp); + error(Etoosmall); + } + + return etheroq(ether, bp); +} + +static struct { + char* type; + int (*reset)(Ether*); +} cards[MaxEther+1]; + +void +addethercard(char* t, int (*r)(Ether*)) +{ + static int ncard; + + if(ncard == MaxEther) + panic("too many ether cards"); + cards[ncard].type = t; + cards[ncard].reset = r; + 
ncard++; +} + +int +parseether(uchar *to, char *from) +{ + char nip[4]; + char *p; + int i; + + p = from; + for(i = 0; i < Eaddrlen; i++){ + if(*p == 0) + return -1; + nip[0] = *p++; + if(*p == 0) + return -1; + nip[1] = *p++; + nip[2] = 0; + to[i] = strtoul(nip, 0, 16); + if(*p == ':') + p++; + } + return 0; +} + +static Ether* +etherprobe(int cardno, int ctlrno) +{ + int i, lg; + ulong mb, bsz; + Ether *ether; + char buf[128], name[32]; + + ether = malloc(sizeof(Ether)); + memset(ether, 0, sizeof(Ether)); + ether->ctlrno = ctlrno; + ether->tbdf = BUSUNKNOWN; + ether->ni.mbps = 100; + ether->minmtu = ETHERMINTU; + ether->maxmtu = ETHERMAXTU; + + if(cardno < 0){ + for(cardno = 0; cards[cardno].type; cardno++){ + for(i = 0; i < ether->isac.nopt; i++){ + if(strncmp(ether->isac.opt[i], "ea=", 3)) + continue; + if(parseether(ether->ea, &ether->isac.opt[i][3])) + memset(ether->ea, 0, Eaddrlen); + } + break; + } + } + + if(cardno >= MaxEther || cards[cardno].type == nil){ + free(ether); + return nil; + } + if(cards[cardno].reset(ether) < 0){ + free(ether); + return nil; + } + + /* + * IRQ2 doesn't really exist, it's used to gang the interrupt + * controllers together. A device set to IRQ2 will appear on + * the second interrupt controller as IRQ9. 
+ */ + if(ether->isac.irq == 2) + ether->isac.irq = 9; + snprint(name, sizeof(name), "ether%d", ctlrno); + + i = sprint(buf, "#l%d: %s: %dMbps port 0x%luX irq %d", + ctlrno, cards[cardno].type, ether->ni.mbps, ether->isac.port, ether->isac.irq); + if(ether->isac.mem) + i += sprint(buf+i, " addr 0x%luX", ether->isac.mem); + if(ether->isac.size) + i += sprint(buf+i, " size 0x%luX", ether->isac.size); + i += sprint(buf+i, ": %2.2ux%2.2ux%2.2ux%2.2ux%2.2ux%2.2ux", + ether->ea[0], ether->ea[1], ether->ea[2], + ether->ea[3], ether->ea[4], ether->ea[5]); + sprint(buf+i, "\n"); + print(buf); + + /* compute log10(ether->ni.mbps) into lg */ + for(lg = 0, mb = ether->ni.mbps; mb >= 10; lg++) + mb /= 10; + if (lg > 0) + lg--; + if (lg > 14) /* 2^(14+17) = 2⁳ⁱ */ + lg = 14; + /* allocate larger output queues for higher-speed interfaces */ + bsz = 1UL << (lg + 17); /* 2ⁱ⁷ = 128K, bsz = 2ⁿ × 128K */ + while (bsz > MEMSIZE && bsz >= 128*1024) + bsz /= 2; + + netifinit(&ether->ni, name, Ntypes, bsz); + while (ether->oq == nil && bsz >= 128*1024) { + bsz /= 2; + ether->oq = qopen(bsz, Qmsg, 0, 0); + ether->ni.limit = bsz; + } + if(ether->oq == nil) + panic("etherreset %s", name); + ether->ni.alen = Eaddrlen; + memmove(ether->ni.addr, ether->ea, Eaddrlen); + memset(ether->ni.bcast, 0xFF, Eaddrlen); + + // iprint("XXX EADDR: %2.2ux:%2.2ux:%2.2ux:%2.2ux:%2.2ux:%2.2ux\n", + // ether->ea[0], ether->ea[1], ether->ea[2],ether->ea[3], ether->ea[4], ether->ea[5]); + + return ether; +} + +static void +etherreset(void) +{ + Ether *ether; + int cardno, ctlrno; + + for(ctlrno = 0; ctlrno < MaxEther; ctlrno++){ + if((ether = etherprobe(-1, ctlrno)) == nil) + continue; + etherxx[ctlrno] = ether; + } + + cardno = ctlrno = 0; + while(cards[cardno].type != nil && ctlrno < MaxEther){ + if(etherxx[ctlrno] != nil){ + ctlrno++; + continue; + } + if((ether = etherprobe(cardno, ctlrno)) == nil){ + cardno++; + continue; + } + etherxx[ctlrno] = ether; + ctlrno++; + } +} + +static void +ethershutdown(void) +{ 
+ Ether *ether; + int i; + + for(i = 0; i < MaxEther; i++){ + ether = etherxx[i]; + if(ether == nil) + continue; + if(ether->shutdown == nil) { + print("#l%d: no shutdown fuction\n", i); + continue; + } + (*ether->shutdown)(ether); + } +} + + +#define POLY 0xedb88320 + +/* really slow 32 bit crc for ethers */ +ulong +ethercrc(uchar *p, int len) +{ + int i, j; + ulong crc, b; + + crc = 0xffffffff; + for(i = 0; i < len; i++){ + b = *p++; + for(j = 0; j < 8; j++){ + crc = (crc>>1) ^ (((crc^b) & 1) ? POLY : 0); + b >>= 1; + } + } + return crc; +} + +Dev etherdevtab = { + 'l', + "ether", + + etherreset, + devinit, + ethershutdown, + etherattach, + etherwalk, + etherstat, + etheropen, + ethercreate, + etherclose, + etherread, + etherbread, + etherwrite, + etherbwrite, + devremove, + etherwstat, +}; diff --git a/src/9vx/a/devsd.c b/src/9vx/a/devsd.c @@ -72,7 +72,7 @@ enum { ((p)<<PartSHIFT)|((t)<<TypeSHIFT)) -static void +void sdaddpart(SDunit* unit, char* name, uvlong start, uvlong end) { SDpart *pp; @@ -135,6 +135,19 @@ sdaddpart(SDunit* unit, char* name, uvlong start, uvlong end) pp->valid = 1; } +SDpart* +sdfindpart(SDunit *unit, char *name) +{ + int i; + + for(i=0; i<unit->npart; i++) { + if(strcmp(unit->part[i].perm.name, name) == 0){ + return &unit->part[i]; + } + } + return nil; +} + static void sddelpart(SDunit* unit, char* name) { @@ -198,6 +211,7 @@ sdinitpart(SDunit* unit) if(unit->sectors){ sdincvers(unit); sdaddpart(unit, "data", 0, unit->sectors); + partition(unit); #if 0 /* * Use partitions passed from boot program, diff --git a/src/9vx/a/dosfs.h b/src/9vx/a/dosfs.h @@ -0,0 +1,62 @@ +typedef struct Dosboot Dosboot; +typedef struct Dos Dos; +typedef struct Dosdir Dosdir; +typedef struct Dosfile Dosfile; +typedef struct Dospart Dospart; + +struct Dospart +{ + uchar flag; /* active flag */ + uchar shead; /* starting head */ + uchar scs[2]; /* starting cylinder/sector */ + uchar type; /* partition type */ + uchar ehead; /* ending head */ + uchar ecs[2]; /* 
/*
 * State for one ethernet controller: ISA-style configuration, the
 * driver's entry points, and the generic network-interface state.
 */
typedef struct Ether Ether;
struct Ether {
	ISAConf isac;	/* port, irq, mem, size and option strings */

	int ctlrno;	/* controller number, as in "#l%d" and "ether%d" */
	int tbdf; /* type+busno+devno+funcno */
	int minmtu;	/* writes shorter than this are rejected (Etoosmall) */
	int maxmtu;	/* writes longer than this are rejected (Etoobig) */
	uchar ea[Eaddrlen];	/* this interface's ethernet address */

	void (*attach)(Ether*); /* filled in by reset routine */
	void (*detach)(Ether*);
	void (*transmit)(Ether*);	/* called, if set, after a block is queued on oq */
	void (*interrupt)(Ureg*, void*);
	long (*ifstat)(Ether*, void*, long, ulong);
	long (*ctl)(Ether*, void*, long); /* custom ctl messages */
	void (*power)(Ether*, int); /* power on/off */
	void (*shutdown)(Ether*); /* shutdown hardware before reboot */
	void *ctlr;	/* presumably driver-private state -- TODO confirm */

	Queue* oq;	/* output queue */

	Netif ni;	/* generic interface state: per-connection files, counters, addresses */
};
/*
 * Index helpers for a ring of l entries, and integer rounding helpers.
 * The arguments may be evaluated more than once, so they must not have
 * side effects.
 */
#define NEXT(x, l) (((uint)(x)+1)%(l))	/* index after x, wrapping at l */
#define PREV(x, l) (((x) == 0) ? (l)-1: (x)-1)	/* index before x, wrapping 0 to l-1 */
#define HOWMANY(x, y) (((x)+((y)-1))/(y))	/* x/y rounded up, for non-negative x and positive y */
#define ROUNDUP(x, y) (HOWMANY((x), (y))*(y))	/* x rounded up to a multiple of y */
/*
 * Little-endian loads through a byte pointer p: GSHORT combines p[0]
 * and p[1], GLONG combines p[0] through p[3].  p is evaluated more than
 * once, so it must not have side effects.  The argument is parenthesized
 * everywhere so that an expression such as (cond ? a : b) can be passed
 * to GLONG safely.
 * NOTE(review): the result has type int when p points at bytes; a value
 * with bit 31 set relies on the compiler's behaviour for signed shifts.
 */
#define GSHORT(p)	(((p)[1]<<8)|(p)[0])
#define GLONG(p)	((GSHORT((p)+2)<<16)|GSHORT(p))
/*
 * True when x, converted to ulong, is below 0x80000000.
 * The argument is parenthesized so the cast applies to the whole
 * expression rather than to its first operand.
 */
#define BADPTR(x)	((ulong)(x) < 0x80000000)
+974c + QUNLOCK(c); +. +831c + QUNLOCK(p); +. +820,826c + QUNLOCK(p); +. +793c + QLOCK(p); +. +765c + QUNLOCK(p); +. +760c + QUNLOCK(p); +. +748c + QLOCK(p); +. +582c + QUNLOCK(cv); +. +561c + QUNLOCK(cv); +. +558c + QLOCK(cv); +. +516c +ipremove(Chan* _) +. +510c +ipcreate(Chan* _, char* __, int ___, ulong ____) +. +494c + QUNLOCK(cv); +. +487c + QLOCK(cv); +. +470c + QUNLOCK(cv); +. +468c + QLOCK(cv); +. +447,448c + QUNLOCK(cv); + QUNLOCK(p); +. +431,432c + QUNLOCK(cv); + QUNLOCK(p); +. +429c + QLOCK(cv); +. +427c + QLOCK(p); +. +415c + QUNLOCK(p); +. +411c + QUNLOCK(p); +. +409c + QLOCK(p); +. +174c +ipgen(Chan *c, char* __ch, Dirtab* __dt, int __i, int s, Dir *dp) +. +50c +#define QID(p, c, y) ( ((uint)(p)<<(Shiftproto)) | ((uint)(c)<<Shiftconv) | (y) ) +. +6,7c +#include "error.h" +#include "ip/ip.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/esp.c ip/esp.c +1106a + + +#ifdef notdef +enum { + RC4forward= 10*1024*1024, /* maximum skip forward */ + RC4back = 100*1024, /* maximum look back */ +}; + +typedef struct Esprc4 Esprc4; +struct Esprc4 +{ + ulong cseq; /* current byte sequence number */ + RC4state current; + + int ovalid; /* old is valid */ + ulong lgseq; /* last good sequence */ + ulong oseq; /* old byte sequence number */ + RC4state old; +}; + +static void rc4espinit(Espcb *ecb, char *name, uchar *k, int n); + +static int +rc4cipher(Espcb *ecb, uchar *p, int n) +{ + Esprc4 *esprc4; + RC4state tmpstate; + ulong seq; + long d, dd; + + if(n < 4) + return 0; + + esprc4 = ecb->espstate; + if(ecb->incoming) { + seq = nhgetl(p); + p += 4; + n -= 4; + d = seq-esprc4->cseq; + if(d == 0) { + rc4(&esprc4->current, p, n); + esprc4->cseq += n; + if(esprc4->ovalid) { + dd = esprc4->cseq - esprc4->lgseq; + if(dd > RC4back) + esprc4->ovalid = 0; + } + } else if(d > 0) { +print("esp rc4cipher: missing packet: %uld %ld\n", seq, d); /* this link is hosed */ + if(d > RC4forward) { + strcpy(up->errstr, "rc4cipher: skipped too much"); + return 0; + } + esprc4->lgseq = seq; 
+ if(!esprc4->ovalid) { + esprc4->ovalid = 1; + esprc4->oseq = esprc4->cseq; + memmove(&esprc4->old, &esprc4->current, + sizeof(RC4state)); + } + rc4skip(&esprc4->current, d); + rc4(&esprc4->current, p, n); + esprc4->cseq = seq+n; + } else { +print("esp rc4cipher: reordered packet: %uld %ld\n", seq, d); + dd = seq - esprc4->oseq; + if(!esprc4->ovalid || -d > RC4back || dd < 0) { + strcpy(up->errstr, "rc4cipher: too far back"); + return 0; + } + memmove(&tmpstate, &esprc4->old, sizeof(RC4state)); + rc4skip(&tmpstate, dd); + rc4(&tmpstate, p, n); + return 1; + } + + /* move old state up */ + if(esprc4->ovalid) { + dd = esprc4->cseq - RC4back - esprc4->oseq; + if(dd > 0) { + rc4skip(&esprc4->old, dd); + esprc4->oseq += dd; + } + } + } else { + hnputl(p, esprc4->cseq); + p += 4; + n -= 4; + rc4(&esprc4->current, p, n); + esprc4->cseq += n; + } + return 1; +} + +static void +rc4espinit(Espcb *ecb, char *name, uchar *k, int n) +{ + Esprc4 *esprc4; + + /* bits to bytes */ + n = (n+7)>>3; + esprc4 = smalloc(sizeof(Esprc4)); + memset(esprc4, 0, sizeof(Esprc4)); + setupRC4state(&esprc4->current, k, n); + ecb->espalg = name; + ecb->espblklen = 4; + ecb->espivlen = 4; + ecb->cipher = rc4cipher; + ecb->espstate = esprc4; +} +#endif +. +1056,1081d +1048,1050c + ecb->espblklen = 8; + ecb->espivlen = 8; +. +1045c + for(i=0; i<8; i++) +. +1040,1042c + /* bits to bytes */ + n = (n+7)>>3; + if(n > 8) + n = 8; +. +1037c + uchar key[8], ivec[8]; +. +1035c +desespinit(Espcb *ecb, char *name, uchar *k, int n) +. +1019,1033d +1013,1014c + memmove(p, ds->ivec, 8); + for(p += 8; p < ep; p += 8){ + pp = p; + ip = ds->ivec; + for(eip = ip+8; ip < eip; ) + *pp++ ^= *ip++; + block_cipher(ds->expanded, p, 0); + memmove(ds->ivec, p, 8); + } +. +1010,1011c + memmove(ds->ivec, p, 8); + p += 8; + while(p < ep){ + memmove(tmp, p, 8); + block_cipher(ds->expanded, p, 1); + tp = tmp; + ip = ds->ivec; + for(eip = ip+8; ip < eip; ){ + *p++ ^= *ip; + *ip++ = *tp++; + } + } +. +1008a + ep = p + n; +. 
+1006a + uchar tmp[8]; + uchar *pp, *tp, *ip, *eip, *ep; +. +999,1003d +993c + ecb->ahlen = 12; +. +990c + klen >>= 3; /* convert to bytes */ + +. +986c +md5ahinit(Espcb *ecb, char *name, uchar *key, int klen) +. +979c + seanq_hmac_md5(hash, t, tlen, (uchar*)ecb->ahstate, 16); +. +968c + digest = md5(opad, 64, nil, nil); +. +966c + digest = md5(ipad, 64, nil, nil); +. +959,962c + for(i=0; i<64; i++){ + ipad[i] = 0x36; + opad[i] = 0x5c; + } + ipad[64] = opad[64] = 0; + for(i=0; i<klen; i++){ +. +957a + uchar innerhash[MD5dlen]; +. +956d +954a + uchar ipad[65], opad[65]; +. +796,952c +void +. +790c + ecb->ahlen = 12; +. +786c + klen >>= 8; /* convert to bytes */ +. +782c +shaahinit(Espcb *ecb, char *name, uchar *key, int klen) +. +775c + seanq_hmac_sha1(hash, t, tlen, (uchar*)ecb->ahstate, 16); +. +772a + int r; +. +771d +764c + digest = sha1(opad, 64, nil, nil); +. +762c + digest = sha1(ipad, 64, nil, nil); +. +755,758c + for(i=0; i<64; i++){ + ipad[i] = 0x36; + opad[i] = 0x5c; + } + ipad[64] = opad[64] = 0; + for(i=0; i<klen; i++){ +. +753a + uchar innerhash[SHA1dlen]; +. +752d +750a + uchar ipad[65], opad[65]; +. +743,748c +void +. +735c +nullahinit(Espcb *ecb, char *name, uchar* _, int __) +. +729c +nullauth(Espcb* _, uchar* __, int ___, uchar* ____) +. +720c +nullespinit(Espcb *ecb, char *name, uchar* _, int __) +. +714c +nullcipher(Espcb* _, uchar* __, int ___) +. +708,712d +647c + QUNLOCK(c); +. +642c + QLOCK(c); +. +632c + QUNLOCK(c); +. +627c + QLOCK(c); +. +606c + QUNLOCK(esp); +. +600,601c + spi = nhgets(h->espspi); + QLOCK(esp); + c = convlookup(esp, spi); +. +597,598c + h = (Esp4hdr*)(bp->rp); +. +595c + ulong spi; +. +593a + Esp4hdr *h; +. +590d +568c + QUNLOCK(c); +. +565c + qpass(c->rq, bp); +. +560,561c + netlog(f, Logesp, "esp: qfull %I -> %I.%uld\n", raddr, + laddr, spi); +. +557,558d +547c + bp->rp += hdrlen + ecb->espivlen; +. +539,541c + QUNLOCK(c); + netlog(f, Logesp, "esp: short packet after decrypt %I -> %I!%d\n", + raddr, laddr, spi); +. 
+535c + et = (Esptail*)(bp->rp + hdrlen + payload); +. +523,529c + if(!ecb->cipher(ecb, bp->rp + hdrlen, payload)) { + QUNLOCK(c); +print("esp: cipher failed %I -> %I!%ld: %s\n", raddr, laddr, spi, up->errstr); + netlog(f, Logesp, "esp: cipher failed %I -> %I!%d: %s\n", raddr, + laddr, spi, up->errstr); +. +517,519c + QUNLOCK(c); + netlog(f, Logesp, "esp: bad length %I -> %I!%d payload=%d BLEN=%d\n", + raddr, laddr, spi, payload, BLEN(bp)); +. +515c + payload = BLEN(bp) - hdrlen - ecb->ahlen; +. +507,510c + QUNLOCK(c); +print("esp: bad auth %I -> %I!%ld\n", raddr, laddr, spi); + netlog(f, Logesp, "esp: bad auth %I -> %I!%d\n", raddr, + laddr, spi); +. +502,505c + espspi = version == V4? ((Esp4hdr*)bp->rp)->espspi: + ((Esp6hdr*)bp->rp)->espspi; +. +493,496c + if(BLEN(bp) < hdrlen + ecb->espivlen + Esptaillen + ecb->ahlen) { + QUNLOCK(c); + netlog(f, Logesp, "esp: short block %I -> %I!%d\n", raddr, + laddr, spi); +. +485,486c + QLOCK(c); + QUNLOCK(esp); +. +477,479c + QUNLOCK(esp); + netlog(f, Logesp, "esp: no conv %I -> %I!%d\n", raddr, + laddr, spi); +. +475c + c = convlookup(esp, spi); +. +473c + if (version == V4) { + eh4 = (Esp4hdr*)bp->rp; + spi = nhgetl(eh4->espspi); + v4tov6(raddr, eh4->espsrc); + v4tov6(laddr, eh4->espdst); + } else { + eh6 = (Esp6hdr*)bp->rp; + spi = nhgetl(eh6->espspi); + ipmove(raddr, eh6->src); + ipmove(laddr, eh6->dst); + } + + QLOCK(esp); +. +471d +464,466c + bp = pullupblock(bp, hdrlen + Esptaillen); +. +462a + if (bp == nil || BLEN(bp) == 0) { + /* get enough to identify the IP version */ + bp = pullupblock(bp, IP4HDR); + if(bp == nil) { + netlog(f, Logesp, "esp: short packet\n"); + return; + } + } + eh4 = (Esp4hdr*)bp->rp; + version = ((eh4->vihl & 0xf0) == IP_VER4? V4: V6); + hdrlen = version == V4? Esp4hdrlen: Esp6hdrlen; +. +459,460c + uchar *auth, *espspi; + ulong spi; + int payload, nexthdr, version, hdrlen; +. +457c + uchar raddr[IPaddrlen], laddr[IPaddrlen]; +. 
+453,454c + Esp4hdr *eh4; + Esp6hdr *eh6; + Esptail *et; + Userhdr *uh; +. +451c +espiput(Proto *esp, Ipifc* _, Block *bp) +. +446,449d +440c + if (version == V4) +. +438c + QUNLOCK(c); +. +434,435c + ecb->auth(ecb, bp->rp + iphdrlen, (hdrlen - iphdrlen) + +. +429,431d +425a + hnputl(eh6->espspi, ecb->spi); + hnputl(eh6->espseq, ++ecb->seq); +. +424d +420,422d +414a + hnputl(eh4->espspi, ecb->spi); + hnputl(eh4->espseq, ++ecb->seq); +. +411,413c + /* fill in head */ + if (version == V4) { +. +407,409c + ecb->cipher(ecb, bp->rp + hdrlen, payload + pad + Esptaillen); + auth = bp->rp + hdrlen + payload + pad + Esptaillen; +. +401c + eh4 = (Esp4hdr *)bp->rp; + eh6 = (Esp6hdr *)bp->rp; + et = (Esptail*)(bp->rp + hdrlen + payload + pad); +. +383,384c + bp = padblock(bp, hdrlen + ecb->espivlen); +. +370c + QUNLOCK(c); +. +363c + QLOCK(c); +. +358c + version = ipvers(c); + iphdrlen = version == V4? IP4HDR: IP6HDR; + hdrlen = version == V4? Esp4hdrlen: Esp6hdrlen; + +. +356c + Espcb *ecb; + Block *bp; + int nexthdr, payload, pad, align, version, hdrlen, iphdrlen; + uchar *auth; +. +353d +347,349d +299,344d +284,297d +274c +ipvers(Conv *c) +. +221c + QUNLOCK(c->p); +. +215c + QLOCK(c->p); +. +207,210c + parseip(c->raddr, argv[1]); +. +192c + char *p, *pp; + char *e = nil; +. +182,186c + "null", 0, nullahinit, + "hmac_sha1_96", 128, shaahinit, /* rfc2404 */ +// "aes_xcbc_mac_96", 128, aesahinit, /* rfc3566 */ + "hmac_md5_96", 128, md5ahinit, /* rfc2403 */ + nil, 0, nil, +. +170,177c + "null", 0, nullespinit, +// "des3_cbc", 192, des3espinit, /* rfc2451 */ +// "aes_128_cbc", 128, aescbcespinit, /* rfc3602 */ +// "aes_ctr", 128, aesctrespinit, /* rfc3686 */ + "des_56_cbc", 64, desespinit, /* rfc2405, deprecated */ +// "rc4_128", 128, rc4espinit, /* gone in rfc4305 */ + nil, 0, nil, +. 
+163,166c +static void nullahinit(Espcb*, char*, uchar *key, int keylen); +static void shaahinit(Espcb*, char*, uchar *key, int keylen); +static void md5ahinit(Espcb*, char*, uchar *key, int keylen); +. +157,161c +static void nullespinit(Espcb*, char*, uchar *key, int keylen); +static void desespinit(Espcb *ecb, char *name, uchar *k, int n); +. +150c + void (*init)(Espcb*, char* name, uchar *key, int keylen); +. +143d +137d +131d +127c + int header; /* user user level header */ +. +96,107d +86,87c + /* Ip6hdr; */ + uchar vcf[4]; /* version:4, traffic class:8, flow label:20 */ + uchar ploadlen[2]; /* payload length: packet length - 40 */ + uchar proto; /* next header type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; + + /* Esphdr; */ + uchar espspi[4]; /* Security parameter index */ + uchar espseq[4]; /* Sequence number */ +. +80c + /* Esphdr; */ + uchar espspi[4]; /* Security parameter index */ + uchar espseq[4]; /* Sequence number */ +. +58,64c + * tunnel-mode layout: IP | ESP | TCP/UDP | user data. + * transport-mode layout is: ESP | IP | TCP/UDP | user data. +. +54d +42,47d +32,35c +enum +{ +. +30a +typedef struct Esppriv Esppriv; +typedef struct Espcb Espcb; +typedef struct Algorithm Algorithm; +. +26,28d +20,23c +typedef struct Esphdr Esphdr; +. +14c +#include "error.h" +. +10c +#include "lib.h" +. +6,7c + * TODO: update to match rfc4303. +. +3,4d +diff -e ip.orig/ethermedium.c ip/ethermedium.c +536c + if((sflag = ipv6anylocal(ifc, ipsrc)) != 0) +. +429c +etherremmulti(Ipifc *ifc, uchar *a, uchar *_) +. +407c +etheraddmulti(Ipifc *ifc, uchar *a, uchar *_) +. +401c + RUNLOCK(ifc); +. +392c + RUNLOCK(ifc); +. +387c + if(!CANRLOCK(ifc)){ +. +362c + RUNLOCK(ifc); +. +353c + RUNLOCK(ifc); +. +348c + if(!CANRLOCK(ifc)){ +. +269c + * called by ipoput with a single block to write with ifc RLOCK'd +. +123a + +. +8c +#include "netif.h" +. +6c +#include "error.h" +. +2c +#include "lib.h" +. 
+diff -e ip.orig/gre.c ip/gre.c +968c + gre->ptclsize = 0; +. +919,948d +894,916c + return "unknown control request"; +. +885,892d +881,883c + else if(strcmp(f[0], "cooked") == 0){ + gpriv->raw = 0; + return nil; +. +696,879c + gpriv = c->p->priv; + if(n == 1){ + if(strcmp(f[0], "raw") == 0){ + gpriv->raw = 1; + return nil; +. +694c + GREpriv *gpriv; +. +691,692c +char* +grectl(Conv *c, char **f, int n) +. +681,688c + return snprint(buf, len, "gre: len %lud\n", gpriv->lenerr); +. +675,679d +659,660c + if(qlen(c->rq) > 64*1024) + freeblist(bp); +. +651d +648d +645c + freeblist(bp); +. +643c + len = nhgets(ghp->len) - GRE_IPONLY; +. +639a + QUNLOCK(gre); + +. +633,636c + if(*p == nil) { + QUNLOCK(gre); + freeblist(bp); +. +590,629c + if(c->rport == eproto && + (gpriv->raw || ipcmp(c->raddr, raddr) == 0)) +. +587d +553,585c + /* Look for a conversation structure for this port and address */ + c = nil; + for(p = gre->conv; *p; p++) { +. +547,551c + v4tov6(raddr, ghp->src); + eproto = nhgets(ghp->eproto); + QLOCK(gre); +. +536,545c + gpriv = gre->priv; + ghp = (GREhdr*)(bp->rp); +. +534d +531,532c + ushort eproto; + uchar raddr[IPaddrlen]; +. +336,529c + int len; + GREhdr *ghp; +. +334c +greiput(Proto *gre, Ipifc* __, Block *bp) +. +328,329d +325,326c + ghp->proto = IP_GREPROTO; + ghp->frag[0] = 0; + ghp->frag[1] = 0; +. +322c + hnputs(ghp->eproto, c->rport); +. +318,320c + findlocalip(c->p->f, c->laddr, raddr); /* pick interface closest to dest */ + memmove(ghp->src, c->laddr + IPv4off, IPv4addrlen); +. +314,315c + memmove(ghp->dst, c->raddr + IPv4off, IPv4addrlen); + v4tov6(laddr, ghp->src); +. +311,312c + if(!((GREpriv*)c->p->priv)->raw){ + v4tov6(raddr, ghp->dst); +. +308,309c + ghp = (GREhdr *)(bp->rp); + ghp->vihl = IP_VER4; +. +295,297d +287,289c + Conv *c = x; + GREhdr *ghp; +. +283a +int drop; + +. +281c + c->lport = 0; + c->rport = 0; +. +247,278c + qclose(c->rq); + qclose(c->wq); + qclose(c->eq); +. +241c + return "pktifc does not support announce"; +. 
+239c +greannounce(Conv* _, char** __, int ___) +. +218,235c + USED(c); + return snprint(state, n, "%s\n", "Datagram"); +. +211c + c->rq = qopen(64*1024, Qmsg, 0, c); +. +199c + QUNLOCK(p); +. +184c + QLOCK(p); +. +138,171c +static char* +. +136d +71,134d +68c + ulong csumerr; /* checksum errors */ + ulong lenerr; /* short packet */ +. +66c +struct GREpriv +{ + int raw; /* Raw GRE mode */ + +. +63c +} GREhdr; +. +54c + uchar Unused; +. +46,47c +typedef struct GREhdr +{ +. +21,43d +13c +enum +{ +. +9c +#include "error.h" +. +5c +#include "lib.h" +. +diff -e ip.orig/icmp.c ip/icmp.c +350c + if(iplen > n || ((uint)iplen % 1)){ +. +339,341c + netlog(icmp->f, Logicmp, "icmpiput %d %d\n", p->type, p->code); +. +324c +icmpiput(Proto *icmp, Ipifc* __, Block *bp) +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/icmp6.c ip/icmp6.c +781c + bp->rp -= sizeof(IPICMP); +. +770c + bp->rp += sizeof(IPICMP); +. +762c + bp->rp -= sizeof(IPICMP); +. +750c + bp->rp += sizeof(IPICMP); +. +711c + RUNLOCK(ifc); +. +707c + RUNLOCK(ifc); +. +700c + RUNLOCK(ifc); +. +698c + RLOCK(ifc); +. +666c + sz = sizeof(IPICMP) + 8; +. +661c + if(pktsz - sizeof(Ip6hdr) < 8) { +. +649c + sz = sizeof(IPICMP) + 8; +. +641c + if(pktsz - sizeof(Ip6hdr) < 16) { +. +575c + if(iplen > n - IP6HDR || ((uint)iplen % 1) != 0) { +. +568c + if(n < sizeof(IPICMP)) { +. +546c + memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP)); +. +537c + netlog(f, Logicmp, "icmppkttoobig6 fail -> s%I d%I\n", +. +534c + netlog(f, Logicmp, "send icmppkttoobig6 -> s%I d%I\n", +. +518c + int sz = MIN(sizeof(IPICMP) + osz, v6MINTU); +. +506c + memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP)); +. +498c + netlog(f, Logicmp, "icmpttlexceeded6 fail -> s%I d%I\n", +. +495c + netlog(f, Logicmp, "send icmpttlexceeded6 -> s%I d%I\n", +. +479c + int sz = MIN(sizeof(IPICMP) + osz, v6MINTU); +. +471c + RUNLOCK(ifc); +. +457c + memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP)); +. 
+445c + netlog(f, Logicmp, "icmphostunr fail -> s%I d%I\n", +. +442c + netlog(f, Logicmp, "send icmphostunr -> s%I d%I\n", +. +440c + RLOCK(ifc); +. +425c + int sz = MIN(sizeof(IPICMP) + osz, v6MINTU); +. +397c + nbp = newIPICMP(sizeof(Ndpkt)); +. +375c + nbp->wp -= sizeof(Ndpkt) - sizeof(NdiscC); +. +354c + nbp = newIPICMP(sizeof(Ndpkt)); +. +260c + if(blocklen(bp) < sizeof(IPICMP)){ +. +257c + bp = padblock(bp, sizeof(Ip6hdr)); +. +122c + QLock qlock; +. +109,110d +106d +101a + +. +99,100c + /* ICMPpkt; */ + uchar type; + uchar code; + uchar cksum[2]; + uchar icmpid[2]; + uchar seq[2]; + +. +97c +struct Ndpkt +{ + /* NdiscC; */ + /* IPICMP; */ + /* Ip6hdr; */ + uchar vcf[4]; /* version:4, traffic class:8, flow label:20 */ + uchar ploadlen[2]; /* payload length: packet length - 40 */ + uchar proto; /* next header type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; +. +94d +91,92c + /* ICMPpkt; */ + uchar type; + uchar code; + uchar cksum[2]; + uchar icmpid[2]; + uchar seq[2]; + +. +89c +struct NdiscC +{ + /* IPICMP; */ + /* Ip6hdr; */ + uchar vcf[4]; /* version:4, traffic class:8, flow label:20 */ + uchar ploadlen[2]; /* payload length: packet length - 40 */ + uchar proto; /* next header type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; +. +85,86c + /* Ip6hdr; */ + uchar vcf[4]; /* version:4, traffic class:8, flow label:20 */ + uchar ploadlen[2]; /* payload length: packet length - 40 */ + uchar proto; /* next header type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; + + /* ICMPpkt; */ + uchar type; + uchar code; + uchar cksum[2]; + uchar icmpid[2]; + uchar seq[2]; +. +75,82c +struct ICMPpkt { + uchar type; + uchar code; + uchar cksum[2]; + uchar icmpid[2]; + uchar seq[2]; +}; +. +70c +typedef struct ICMPpkt ICMPpkt; +. +9c +#include "error.h" +. +5c +#include "lib.h" +. +diff -e ip.orig/igmp.c ip/igmp.c +217c + mp = Mediacopymulti(m); +. 
+177c +igmpiput(Media *m, Ipifc *, Block *bp) +. +123c + byte ip[IPaddrlen]; +. +97,99c + bp->wp += sizeof(IGMPpkt); + memset(bp->rp, 0, sizeof(IGMPpkt)); + hnputl(p->src, Mediagetaddr(m)); +. +87c +igmpsendreport(Media *m, byte *addr) +. +68c + Lock lk; + +. +60c + Media *m; +. +51,52d +43,48c + byte vertype; /* version and type */ + byte unused; + byte igmpcksum[2]; /* checksum of igmp portion */ + byte group[IPaddrlen]; /* multicast group */ +. +31,40c + byte vihl; /* Version and header length */ + byte tos; /* Type of service */ + byte len[2]; /* packet length (including headers) */ + byte id[2]; /* Identification */ + byte frag[2]; /* Fragment information */ + byte Unused; + byte proto; /* Protocol */ + byte cksum[2]; /* checksum of ip portion */ + byte src[IPaddrlen]; /* Ip source */ + byte dst[IPaddrlen]; /* Ip destination */ +. +27a +typedef char byte; + +. +10c +#include "error.h" +. +6c +#include "lib.h" +. +1,4d +diff -e ip.orig/inferno.c ip/inferno.c +28a + +Medium tripmedium = +{ + "trip", +}; +. +25c +bootpread(char* _, ulong __, int ___) +. +23a +char* +bootp(Ipifc* _) +{ + return "unimplmented"; +} + +. +17a +Chan* +commonfdtochan(int fd, int mode, int a, int b) +{ + return fdtochan(fd, mode, a, b); +} + +. +6c +#include "error.h" +#include "ip.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/ip.c ip/ip.c +522,524c + if(bp->base+sizeof(Ipfrag) >= bp->rp){ + bp = padblock(bp, sizeof(Ipfrag)); + bp->rp += sizeof(Ipfrag); +. +466,467c + for(i = 0; i < Nstats; i++) + p = seprint(p, e, "%s: %lud\n", statnames[i], ip->stats[i]); +. +383c + freeb(bp); +. +381a + Conv conv; + +. +322d +320d +301c + RUNLOCK(ifc); +. +213c + RUNLOCK(ifc); +. +211d +196,199c + medialen = ifc->maxtu - ifc->m->hsize; +. +189c + RUNLOCK(ifc); +. +186c + if(!CANRLOCK(ifc)) +. 
+11a +/* MIB II counters */ +enum +{ + Forwarding, + DefaultTTL, + InReceives, + InHdrErrors, + InAddrErrors, + ForwDatagrams, + InUnknownProtos, + InDiscards, + InDelivers, + OutRequests, + OutDiscards, + OutNoRoutes, + ReasmTimeout, + ReasmReqds, + ReasmOKs, + ReasmFails, + FragOKs, + FragFails, + FragCreates, + + Nstats, +}; + +struct Fragment4 +{ + Block* blist; + Fragment4* next; + ulong src; + ulong dst; + ushort id; + ulong age; +}; + +struct Fragment6 +{ + Block* blist; + Fragment6* next; + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; + uint id; + ulong age; +}; + +struct Ipfrag +{ + ushort foff; + ushort flen; +}; + +/* an instance of IP */ +struct IP +{ + ulong stats[Nstats]; + + QLock fraglock4; + Fragment4* flisthead4; + Fragment4* fragfree4; + Ref id4; + + QLock fraglock6; + Fragment6* flisthead6; + Fragment6* fragfree6; + Ref id6; + + int iprouting; /* true if we route like a gateway */ +}; + +. +9a +typedef struct Fragment4 Fragment4; +typedef struct Fragment6 Fragment6; +typedef struct Ipfrag Ipfrag; + +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/ip.h ip/ip.h +732a +Chan* commonfdtochan(int, int, int, int); +. +727a +extern char* bootp(Ipifc*); +. +676a +extern Medium tripmedium; +. +669c +#define NOW msec() +. +578c +/* RouteTree; */ + Route* right; + Route* left; + Route* mid; + uchar depth; + uchar type; + uchar ifcid; /* must match ifc->id */ + Ipifc *ifc; + char tag[4]; + int ref; +. +516,517d +491a + Logilmsg= 1<<8, +. +488a + Logil= 1<<4, +. +423c + RWlock rwlock; + + Conv *conv; /* link to its conversation structure */ +. +386c + QLock qlock; + +. +374c + Lock lk; + +. +312c + RWlock rwlock; +. +173c + QLock qlock; +. +153a +typedef struct Ip4hdr Ip4hdr; +. +79,152d +41c + Maxincall= 5, +. +30,35d +8,9d +2,3d +diff -e ip.orig/ipaux.c ip/ipaux.c +366c + UNLOCK(ht); +. +363c + UNLOCK(ht); +. +352c + UNLOCK(ht); +. +340c + UNLOCK(ht); +. +328c + UNLOCK(ht); +. +316c + UNLOCK(ht); +. +309c + LOCK(ht); +. 
+290c + UNLOCK(ht); +. +282c + LOCK(ht); +. +272c + UNLOCK(ht); +. +269c + LOCK(ht); +. +241c + return (ulong)(sa[IPaddrlen-1]<<24 ^ sp<< 16 ^ da[IPaddrlen-1]<<8 ^ dp) % Nhash; +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/ipifc.c ip/ipifc.c +1575c + RUNLOCK(nifc); +. +1565c + RUNLOCK(nifc); +. +1562c + RLOCK(nifc); +. +1555c + RUNLOCK(nifc); +. +1541c + RUNLOCK(nifc); +. +1538c + RLOCK(nifc); +. +1518d +1511d +1498c + WUNLOCK(ifc); +. +1494c + WLOCK(ifc); +. +1491c + WUNLOCK(ifc); +. +1455c + WUNLOCK(ifc); +. +1451c + WLOCK(ifc); +. +1448c + WUNLOCK(ifc); +. +1301c + QUNLOCK(f->ipifc); +. +1265,1266c + if((atypel > atype && atype < atyper) || + (atypel < atype && atype > atyper)){ +. +1232,1234c + QLOCK(f->ipifc); +. +1154c + (isv6mcast(addr) && (addr[1] & 0xF) <= Link_local_scop)) +. +1054c + QUNLOCK(f->self); +. +1040c + QLOCK(f->self); +. +1021c + QUNLOCK(f->self); +. +951c + QLOCK(f->self); +. +888c + QUNLOCK(f->self); +. +839c + QLOCK(f->self); +. +689c + WUNLOCK(ifc); +. +683c + WLOCK(ifc); +. +680c + WUNLOCK(ifc); +. +619c + WUNLOCK(ifc); +. +604c + WLOCK(ifc); +. +539c + * always called with ifc WLOCK'd +. +531c + WUNLOCK(ifc); +. +417c + WLOCK(ifc); +. +319c + c->sq = qopen(2*QMAX, 0, 0, 0); +. +306c + RUNLOCK(ifc); +. +299c + RUNLOCK(ifc); +. +294c + if(!CANRLOCK(ifc)){ +. +266c + RUNLOCK(ifc); +. +259c + RLOCK(ifc); +. +244c + RUNLOCK(ifc); +. +238c + RLOCK(ifc); +. +212c + WUNLOCK(ifc); +. +181c + WLOCK(ifc); +. +178c + WUNLOCK(ifc); +. +162c + WUNLOCK(ifc); +. +124c + WUNLOCK(ifc); +. +120c + WUNLOCK(ifc); +. +118c + WLOCK(ifc); +. +58c +#define hashipa(a) ( (ulong)(((a)[IPaddrlen-2]<<8) | (a)[IPaddrlen-1])%NHASH ) +. +39c + QLock qlock; +. +18c + QMAX = 64*1024-1, +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/ipmux.c ip/ipmux.c +811c + RUNLOCK(f); +. +809c + RLOCK(f); +. +742c + RUNLOCK(f); +. +680c + RLOCK(f); +. 
+631,633c + WLOCK(f); + i = (Ipmux *)c->p->priv; + ipmuxremove(&i, r->chain); + WUNLOCK(f); +. +617a + Ipmux *i; +. +610c +ipmuxannounce(Conv* _, char** __, int ___) +. +583c + WUNLOCK(f); +. +581c + WLOCK(f); +. +9c +#include "error.h" +. +5c +#include "lib.h" +. +diff -e ip.orig/iproute.c ip/iproute.c +469c + while((p = f->queue) != nil) { +. +425c + while((p = f->queue) != nil) { +. +359c + while((p = f->queue) != nil) { +. +313c + while((p = f->queue) != nil) { +. +213,214c + dl = 0; if((l = p->left) != nil) dl = l->depth; + dr = 0; if((r = p->right) != nil) dr = r->depth; +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/ipv6.c ip/ipv6.c +506,508c + if(bp->base+sizeof(Ipfrag) >= bp->rp){ + bp = padblock(bp, sizeof(Ipfrag)); + bp->rp += sizeof(Ipfrag); +. +218c + RUNLOCK(ifc); +. +122c + RUNLOCK(ifc); +. +110c + RUNLOCK(ifc); +. +106c + if(!CANRLOCK(ifc)) +. +29a +/* MIB II counters */ +enum +{ + Forwarding, + DefaultTTL, + InReceives, + InHdrErrors, + InAddrErrors, + ForwDatagrams, + InUnknownProtos, + InDiscards, + InDelivers, + OutRequests, + OutDiscards, + OutNoRoutes, + ReasmTimeout, + ReasmReqds, + ReasmOKs, + ReasmFails, + FragOKs, + FragFails, + FragCreates, + + Nstats, +}; + +static char *statnames[] = +{ +[Forwarding] "Forwarding", +[DefaultTTL] "DefaultTTL", +[InReceives] "InReceives", +[InHdrErrors] "InHdrErrors", +[InAddrErrors] "InAddrErrors", +[ForwDatagrams] "ForwDatagrams", +[InUnknownProtos] "InUnknownProtos", +[InDiscards] "InDiscards", +[InDelivers] "InDelivers", +[OutRequests] "OutRequests", +[OutDiscards] "OutDiscards", +[OutNoRoutes] "OutNoRoutes", +[ReasmTimeout] "ReasmTimeout", +[ReasmReqds] "ReasmReqds", +[ReasmOKs] "ReasmOKs", +[ReasmFails] "ReasmFails", +[FragOKs] "FragOKs", +[FragFails] "FragFails", +[FragCreates] "FragCreates", +}; + +struct Fragment4 +{ + Block* blist; + Fragment4* next; + ulong src; + ulong dst; + ushort id; + ulong age; +}; + +struct Fragment6 +{ + Block* blist; + Fragment6* next; + uchar 
src[IPaddrlen]; + uchar dst[IPaddrlen]; + uint id; + ulong age; +}; + +struct Ipfrag +{ + ushort foff; + ushort flen; +}; + +/* an instance of IP */ +struct IP +{ + ulong stats[Nstats]; + + QLock fraglock4; + Fragment4* flisthead4; + Fragment4* fragfree4; + Ref id4; + + QLock fraglock6; + Fragment6* flisthead6; + Fragment6* fragfree6; + Ref id6; + + int iprouting; /* true if we route like a gateway */ +}; + +. +22a +typedef struct Fragment4 Fragment4; +typedef struct Fragment6 Fragment6; +typedef struct Ipfrag Ipfrag; + +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/ipv6.h ip/ipv6.h +145c +struct Routinghdr { +. +134c +struct Opthdr { +. +130,131c + uchar vcf[4]; /* version:4, traffic class:8, flow label:20 */ + uchar ploadlen[2]; /* payload length: packet length - 40 */ + uchar proto; /* next header type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; +. +120,128d +81c + IP6HDR = 20, /* sizeof(Ip6hdr) */ +. +26a +#undef ESP + +. +diff -e ip.orig/loopbackmedium.c ip/loopbackmedium.c +99c + RUNLOCK(ifc); +. +92c + RUNLOCK(ifc); +. +87c + if(!CANRLOCK(ifc)){ +. +58c +loopbackbwrite(Ipifc *ifc, Block *bp, int _, uchar* __) +. +26c +loopbackbind(Ipifc *ifc, int _, char** __) +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/netdevmedium.c ip/netdevmedium.c +144c + RUNLOCK(ifc); +. +136c + RUNLOCK(ifc); +. +131c + if(!CANRLOCK(ifc)){ +. +85c +netdevbwrite(Ipifc *ifc, Block *bp, int _, uchar* __) +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/netlog.c ip/netlog.c +260c + wakeup(&f->alog->rendez); +. +258c + UNLOCK(f->alog); +. +242c + LOCK(f->alog); +. +228c + char buf[128], *t, *fp; +. +185c + set = 1; +. +160c + QUNLOCK(f->alog); +. +157c + sleep(&f->alog->rendez, netlogready, f); +. +155c + UNLOCK(f->alog); +. +146c + UNLOCK(f->alog); +. +134c + LOCK(f->alog); +. +129c + QUNLOCK(f->alog); +. +127c + QLOCK(f->alog); +. 
+122c +netlogread(Fs *f, void *a, ulong _, long n) +. +109c + UNLOCK(f->alog); +. +101c + UNLOCK(f->alog); +. +99c + LOCK(f->alog); +. +92c + UNLOCK(f->alog); +. +82c + UNLOCK(f->alog); +. +80c + LOCK(f->alog); +. +28,29c + QLock qlock; + Rendez rendez; +. +17c + Lock lk; +. +6,7c +#include "error.h" +#include "ip/ip.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/nullmedium.c ip/nullmedium.c +22c +nullbwrite(Ipifc* _, Block* __, int ___, uchar* ____) +. +17c +nullunbind(Ipifc* _) +. +11c +nullbind(Ipifc* _, int __, char** ___) +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/pktmedium.c ip/pktmedium.c +51c +pktbwrite(Ipifc *ifc, Block *bp, int _, uchar* __) +. +43c +pktunbind(Ipifc* _) +. +36d +34c +pktbind(Ipifc* _, int argc, char **argv) +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/ptclbsum.c ip/ptclbsum.c +68c + while((hisum = losum>>16)) +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/rudp.c ip/rudp.c +693c + rudp->nc = 16; +. +11c +#include "error.h" +. +7c +#include "lib.h" +. +diff -e ip.orig/tcp.c ip/tcp.c +3171c + QUNLOCK(c); +. +3154c + if(!CANQLOCK(c)) +. +3127c + p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]); +. +3101c +/* called with c QLOCKed */ +. +3085c + QUNLOCK(tcp); +. +3080c + QUNLOCK(s); +. +3073,3074c + QLOCK(s); + QUNLOCK(tcp); +. +3064c + QLOCK(tcp); +. +2871,2873d +2869c + if(seg->mss != 0 && seg->mss < tcb->mss) +. +2859d +2842c + QUNLOCK(s); +. +2830c + netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW); +. +2817c + QLOCK(s); +. +2814c + QUNLOCK(s); +. +2768c +tcpsetchecksum(Conv *s, char **f, int _) +. +2737c + QUNLOCK(s); +. +2728c + QLOCK(s); +. +2725c + QUNLOCK(s); +. +2641c + QLOCK(s); +. +2638,2639c + if((uint)(msgs%4) == 1){ + QUNLOCK(s); +. +2563c + netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n", +. +2421c + QUNLOCK(s); +. +2417c + QUNLOCK(s); +. +2351c + QUNLOCK(s); +. 
+2189c + QUNLOCK(s); +. +2172,2174d +2144c + QUNLOCK(s); +. +2095,2096c + QLOCK(s); + QUNLOCK(tcp); +. +2092c + QUNLOCK(s); +. +2072c + QUNLOCK(tcp); +. +2064c + QUNLOCK(tcp); +. +2053c + QUNLOCK(tcp); +. +2050,2051c + netlog(f, Logtcp, "iphtlook failed\n"); +. +2045c + QLOCK(tcp); +. +1942c +tcpiput(Proto *tcp, Ipifc* _, Block *bp) +. +1862c + netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind); +. +1817c + netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n", +. +1685,1686d +1683c + if(lp->mss != 0 && lp->mss < tcb->mss) +. +1626c + netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d\n", +. +1562c + QUNLOCK(tcp); +. +1529c + if(!CANQLOCK(tcp)) +. +1421,1422d +1334c + * called with s QLOCKed +. +1245,1246d +1231,1232d +1210,1211d +1208c + if(optlen == MSS_LENGTH) +. +995d +873c + * called with s QLOCKed +. +861,862d +805d +609c + QUNLOCK(s); +. +603c + QLOCK(s); +. +600c + QUNLOCK(s); +. +583,584d +569c + QUNLOCK(s); +. +551c + QLOCK(s); +. +548c + QUNLOCK(s); +. +352c + ulong stats[Nstats]; +. +317d +293d +231c + ulong window; /* Recevive window */ +. +229c + ushort mss; /* Mean segment size */ +. +193c + * the QLOCK in the Conv locks this structure +. +49,50c + DEF_MSS = 1460, /* Default mean segment */ + DEF_MSS6 = 1280, /* Default mean segment (min) for v6 */ +. +44c + MSS_LENGTH = 4, /* Mean segment size */ +. +6c +#include "error.h" +. +2c +#include "lib.h" +. +diff -e ip.orig/udp.c ip/udp.c +590,591c + return snprint(buf, len, "InDatagrams: %lud\nNoPorts: %lud\nInErrors: %lud\nOutDatagrams: %lud\n", +. +580c + QUNLOCK(udp); +. +575c + QUNLOCK(s); +. +571,572c + QLOCK(s); + QUNLOCK(udp); +. +562c + QLOCK(udp); +. +510c + QUNLOCK(c); +. +502c + QUNLOCK(c); +. +475c + QUNLOCK(c); +. +456,457c + QLOCK(c); + QUNLOCK(udp); +. +447c + QUNLOCK(udp); +. +410c + QUNLOCK(udp); +. +404c + QLOCK(udp); +. +197c + netlog(c->p->f, Logudp, "udp: kick\n"); +. +103c + QLock qlock; +. 
+78c + ulong udpOutDatagrams; +. +75c + ulong udpInDatagrams; +. +6c +#include "error.h" +. +2c +#include "lib.h" +. diff --git a/src/9vx/a/ip/arp.c b/src/9vx/a/ip/arp.c @@ -0,0 +1,684 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" +#include "ipv6.h" + +/* + * address resolution tables + */ +enum +{ + NHASH = (1<<6), + NCACHE = 256, + + AOK = 1, + AWAIT = 2, +}; + +char *arpstate[] = +{ + "UNUSED", + "OK", + "WAIT", +}; + +/* + * one per Fs + */ +struct Arp +{ + QLock qlock; + Fs *f; + Arpent *hash[NHASH]; + Arpent cache[NCACHE]; + Arpent *rxmt; + Proc *rxmitp; /* neib sol re-transmit proc */ + Rendez rxmtq; + Block *dropf, *dropl; +}; + +char *Ebadarp = "bad arp"; + +#define haship(s) ((ulong)((s)[IPaddrlen-1])%NHASH) + +int ReTransTimer = RETRANS_TIMER; + +static void rxmitproc(void *v); + +void +arpinit(Fs *f) +{ + f->arp = smalloc(sizeof(Arp)); + f->arp->f = f; + f->arp->rxmt = nil; + f->arp->dropf = f->arp->dropl = nil; + kproc("rxmitproc", rxmitproc, f->arp); +} + +/* + * create a new arp entry for an ip address. 
+ */ +static Arpent* +newarp6(Arp *arp, uchar *ip, Ipifc *ifc, int addrxt) +{ + uint t; + Block *next, *xp; + Arpent *a, *e, *f, **l; + Medium *m = ifc->m; + int empty; + + /* find oldest entry */ + e = &arp->cache[NCACHE]; + a = arp->cache; + t = a->utime; + for(f = a; f < e; f++){ + if(f->utime < t){ + t = f->utime; + a = f; + } + } + + /* dump waiting packets */ + xp = a->hold; + a->hold = nil; + + if(isv4(a->ip)){ + while(xp){ + next = xp->list; + freeblist(xp); + xp = next; + } + } + else { /* queue icmp unreachable for rxmitproc later on, w/o arp lock */ + if(xp){ + if(arp->dropl == nil) + arp->dropf = xp; + else + arp->dropl->list = xp; + + for(next = xp->list; next; next = next->list) + xp = next; + arp->dropl = xp; + wakeup(&arp->rxmtq); + } + } + + /* take out of current chain */ + l = &arp->hash[haship(a->ip)]; + for(f = *l; f; f = f->hash){ + if(f == a){ + *l = a->hash; + break; + } + l = &f->hash; + } + + /* insert into new chain */ + l = &arp->hash[haship(ip)]; + a->hash = *l; + *l = a; + + memmove(a->ip, ip, sizeof(a->ip)); + a->utime = NOW; + a->ctime = 0; + a->type = m; + + a->rtime = NOW + ReTransTimer; + a->rxtsrem = MAX_MULTICAST_SOLICIT; + a->ifc = ifc; + a->ifcid = ifc->ifcid; + + /* put to the end of re-transmit chain; addrxt is 0 when isv4(a->ip) */ + if(!ipismulticast(a->ip) && addrxt){ + l = &arp->rxmt; + empty = (*l==nil); + + for(f = *l; f; f = f->nextrxt){ + if(f == a){ + *l = a->nextrxt; + break; + } + l = &f->nextrxt; + } + for(f = *l; f; f = f->nextrxt){ + l = &f->nextrxt; + } + *l = a; + if(empty) + wakeup(&arp->rxmtq); + } + + a->nextrxt = nil; + + return a; +} + +/* called with arp qlocked */ + +void +cleanarpent(Arp *arp, Arpent *a) +{ + Arpent *f, **l; + + a->utime = 0; + a->ctime = 0; + a->type = 0; + a->state = 0; + + /* take out of current chain */ + l = &arp->hash[haship(a->ip)]; + for(f = *l; f; f = f->hash){ + if(f == a){ + *l = a->hash; + break; + } + l = &f->hash; + } + + /* take out of re-transmit chain */ + l = 
&arp->rxmt; + for(f = *l; f; f = f->nextrxt){ + if(f == a){ + *l = a->nextrxt; + break; + } + l = &f->nextrxt; + } + a->nextrxt = nil; + a->hash = nil; + a->hold = nil; + a->last = nil; + a->ifc = nil; +} + +/* + * fill in the media address if we have it. Otherwise return an + * Arpent that represents the state of the address resolution FSM + * for ip. Add the packet to be sent onto the list of packets + * waiting for ip->mac to be resolved. + */ +Arpent* +arpget(Arp *arp, Block *bp, int version, Ipifc *ifc, uchar *ip, uchar *mac) +{ + int hash; + Arpent *a; + Medium *type = ifc->m; + uchar v6ip[IPaddrlen]; + + if(version == V4){ + v4tov6(v6ip, ip); + ip = v6ip; + } + + QLOCK(arp); + hash = haship(ip); + for(a = arp->hash[hash]; a; a = a->hash){ + if(memcmp(ip, a->ip, sizeof(a->ip)) == 0) + if(type == a->type) + break; + } + + if(a == nil){ + a = newarp6(arp, ip, ifc, (version != V4)); + a->state = AWAIT; + } + a->utime = NOW; + if(a->state == AWAIT){ + if(bp != nil){ + if(a->hold) + a->last->list = bp; + else + a->hold = bp; + a->last = bp; + bp->list = nil; + } + return a; /* return with arp qlocked */ + } + + memmove(mac, a->mac, a->type->maclen); + + /* remove old entries */ + if(NOW - a->ctime > 15*60*1000) + cleanarpent(arp, a); + + QUNLOCK(arp); + return nil; +} + +/* + * called with arp locked + */ +void +arprelease(Arp *arp, Arpent* ae) +{ + QUNLOCK(arp); +} + +/* + * Copy out the mac address from the Arpent. Return the + * block waiting to get sent to this mac address. 
+ * + * called with arp locked + */ +Block* +arpresolve(Arp *arp, Arpent *a, Medium *type, uchar *mac) +{ + Block *bp; + Arpent *f, **l; + + if(!isv4(a->ip)){ + l = &arp->rxmt; + for(f = *l; f; f = f->nextrxt){ + if(f == a){ + *l = a->nextrxt; + break; + } + l = &f->nextrxt; + } + } + + memmove(a->mac, mac, type->maclen); + a->type = type; + a->state = AOK; + a->utime = NOW; + bp = a->hold; + a->hold = nil; + QUNLOCK(arp); + + return bp; +} + +void +arpenter(Fs *fs, int version, uchar *ip, uchar *mac, int n, int refresh) +{ + Arp *arp; + Route *r; + Arpent *a, *f, **l; + Ipifc *ifc; + Medium *type; + Block *bp, *next; + uchar v6ip[IPaddrlen]; + + arp = fs->arp; + + if(n != 6){ +// print("arp: len = %d\n", n); + return; + } + + switch(version){ + case V4: + r = v4lookup(fs, ip, nil); + v4tov6(v6ip, ip); + ip = v6ip; + break; + case V6: + r = v6lookup(fs, ip, nil); + break; + default: + panic("arpenter: version %d", version); + return; /* to supress warnings */ + } + + if(r == nil){ +// print("arp: no route for entry\n"); + return; + } + + ifc = r->ifc; + type = ifc->m; + + QLOCK(arp); + for(a = arp->hash[haship(ip)]; a; a = a->hash){ + if(a->type != type || (a->state != AWAIT && a->state != AOK)) + continue; + + if(ipcmp(a->ip, ip) == 0){ + a->state = AOK; + memmove(a->mac, mac, type->maclen); + + if(version == V6){ + /* take out of re-transmit chain */ + l = &arp->rxmt; + for(f = *l; f; f = f->nextrxt){ + if(f == a){ + *l = a->nextrxt; + break; + } + l = &f->nextrxt; + } + } + + a->ifc = ifc; + a->ifcid = ifc->ifcid; + bp = a->hold; + a->hold = nil; + if(version == V4) + ip += IPv4off; + a->utime = NOW; + a->ctime = a->utime; + QUNLOCK(arp); + + while(bp){ + next = bp->list; + if(ifc != nil){ + if(waserror()){ + RUNLOCK(ifc); + nexterror(); + } + RLOCK(ifc); + if(ifc->m != nil) + ifc->m->bwrite(ifc, bp, version, ip); + else + freeb(bp); + RUNLOCK(ifc); + poperror(); + } else + freeb(bp); + bp = next; + } + return; + } + } + + if(refresh == 0){ + a = newarp6(arp, 
ip, ifc, 0); + a->state = AOK; + a->type = type; + a->ctime = NOW; + memmove(a->mac, mac, type->maclen); + } + + QUNLOCK(arp); +} + +int +arpwrite(Fs *fs, char *s, int len) +{ + int n; + Route *r; + Arp *arp; + Block *bp; + Arpent *a, *fl, **l; + Medium *m; + char *f[4], buf[256]; + uchar ip[IPaddrlen], mac[MAClen]; + + arp = fs->arp; + + if(len == 0) + error(Ebadarp); + if(len >= sizeof(buf)) + len = sizeof(buf)-1; + strncpy(buf, s, len); + buf[len] = 0; + if(len > 0 && buf[len-1] == '\n') + buf[len-1] = 0; + + n = getfields(buf, f, 4, 1, " "); + if(strcmp(f[0], "flush") == 0){ + QLOCK(arp); + for(a = arp->cache; a < &arp->cache[NCACHE]; a++){ + memset(a->ip, 0, sizeof(a->ip)); + memset(a->mac, 0, sizeof(a->mac)); + a->hash = nil; + a->state = 0; + a->utime = 0; + while(a->hold != nil){ + bp = a->hold->list; + freeblist(a->hold); + a->hold = bp; + } + } + memset(arp->hash, 0, sizeof(arp->hash)); + /* clear all pkts on these lists (rxmt, dropf/l) */ + arp->rxmt = nil; + arp->dropf = nil; + arp->dropl = nil; + QUNLOCK(arp); + } else if(strcmp(f[0], "add") == 0){ + switch(n){ + default: + error(Ebadarg); + case 3: + if (parseip(ip, f[1]) == -1) + error(Ebadip); + if(isv4(ip)) + r = v4lookup(fs, ip+IPv4off, nil); + else + r = v6lookup(fs, ip, nil); + if(r == nil) + error("Destination unreachable"); + m = r->ifc->m; + n = parsemac(mac, f[2], m->maclen); + break; + case 4: + m = ipfindmedium(f[1]); + if(m == nil) + error(Ebadarp); + if (parseip(ip, f[2]) == -1) + error(Ebadip); + n = parsemac(mac, f[3], m->maclen); + break; + } + + if(m->ares == nil) + error(Ebadarp); + + m->ares(fs, V6, ip, mac, n, 0); + } else if(strcmp(f[0], "del") == 0){ + if(n != 2) + error(Ebadarg); + + if (parseip(ip, f[1]) == -1) + error(Ebadip); + QLOCK(arp); + + l = &arp->hash[haship(ip)]; + for(a = *l; a; a = a->hash){ + if(memcmp(ip, a->ip, sizeof(a->ip)) == 0){ + *l = a->hash; + break; + } + l = &a->hash; + } + + if(a){ + /* take out of re-transmit chain */ + l = &arp->rxmt; + for(fl = *l; 
fl; fl = fl->nextrxt){ + if(fl == a){ + *l = a->nextrxt; + break; + } + l = &fl->nextrxt; + } + + a->nextrxt = nil; + a->hash = nil; + a->hold = nil; + a->last = nil; + a->ifc = nil; + memset(a->ip, 0, sizeof(a->ip)); + memset(a->mac, 0, sizeof(a->mac)); + } + QUNLOCK(arp); + } else + error(Ebadarp); + + return len; +} + +enum +{ + Alinelen= 90, +}; + +char *aformat = "%-6.6s %-8.8s %-40.40I %-32.32s\n"; + +static void +convmac(char *p, uchar *mac, int n) +{ + while(n-- > 0) + p += sprint(p, "%2.2ux", *mac++); +} + +int +arpread(Arp *arp, char *p, ulong offset, int len) +{ + Arpent *a; + int n; + char mac[2*MAClen+1]; + + if(offset % Alinelen) + return 0; + + offset = offset/Alinelen; + len = len/Alinelen; + + n = 0; + for(a = arp->cache; len > 0 && a < &arp->cache[NCACHE]; a++){ + if(a->state == 0) + continue; + if(offset > 0){ + offset--; + continue; + } + len--; + QLOCK(arp); + convmac(mac, a->mac, a->type->maclen); + n += sprint(p+n, aformat, a->type->name, arpstate[a->state], a->ip, mac); + QUNLOCK(arp); + } + + return n; +} + +extern int +rxmitsols(Arp *arp) +{ + uint sflag; + Block *next, *xp; + Arpent *a, *b, **l; + Fs *f; + uchar ipsrc[IPaddrlen]; + Ipifc *ifc = nil; + long nrxt; + + QLOCK(arp); + f = arp->f; + + a = arp->rxmt; + if(a==nil){ + nrxt = 0; + goto dodrops; /* return nrxt; */ + } + nrxt = a->rtime - NOW; + if(nrxt > 3*ReTransTimer/4) + goto dodrops; /* return nrxt; */ + + for(; a; a = a->nextrxt){ + ifc = a->ifc; + assert(ifc != nil); + if((a->rxtsrem <= 0) || !(CANRLOCK(ifc)) || (a->ifcid != ifc->ifcid)){ + xp = a->hold; + a->hold = nil; + + if(xp){ + if(arp->dropl == nil) + arp->dropf = xp; + else + arp->dropl->list = xp; + } + + cleanarpent(arp, a); + } + else + break; + } + if(a == nil) + goto dodrops; + + + QUNLOCK(arp); /* for icmpns */ + if((sflag = ipv6anylocal(ifc, ipsrc)) != SRC_UNSPEC) + icmpns(f, ipsrc, sflag, a->ip, TARG_MULTI, ifc->mac); + + RUNLOCK(ifc); + QLOCK(arp); + + /* put to the end of re-transmit chain */ + l = 
&arp->rxmt; + for(b = *l; b; b = b->nextrxt){ + if(b == a){ + *l = a->nextrxt; + break; + } + l = &b->nextrxt; + } + for(b = *l; b; b = b->nextrxt){ + l = &b->nextrxt; + } + *l = a; + a->rxtsrem--; + a->nextrxt = nil; + a->rtime = NOW + ReTransTimer; + + a = arp->rxmt; + if(a==nil) + nrxt = 0; + else + nrxt = a->rtime - NOW; + +dodrops: + xp = arp->dropf; + arp->dropf = nil; + arp->dropl = nil; + QUNLOCK(arp); + + for(; xp; xp = next){ + next = xp->list; + icmphostunr(f, ifc, xp, Icmp6_adr_unreach, 1); + } + + return nrxt; + +} + +static int +rxready(void *v) +{ + Arp *arp = (Arp *) v; + int x; + + x = ((arp->rxmt != nil) || (arp->dropf != nil)); + + return x; +} + +static void +rxmitproc(void *v) +{ + Arp *arp = v; + long wakeupat; + + arp->rxmitp = up; + //print("arp rxmitproc started\n"); + if(waserror()){ + arp->rxmitp = 0; + pexit("hangup", 1); + } + for(;;){ + wakeupat = rxmitsols(arp); + if(wakeupat == 0) + sleep(&arp->rxmtq, rxready, v); + else if(wakeupat > ReTransTimer/4) + tsleep(&arp->rxmtq, return0, 0, wakeupat); + } +} + diff --git a/src/9vx/a/ip/chandial.c b/src/9vx/a/ip/chandial.c @@ -0,0 +1,124 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include "ip/ip.h" + +typedef struct DS DS; +static Chan* call(char*, char*, DS*); +static void _dial_string_parse(char*, DS*); + +enum +{ + Maxstring= 128, +}; + +struct DS +{ + char buf[Maxstring]; /* dist string */ + char *netdir; + char *proto; + char *rem; + char *local; /* other args */ + char *dir; + Chan **ctlp; +}; + +/* + * the dialstring is of the form '[/net/]proto!dest' + */ +Chan* +chandial(char *dest, char *local, char *dir, Chan **ctlp) +{ + DS ds; + char clone[Maxpath]; + + ds.local = local; + ds.dir = dir; + ds.ctlp = ctlp; + + _dial_string_parse(dest, &ds); + if(ds.netdir == 0) + ds.netdir = "/net"; + + /* no connection server, don't translate */ + snprint(clone, sizeof(clone), "%s/%s/clone", ds.netdir, ds.proto); + return 
call(clone, ds.rem, &ds); +} + +static Chan* +call(char *clone, char *dest, DS *ds) +{ + int n; + Chan *dchan, *cchan; + char name[Maxpath], data[Maxpath], *p; + + cchan = namec(clone, Aopen, ORDWR, 0); + + /* get directory name */ + if(waserror()){ + cclose(cchan); + nexterror(); + } + n = devtab[cchan->type]->read(cchan, name, sizeof(name)-1, 0); + name[n] = 0; + for(p = name; *p == ' '; p++) + ; + sprint(name, "%lud", strtoul(p, 0, 0)); + p = strrchr(clone, '/'); + *p = 0; + if(ds->dir) + snprint(ds->dir, Maxpath, "%s/%s", clone, name); + snprint(data, sizeof(data), "%s/%s/data", clone, name); + + /* connect */ + if(ds->local) + snprint(name, sizeof(name), "connect %s %s", dest, ds->local); + else + snprint(name, sizeof(name), "connect %s", dest); + devtab[cchan->type]->write(cchan, name, strlen(name), 0); + + /* open data connection */ + dchan = namec(data, Aopen, ORDWR, 0); + if(ds->ctlp) + *ds->ctlp = cchan; + else + cclose(cchan); + poperror(); + return dchan; + +} + +/* + * parse a dial string + */ +static void +_dial_string_parse(char *str, DS *ds) +{ + char *p, *p2; + + strncpy(ds->buf, str, Maxstring); + ds->buf[Maxstring-1] = 0; + + p = strchr(ds->buf, '!'); + if(p == 0) { + ds->netdir = 0; + ds->proto = "net"; + ds->rem = ds->buf; + } else { + if(*ds->buf != '/' && *ds->buf != '#'){ + ds->netdir = 0; + ds->proto = ds->buf; + } else { + for(p2 = p; *p2 != '/'; p2--) + ; + *p2++ = 0; + ds->netdir = ds->buf; + ds->proto = p2; + } + *p = 0; + ds->rem = p + 1; + } +} diff --git a/src/9vx/a/ip/devip.c b/src/9vx/a/ip/devip.c @@ -0,0 +1,1439 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include "ip/ip.h" + +enum +{ + Qtopdir= 1, /* top level directory */ + Qtopbase, + Qarp= Qtopbase, + Qbootp, + Qndb, + Qiproute, + Qipselftab, + Qlog, + + Qprotodir, /* directory for a protocol */ + Qprotobase, + Qclone= Qprotobase, + Qstats, + + Qconvdir, /* directory for a conversation */ + Qconvbase, + Qctl= 
Qconvbase, + Qdata, + Qerr, + Qlisten, + Qlocal, + Qremote, + Qstatus, + Qsnoop, + + Logtype= 5, + Masktype= (1<<Logtype)-1, + Logconv= 12, + Maskconv= (1<<Logconv)-1, + Shiftconv= Logtype, + Logproto= 8, + Maskproto= (1<<Logproto)-1, + Shiftproto= Logtype + Logconv, + + Nfs= 128, +}; +#define TYPE(x) ( ((ulong)(x).path) & Masktype ) +#define CONV(x) ( (((ulong)(x).path) >> Shiftconv) & Maskconv ) +#define PROTO(x) ( (((ulong)(x).path) >> Shiftproto) & Maskproto ) +#define QID(p, c, y) ( ((uint)(p)<<(Shiftproto)) | ((uint)(c)<<Shiftconv) | (y) ) + +static char network[] = "network"; + +QLock fslock; +Fs *ipfs[Nfs]; /* attached fs's */ +Queue *qlog; + +extern void nullmediumlink(void); +extern void pktmediumlink(void); + long ndbwrite(Fs *f, char *a, ulong off, int n); + +static int +ip3gen(Chan *c, int i, Dir *dp) +{ + Qid q; + Conv *cv; + char *p; + + cv = ipfs[c->dev]->p[PROTO(c->qid)]->conv[CONV(c->qid)]; + if(cv->owner == nil) + kstrdup(&cv->owner, eve); + mkqid(&q, QID(PROTO(c->qid), CONV(c->qid), i), 0, QTFILE); + + switch(i) { + default: + return -1; + case Qctl: + devdir(c, q, "ctl", 0, cv->owner, cv->perm, dp); + return 1; + case Qdata: + devdir(c, q, "data", qlen(cv->rq), cv->owner, cv->perm, dp); + return 1; + case Qerr: + devdir(c, q, "err", qlen(cv->eq), cv->owner, cv->perm, dp); + return 1; + case Qlisten: + devdir(c, q, "listen", 0, cv->owner, cv->perm, dp); + return 1; + case Qlocal: + p = "local"; + break; + case Qremote: + p = "remote"; + break; + case Qsnoop: + if(strcmp(cv->p->name, "ipifc") != 0) + return -1; + devdir(c, q, "snoop", qlen(cv->sq), cv->owner, 0400, dp); + return 1; + case Qstatus: + p = "status"; + break; + } + devdir(c, q, p, 0, cv->owner, 0444, dp); + return 1; +} + +static int +ip2gen(Chan *c, int i, Dir *dp) +{ + Qid q; + + switch(i) { + case Qclone: + mkqid(&q, QID(PROTO(c->qid), 0, Qclone), 0, QTFILE); + devdir(c, q, "clone", 0, network, 0666, dp); + return 1; + case Qstats: + mkqid(&q, QID(PROTO(c->qid), 0, Qstats), 0, 
QTFILE); + devdir(c, q, "stats", 0, network, 0444, dp); + return 1; + } + return -1; +} + +static int +ip1gen(Chan *c, int i, Dir *dp) +{ + Qid q; + char *p; + int prot; + int len = 0; + Fs *f; + extern ulong kerndate; + + f = ipfs[c->dev]; + + prot = 0666; + mkqid(&q, QID(0, 0, i), 0, QTFILE); + switch(i) { + default: + return -1; + case Qarp: + p = "arp"; + prot = 0664; + break; + case Qbootp: + p = "bootp"; + break; + case Qndb: + p = "ndb"; + len = strlen(f->ndb); + q.vers = f->ndbvers; + break; + case Qiproute: + p = "iproute"; + prot = 0664; + break; + case Qipselftab: + p = "ipselftab"; + prot = 0444; + break; + case Qlog: + p = "log"; + break; + } + devdir(c, q, p, len, network, prot, dp); + if(i == Qndb && f->ndbmtime > kerndate) + dp->mtime = f->ndbmtime; + return 1; +} + +static int +ipgen(Chan *c, char* __ch, Dirtab* __dt, int __i, int s, Dir *dp) +{ + Qid q; + Conv *cv; + Fs *f; + + f = ipfs[c->dev]; + + switch(TYPE(c->qid)) { + case Qtopdir: + if(s == DEVDOTDOT){ + mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR); + sprint(up->genbuf, "#I%lud", c->dev); + devdir(c, q, up->genbuf, 0, network, 0555, dp); + return 1; + } + if(s < f->np) { + if(f->p[s]->connect == nil) + return 0; /* protocol with no user interface */ + mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR); + devdir(c, q, f->p[s]->name, 0, network, 0555, dp); + return 1; + } + s -= f->np; + return ip1gen(c, s+Qtopbase, dp); + case Qarp: + case Qbootp: + case Qndb: + case Qlog: + case Qiproute: + case Qipselftab: + return ip1gen(c, TYPE(c->qid), dp); + case Qprotodir: + if(s == DEVDOTDOT){ + mkqid(&q, QID(0, 0, Qtopdir), 0, QTDIR); + sprint(up->genbuf, "#I%lud", c->dev); + devdir(c, q, up->genbuf, 0, network, 0555, dp); + return 1; + } + if(s < f->p[PROTO(c->qid)]->ac) { + cv = f->p[PROTO(c->qid)]->conv[s]; + sprint(up->genbuf, "%d", s); + mkqid(&q, QID(PROTO(c->qid), s, Qconvdir), 0, QTDIR); + devdir(c, q, up->genbuf, 0, cv->owner, 0555, dp); + return 1; + } + s -= f->p[PROTO(c->qid)]->ac; + return ip2gen(c, 
s+Qprotobase, dp); + case Qclone: + case Qstats: + return ip2gen(c, TYPE(c->qid), dp); + case Qconvdir: + if(s == DEVDOTDOT){ + s = PROTO(c->qid); + mkqid(&q, QID(s, 0, Qprotodir), 0, QTDIR); + devdir(c, q, f->p[s]->name, 0, network, 0555, dp); + return 1; + } + return ip3gen(c, s+Qconvbase, dp); + case Qctl: + case Qdata: + case Qerr: + case Qlisten: + case Qlocal: + case Qremote: + case Qstatus: + case Qsnoop: + return ip3gen(c, TYPE(c->qid), dp); + } + return -1; +} + +static void +ipreset(void) +{ + nullmediumlink(); + pktmediumlink(); + + fmtinstall('i', eipfmt); + fmtinstall('I', eipfmt); + fmtinstall('E', eipfmt); + fmtinstall('V', eipfmt); + fmtinstall('M', eipfmt); +} + +static Fs* +ipgetfs(int dev) +{ + extern void (*ipprotoinit[])(Fs*); + Fs *f; + int i; + + if(dev >= Nfs) + return nil; + + qlock(&fslock); + if(ipfs[dev] == nil){ + f = smalloc(sizeof(Fs)); + ip_init(f); + arpinit(f); + netloginit(f); + for(i = 0; ipprotoinit[i]; i++) + ipprotoinit[i](f); + f->dev = dev; + ipfs[dev] = f; + } + qunlock(&fslock); + + return ipfs[dev]; +} + +IPaux* +newipaux(char *owner, char *tag) +{ + IPaux *a; + int n; + + a = smalloc(sizeof(*a)); + kstrdup(&a->owner, owner); + memset(a->tag, ' ', sizeof(a->tag)); + n = strlen(tag); + if(n > sizeof(a->tag)) + n = sizeof(a->tag); + memmove(a->tag, tag, n); + return a; +} + +#define ATTACHER(c) (((IPaux*)((c)->aux))->owner) + +static Chan* +ipattach(char* spec) +{ + Chan *c; + int dev; + + dev = atoi(spec); + if(dev >= Nfs) + error("bad specification"); + + ipgetfs(dev); + c = devattach('I', spec); + mkqid(&c->qid, QID(0, 0, Qtopdir), 0, QTDIR); + c->dev = dev; + + c->aux = newipaux(commonuser(), "none"); + + return c; +} + +static Walkqid* +ipwalk(Chan* c, Chan *nc, char **name, int nname) +{ + IPaux *a = c->aux; + Walkqid* w; + + w = devwalk(c, nc, name, nname, nil, 0, ipgen); + if(w != nil && w->clone != nil) + w->clone->aux = newipaux(a->owner, a->tag); + return w; +} + + +static int +ipstat(Chan* c, uchar* db, int n) 
+{ + return devstat(c, db, n, nil, 0, ipgen); +} + +static int +incoming(void* arg) +{ + Conv *conv; + + conv = arg; + return conv->incall != nil; +} + +static int m2p[] = { + [OREAD] 4, + [OWRITE] 2, + [ORDWR] 6 +}; + +static Chan* +ipopen(Chan* c, int omode) +{ + Conv *cv, *nc; + Proto *p; + int perm; + Fs *f; + + perm = m2p[omode&3]; + + f = ipfs[c->dev]; + + switch(TYPE(c->qid)) { + default: + break; + case Qndb: + if(omode & (OWRITE|OTRUNC) && !iseve()) + error(Eperm); + if((omode & (OWRITE|OTRUNC)) == (OWRITE|OTRUNC)) + f->ndb[0] = 0; + break; + case Qlog: + netlogopen(f); + break; + case Qiproute: + case Qarp: + if(omode != OREAD && !iseve()) + error(Eperm); + break; + case Qtopdir: + case Qprotodir: + case Qconvdir: + case Qstatus: + case Qremote: + case Qlocal: + case Qstats: + case Qbootp: + case Qipselftab: + if(omode != OREAD) + error(Eperm); + break; + case Qsnoop: + if(omode != OREAD) + error(Eperm); + p = f->p[PROTO(c->qid)]; + cv = p->conv[CONV(c->qid)]; + if(strcmp(ATTACHER(c), cv->owner) != 0 && !iseve()) + error(Eperm); + incref(&cv->snoopers); + break; + case Qclone: + p = f->p[PROTO(c->qid)]; + QLOCK(p); + if(waserror()){ + QUNLOCK(p); + nexterror(); + } + cv = Fsprotoclone(p, ATTACHER(c)); + QUNLOCK(p); + poperror(); + if(cv == nil) { + error(Enodev); + break; + } + mkqid(&c->qid, QID(p->x, cv->x, Qctl), 0, QTFILE); + break; + case Qdata: + case Qctl: + case Qerr: + p = f->p[PROTO(c->qid)]; + QLOCK(p); + cv = p->conv[CONV(c->qid)]; + QLOCK(cv); + if(waserror()) { + QUNLOCK(cv); + QUNLOCK(p); + nexterror(); + } + if((perm & (cv->perm>>6)) != perm) { + if(strcmp(ATTACHER(c), cv->owner) != 0) + error(Eperm); + if((perm & cv->perm) != perm) + error(Eperm); + + } + cv->inuse++; + if(cv->inuse == 1){ + kstrdup(&cv->owner, ATTACHER(c)); + cv->perm = 0660; + } + QUNLOCK(cv); + QUNLOCK(p); + poperror(); + break; + case Qlisten: + cv = f->p[PROTO(c->qid)]->conv[CONV(c->qid)]; + if((perm & (cv->perm>>6)) != perm) { + if(strcmp(ATTACHER(c), cv->owner) != 
0) + error(Eperm); + if((perm & cv->perm) != perm) + error(Eperm); + + } + + if(cv->state != Announced) + error("not announced"); + + if(waserror()){ + closeconv(cv); + nexterror(); + } + QLOCK(cv); + cv->inuse++; + QUNLOCK(cv); + + nc = nil; + while(nc == nil) { + /* give up if we got a hangup */ + if(qisclosed(cv->rq)) + error("listen hungup"); + + qlock(&cv->listenq); + if(waserror()) { + qunlock(&cv->listenq); + nexterror(); + } + + /* wait for a connect */ + sleep(&cv->listenr, incoming, cv); + + QLOCK(cv); + nc = cv->incall; + if(nc != nil){ + cv->incall = nc->next; + mkqid(&c->qid, QID(PROTO(c->qid), nc->x, Qctl), 0, QTFILE); + kstrdup(&cv->owner, ATTACHER(c)); + } + QUNLOCK(cv); + + qunlock(&cv->listenq); + poperror(); + } + closeconv(cv); + poperror(); + break; + } + c->mode = openmode(omode); + c->flag |= COPEN; + c->offset = 0; + return c; +} + +static void +ipcreate(Chan* _, char* __, int ___, ulong ____) +{ + error(Eperm); +} + +static void +ipremove(Chan* _) +{ + error(Eperm); +} + +static int +ipwstat(Chan *c, uchar *dp, int n) +{ + Dir d; + Conv *cv; + Fs *f; + Proto *p; + + f = ipfs[c->dev]; + switch(TYPE(c->qid)) { + default: + error(Eperm); + break; + case Qctl: + case Qdata: + break; + } + + n = convM2D(dp, n, &d, nil); + if(n > 0){ + p = f->p[PROTO(c->qid)]; + cv = p->conv[CONV(c->qid)]; + if(!iseve() && strcmp(ATTACHER(c), cv->owner) != 0) + error(Eperm); + if(d.uid[0]) + kstrdup(&cv->owner, d.uid); + cv->perm = d.mode & 0777; + } + return n; +} + +void +closeconv(Conv *cv) +{ + Conv *nc; + Ipmulti *mp; + + QLOCK(cv); + + if(--cv->inuse > 0) { + QUNLOCK(cv); + return; + } + + /* close all incoming calls since no listen will ever happen */ + for(nc = cv->incall; nc; nc = cv->incall){ + cv->incall = nc->next; + closeconv(nc); + } + cv->incall = nil; + + kstrdup(&cv->owner, network); + cv->perm = 0660; + + while((mp = cv->multi) != nil) + ipifcremmulti(cv, mp->ma, mp->ia); + + cv->r = nil; + cv->rgen = 0; + cv->p->close(cv); + cv->state = Idle; + 
QUNLOCK(cv); +} + +static void +ipclose(Chan* c) +{ + Fs *f; + + f = ipfs[c->dev]; + switch(TYPE(c->qid)) { + default: + break; + case Qlog: + if(c->flag & COPEN) + netlogclose(f); + break; + case Qdata: + case Qctl: + case Qerr: + if(c->flag & COPEN) + closeconv(f->p[PROTO(c->qid)]->conv[CONV(c->qid)]); + break; + case Qsnoop: + if(c->flag & COPEN) + decref(&f->p[PROTO(c->qid)]->conv[CONV(c->qid)]->snoopers); + break; + } + free(((IPaux*)c->aux)->owner); + free(c->aux); +} + +enum +{ + Statelen= 32*1024, +}; + +static long +ipread(Chan *ch, void *a, long n, vlong off) +{ + Conv *c; + Proto *x; + char *buf, *p; + long rv; + Fs *f; + ulong offset = off; + + f = ipfs[ch->dev]; + + p = a; + switch(TYPE(ch->qid)) { + default: + error(Eperm); + case Qtopdir: + case Qprotodir: + case Qconvdir: + return devdirread(ch, a, n, 0, 0, ipgen); + case Qarp: + return arpread(f->arp, a, offset, n); + case Qbootp: + return bootpread(a, offset, n); + case Qndb: + return readstr(offset, a, n, f->ndb); + case Qiproute: + return routeread(f, a, offset, n); + case Qipselftab: + return ipselftabread(f, a, offset, n); + case Qlog: + return netlogread(f, a, offset, n); + case Qctl: + buf = smalloc(16); + sprint(buf, "%lud", CONV(ch->qid)); + rv = readstr(offset, p, n, buf); + free(buf); + return rv; + case Qremote: + buf = smalloc(Statelen); + x = f->p[PROTO(ch->qid)]; + c = x->conv[CONV(ch->qid)]; + if(x->remote == nil) { + sprint(buf, "%I!%d\n", c->raddr, c->rport); + } else { + (*x->remote)(c, buf, Statelen-2); + } + rv = readstr(offset, p, n, buf); + free(buf); + return rv; + case Qlocal: + buf = smalloc(Statelen); + x = f->p[PROTO(ch->qid)]; + c = x->conv[CONV(ch->qid)]; + if(x->local == nil) { + sprint(buf, "%I!%d\n", c->laddr, c->lport); + } else { + (*x->local)(c, buf, Statelen-2); + } + rv = readstr(offset, p, n, buf); + free(buf); + return rv; + case Qstatus: + buf = smalloc(Statelen); + x = f->p[PROTO(ch->qid)]; + c = x->conv[CONV(ch->qid)]; + (*x->state)(c, buf, Statelen-2); + 
rv = readstr(offset, p, n, buf); + free(buf); + return rv; + case Qdata: + c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)]; + return qread(c->rq, a, n); + case Qerr: + c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)]; + return qread(c->eq, a, n); + case Qsnoop: + c = f->p[PROTO(ch->qid)]->conv[CONV(ch->qid)]; + return qread(c->sq, a, n); + case Qstats: + x = f->p[PROTO(ch->qid)]; + if(x->stats == nil) + error("stats not implemented"); + buf = smalloc(Statelen); + (*x->stats)(x, buf, Statelen); + rv = readstr(offset, p, n, buf); + free(buf); + return rv; + } +} + +static Block* +ipbread(Chan* ch, long n, ulong offset) +{ + Conv *c; + Proto *x; + Fs *f; + + switch(TYPE(ch->qid)){ + case Qdata: + f = ipfs[ch->dev]; + x = f->p[PROTO(ch->qid)]; + c = x->conv[CONV(ch->qid)]; + return qbread(c->rq, n); + default: + return devbread(ch, n, offset); + } +} + +/* + * set local address to be that of the ifc closest to remote address + */ +static void +setladdr(Conv* c) +{ + findlocalip(c->p->f, c->laddr, c->raddr); +} + +/* + * set a local port making sure the quad of raddr,rport,laddr,lport is unique + */ +char* +setluniqueport(Conv* c, int lport) +{ + Proto *p; + Conv *xp; + int x; + + p = c->p; + + QLOCK(p); + for(x = 0; x < p->nc; x++){ + xp = p->conv[x]; + if(xp == nil) + break; + if(xp == c) + continue; + if((xp->state == Connected || xp->state == Announced) + && xp->lport == lport + && xp->rport == c->rport + && ipcmp(xp->raddr, c->raddr) == 0 + && ipcmp(xp->laddr, c->laddr) == 0){ + QUNLOCK(p); + return "address in use"; + } + } + c->lport = lport; + QUNLOCK(p); + return nil; +} + +/* + * is lport in use by anyone? 
+ */ +static int +lportinuse(Proto *p, ushort lport) +{ + int x; + + for(x = 0; x < p->nc && p->conv[x]; x++) + if(p->conv[x]->lport == lport) + return 1; + return 0; +} + +/* + * pick a local port and set it + */ +char * +setlport(Conv* c) +{ + Proto *p; + int i, port; + + p = c->p; + QLOCK(p); + if(c->restricted){ + /* Restricted ports cycle between 600 and 1024. */ + for(i=0; i<1024-600; i++){ + if(p->nextrport >= 1024 || p->nextrport < 600) + p->nextrport = 600; + port = p->nextrport++; + if(!lportinuse(p, port)) + goto chosen; + } + }else{ + /* + * Unrestricted ports are chosen randomly + * between 2^15 and 2^16. There are at most + * 4*Nchan = 4096 ports in use at any given time, + * so even in the worst case, a random probe has a + * 1 - 4096/2^15 = 87% chance of success. + * If 64 successive probes fail, there is a bug somewhere + * (or a once in 10^58 event has happened, but that's + * less likely than a venti collision). + */ + for(i=0; i<64; i++){ + port = (1<<15) + nrand(1<<15); + if(!lportinuse(p, port)) + goto chosen; + } + } + QUNLOCK(p); + return "no ports available"; + +chosen: + c->lport = port; + QUNLOCK(p); + return nil; +} + +/* + * set a local address and port from a string of the form + * [address!]port[!r] + */ +char* +setladdrport(Conv* c, char* str, int announcing) +{ + char *p; + char *rv; + ushort lport; + uchar addr[IPaddrlen]; + + /* + * ignore restricted part if it exists. it's + * meaningless on local ports. 
+ */ + p = strchr(str, '!'); + if(p != nil){ + *p++ = 0; + if(strcmp(p, "r") == 0) + p = nil; + } + + c->lport = 0; + if(p == nil){ + if(announcing) + ipmove(c->laddr, IPnoaddr); + else + setladdr(c); + p = str; + } else { + if(strcmp(str, "*") == 0) + ipmove(c->laddr, IPnoaddr); + else { + if(parseip(addr, str) == -1) + return Ebadip; + if(ipforme(c->p->f, addr)) + ipmove(c->laddr, addr); + else + return "not a local IP address"; + } + } + + /* one process can get all connections */ + if(announcing && strcmp(p, "*") == 0){ + if(!iseve()) + error(Eperm); + return setluniqueport(c, 0); + } + + lport = atoi(p); + if(lport <= 0) + rv = setlport(c); + else + rv = setluniqueport(c, lport); + return rv; +} + +static char* +setraddrport(Conv* c, char* str) +{ + char *p; + + p = strchr(str, '!'); + if(p == nil) + return "malformed address"; + *p++ = 0; + if (parseip(c->raddr, str) == -1) + return Ebadip; + c->rport = atoi(p); + p = strchr(p, '!'); + if(p){ + if(strstr(p, "!r") != nil) + c->restricted = 1; + } + return nil; +} + +/* + * called by protocol connect routine to set addresses + */ +char* +Fsstdconnect(Conv *c, char *argv[], int argc) +{ + char *p; + + switch(argc) { + default: + return "bad args to connect"; + case 2: + p = setraddrport(c, argv[1]); + if(p != nil) + return p; + setladdr(c); + p = setlport(c); + if (p != nil) + return p; + break; + case 3: + p = setraddrport(c, argv[1]); + if(p != nil) + return p; + p = setladdrport(c, argv[2], 0); + if(p != nil) + return p; + } + + if( (memcmp(c->raddr, v4prefix, IPv4off) == 0 && + memcmp(c->laddr, v4prefix, IPv4off) == 0) + || ipcmp(c->raddr, IPnoaddr) == 0) + c->ipversion = V4; + else + c->ipversion = V6; + + return nil; +} +/* + * initiate connection and sleep till its set up + */ +static int +connected(void* a) +{ + return ((Conv*)a)->state == Connected; +} +static void +connectctlmsg(Proto *x, Conv *c, Cmdbuf *cb) +{ + char *p; + + if(c->state != 0) + error(Econinuse); + c->state = Connecting; + c->cerr[0] 
= '\0'; + if(x->connect == nil) + error("connect not supported"); + p = x->connect(c, cb->f, cb->nf); + if(p != nil) + error(p); + + QUNLOCK(c); + if(waserror()){ + QLOCK(c); + nexterror(); + } + sleep(&c->cr, connected, c); + QLOCK(c); + poperror(); + + if(c->cerr[0] != '\0') + error(c->cerr); +} + +/* + * called by protocol announce routine to set addresses + */ +char* +Fsstdannounce(Conv* c, char* argv[], int argc) +{ + memset(c->raddr, 0, sizeof(c->raddr)); + c->rport = 0; + switch(argc){ + default: + break; + case 2: + return setladdrport(c, argv[1], 1); + } + return "bad args to announce"; +} + +/* + * initiate announcement and sleep till its set up + */ +static int +announced(void* a) +{ + return ((Conv*)a)->state == Announced; +} +static void +announcectlmsg(Proto *x, Conv *c, Cmdbuf *cb) +{ + char *p; + + if(c->state != 0) + error(Econinuse); + c->state = Announcing; + c->cerr[0] = '\0'; + if(x->announce == nil) + error("announce not supported"); + p = x->announce(c, cb->f, cb->nf); + if(p != nil) + error(p); + + QUNLOCK(c); + if(waserror()){ + QLOCK(c); + nexterror(); + } + sleep(&c->cr, announced, c); + QLOCK(c); + poperror(); + + if(c->cerr[0] != '\0') + error(c->cerr); +} + +/* + * called by protocol bind routine to set addresses + */ +char* +Fsstdbind(Conv* c, char* argv[], int argc) +{ + switch(argc){ + default: + break; + case 2: + return setladdrport(c, argv[1], 0); + } + return "bad args to bind"; +} + +static void +bindctlmsg(Proto *x, Conv *c, Cmdbuf *cb) +{ + char *p; + + if(x->bind == nil) + p = Fsstdbind(c, cb->f, cb->nf); + else + p = x->bind(c, cb->f, cb->nf); + if(p != nil) + error(p); +} + +static void +tosctlmsg(Conv *c, Cmdbuf *cb) +{ + if(cb->nf < 2) + c->tos = 0; + else + c->tos = atoi(cb->f[1]); +} + +static void +ttlctlmsg(Conv *c, Cmdbuf *cb) +{ + if(cb->nf < 2) + c->ttl = MAXTTL; + else + c->ttl = atoi(cb->f[1]); +} + +static long +ipwrite(Chan* ch, void *v, long n, vlong off) +{ + Conv *c; + Proto *x; + char *p; + Cmdbuf *cb; + 
uchar ia[IPaddrlen], ma[IPaddrlen]; + Fs *f; + char *a; + ulong offset = off; + + a = v; + f = ipfs[ch->dev]; + + switch(TYPE(ch->qid)){ + default: + error(Eperm); + case Qdata: + x = f->p[PROTO(ch->qid)]; + c = x->conv[CONV(ch->qid)]; + + if(c->wq == nil) + error(Eperm); + + qwrite(c->wq, a, n); + break; + case Qarp: + return arpwrite(f, a, n); + case Qiproute: + return routewrite(f, ch, a, n); + case Qlog: + netlogctl(f, a, n); + return n; + case Qndb: + return ndbwrite(f, a, offset, n); + break; + case Qctl: + x = f->p[PROTO(ch->qid)]; + c = x->conv[CONV(ch->qid)]; + cb = parsecmd(a, n); + + QLOCK(c); + if(waserror()) { + QUNLOCK(c); + free(cb); + nexterror(); + } + if(cb->nf < 1) + error("short control request"); + if(strcmp(cb->f[0], "connect") == 0) + connectctlmsg(x, c, cb); + else if(strcmp(cb->f[0], "announce") == 0) + announcectlmsg(x, c, cb); + else if(strcmp(cb->f[0], "bind") == 0) + bindctlmsg(x, c, cb); + else if(strcmp(cb->f[0], "ttl") == 0) + ttlctlmsg(c, cb); + else if(strcmp(cb->f[0], "tos") == 0) + tosctlmsg(c, cb); + else if(strcmp(cb->f[0], "ignoreadvice") == 0) + c->ignoreadvice = 1; + else if(strcmp(cb->f[0], "addmulti") == 0){ + if(cb->nf < 2) + error("addmulti needs interface address"); + if(cb->nf == 2){ + if(!ipismulticast(c->raddr)) + error("addmulti for a non multicast address"); + if (parseip(ia, cb->f[1]) == -1) + error(Ebadip); + ipifcaddmulti(c, c->raddr, ia); + } else { + if (parseip(ia, cb->f[1]) == -1 || + parseip(ma, cb->f[2]) == -1) + error(Ebadip); + if(!ipismulticast(ma)) + error("addmulti for a non multicast address"); + ipifcaddmulti(c, ma, ia); + } + } else if(strcmp(cb->f[0], "remmulti") == 0){ + if(cb->nf < 2) + error("remmulti needs interface address"); + if(!ipismulticast(c->raddr)) + error("remmulti for a non multicast address"); + if (parseip(ia, cb->f[1]) == -1) + error(Ebadip); + ipifcremmulti(c, c->raddr, ia); + } else if(strcmp(cb->f[0], "maxfragsize") == 0){ + if(cb->nf < 2) + error("maxfragsize needs size"); + 
+ c->maxfragsize = (int)strtol(cb->f[1], nil, 0); + + } else if(x->ctl != nil) { + p = x->ctl(c, cb->f, cb->nf); + if(p != nil) + error(p); + } else + error("unknown control request"); + QUNLOCK(c); + free(cb); + poperror(); + } + return n; +} + +static long +ipbwrite(Chan* ch, Block* bp, ulong offset) +{ + Conv *c; + Proto *x; + Fs *f; + int n; + + switch(TYPE(ch->qid)){ + case Qdata: + f = ipfs[ch->dev]; + x = f->p[PROTO(ch->qid)]; + c = x->conv[CONV(ch->qid)]; + + if(c->wq == nil) + error(Eperm); + + if(bp->next) + bp = concatblock(bp); + n = BLEN(bp); + qbwrite(c->wq, bp); + return n; + default: + return devbwrite(ch, bp, offset); + } +} + +Dev ipdevtab = { + 'I', + "ip", + + ipreset, + devinit, + devshutdown, + ipattach, + ipwalk, + ipstat, + ipopen, + ipcreate, + ipclose, + ipread, + ipbread, + ipwrite, + ipbwrite, + ipremove, + ipwstat, +}; + +int +Fsproto(Fs *f, Proto *p) +{ + if(f->np >= Maxproto) + return -1; + + p->f = f; + + if(p->ipproto > 0){ + if(f->t2p[p->ipproto] != nil) + return -1; + f->t2p[p->ipproto] = p; + } + + p->qid.type = QTDIR; + p->qid.path = QID(f->np, 0, Qprotodir); + p->conv = malloc(sizeof(Conv*)*(p->nc+1)); + if(p->conv == nil) + panic("Fsproto"); + + p->x = f->np; + p->nextrport = 600; + f->p[f->np++] = p; + + return 0; +} + +/* + * return true if this protocol is + * built in + */ +int +Fsbuiltinproto(Fs* f, uchar proto) +{ + return f->t2p[proto] != nil; +} + +/* + * called with protocol locked + */ +Conv* +Fsprotoclone(Proto *p, char *user) +{ + Conv *c, **pp, **ep; + +retry: + c = nil; + ep = &p->conv[p->nc]; + for(pp = p->conv; pp < ep; pp++) { + c = *pp; + if(c == nil){ + c = malloc(sizeof(Conv)); + if(c == nil) + error(Enomem); + QLOCK(c); + c->p = p; + c->x = pp - p->conv; + if(p->ptclsize != 0){ + c->ptcl = malloc(p->ptclsize); + if(c->ptcl == nil) { + free(c); + error(Enomem); + } + } + *pp = c; + p->ac++; + c->eq = qopen(1024, Qmsg, 0, 0); + (*p->create)(c); + break; + } + if(CANQLOCK(c)){ + /* + * make sure both 
processes and protocol + * are done with this Conv + */ + if(c->inuse == 0 && (p->inuse == nil || (*p->inuse)(c) == 0)) + break; + + QUNLOCK(c); + } + } + if(pp >= ep) { + if(p->gc != nil && (*p->gc)(p)) + goto retry; + return nil; + } + + c->inuse = 1; + kstrdup(&c->owner, user); + c->perm = 0660; + c->state = Idle; + ipmove(c->laddr, IPnoaddr); + ipmove(c->raddr, IPnoaddr); + c->r = nil; + c->rgen = 0; + c->lport = 0; + c->rport = 0; + c->restricted = 0; + c->maxfragsize = 0; + c->ttl = MAXTTL; + qreopen(c->rq); + qreopen(c->wq); + qreopen(c->eq); + + QUNLOCK(c); + return c; +} + +int +Fsconnected(Conv* c, char* msg) +{ + if(msg != nil && *msg != '\0') + strncpy(c->cerr, msg, ERRMAX-1); + + switch(c->state){ + + case Announcing: + c->state = Announced; + break; + + case Connecting: + c->state = Connected; + break; + } + + wakeup(&c->cr); + return 0; +} + +Proto* +Fsrcvpcol(Fs* f, uchar proto) +{ + if(f->ipmux) + return f->ipmux; + else + return f->t2p[proto]; +} + +Proto* +Fsrcvpcolx(Fs *f, uchar proto) +{ + return f->t2p[proto]; +} + +/* + * called with protocol locked + */ +Conv* +Fsnewcall(Conv *c, uchar *raddr, ushort rport, uchar *laddr, ushort lport, uchar version) +{ + Conv *nc; + Conv **l; + int i; + + QLOCK(c); + i = 0; + for(l = &c->incall; *l; l = &(*l)->next) + i++; + if(i >= Maxincall) { + QUNLOCK(c); + return nil; + } + + /* find a free conversation */ + nc = Fsprotoclone(c->p, network); + if(nc == nil) { + QUNLOCK(c); + return nil; + } + ipmove(nc->raddr, raddr); + nc->rport = rport; + ipmove(nc->laddr, laddr); + nc->lport = lport; + nc->next = nil; + *l = nc; + nc->state = Connected; + nc->ipversion = version; + + QUNLOCK(c); + + wakeup(&c->listenr); + + return nc; +} + +long +ndbwrite(Fs *f, char *a, ulong off, int n) +{ + if(off > strlen(f->ndb)) + error(Eio); + if(off+n >= sizeof(f->ndb)) + error(Eio); + memmove(f->ndb+off, a, n); + f->ndb[off+n] = 0; + f->ndbvers++; + f->ndbmtime = seconds(); + return n; +} + +ulong +scalednconv(void) +{ + 
if(cpuserver && conf.npage*BY2PG >= 128*MB) + return Nchans*4; + return Nchans; +} diff --git a/src/9vx/a/ip/eipconvtest.c b/src/9vx/a/ip/eipconvtest.c @@ -0,0 +1,152 @@ +#include <u.h> +#include <libc.h> + +enum +{ + Isprefix= 16, +}; + +uchar prefixvals[256] = +{ +[0x00] 0 | Isprefix, +[0x80] 1 | Isprefix, +[0xC0] 2 | Isprefix, +[0xE0] 3 | Isprefix, +[0xF0] 4 | Isprefix, +[0xF8] 5 | Isprefix, +[0xFC] 6 | Isprefix, +[0xFE] 7 | Isprefix, +[0xFF] 8 | Isprefix, +}; + +uchar v4prefix[16] = { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0xff, 0xff, + 0, 0, 0, 0 +}; + +void +hnputl(void *p, ulong v) +{ + uchar *a; + + a = p; + a[0] = v>>24; + a[1] = v>>16; + a[2] = v>>8; + a[3] = v; +} + +int +eipconv(va_list *arg, Fconv *f) +{ + char buf[8*5]; + static char *efmt = "%.2lux%.2lux%.2lux%.2lux%.2lux%.2lux"; + static char *ifmt = "%d.%d.%d.%d"; + uchar *p, ip[16]; + ulong *lp; + ushort s; + int i, j, n, eln, eli; + + switch(f->chr) { + case 'E': /* Ethernet address */ + p = va_arg(*arg, uchar*); + sprint(buf, efmt, p[0], p[1], p[2], p[3], p[4], p[5]); + break; + case 'I': /* Ip address */ + p = va_arg(*arg, uchar*); +common: + if(memcmp(p, v4prefix, 12) == 0) + sprint(buf, ifmt, p[12], p[13], p[14], p[15]); + else { + /* find longest elision */ + eln = eli = -1; + for(i = 0; i < 16; i += 2){ + for(j = i; j < 16; j += 2) + if(p[j] != 0 || p[j+1] != 0) + break; + if(j > i && j - i > eln){ + eli = i; + eln = j - i; + } + } + + /* print with possible elision */ + n = 0; + for(i = 0; i < 16; i += 2){ + if(i == eli){ + n += sprint(buf+n, "::"); + i += eln; + if(i >= 16) + break; + } else if(i != 0) + n += sprint(buf+n, ":"); + s = (p[i]<<8) + p[i+1]; + n += sprint(buf+n, "%ux", s); + } + } + break; + case 'i': /* v6 address as 4 longs */ + lp = va_arg(*arg, ulong*); + for(i = 0; i < 4; i++) + hnputl(ip+4*i, *lp++); + p = ip; + goto common; + case 'V': /* v4 ip address */ + p = va_arg(*arg, uchar*); + sprint(buf, ifmt, p[0], p[1], p[2], p[3]); + break; + case 'M': /* ip mask */ + p = 
va_arg(*arg, uchar*); + + /* look for a prefix mask */ + for(i = 0; i < 16; i++) + if(p[i] != 0xff) + break; + if(i < 16){ + if((prefixvals[p[i]] & Isprefix) == 0) + goto common; + for(j = i+1; j < 16; j++) + if(p[j] != 0) + goto common; + n = 8*i + (prefixvals[p[i]] & ~Isprefix); + } else + n = 8*16; + + /* got one, use /xx format */ + sprint(buf, "/%d", n); + break; + default: + strcpy(buf, "(eipconv)"); + } + strconv(buf, f); + return sizeof(uchar*); +} + +uchar testvec[11][16] = +{ + { 0,0,0,0, 0,0,0,0, 0,0,0xff,0xff, 1,3,4,5, }, + { 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, }, + { 0xff,0xff,0x80,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }, + { 0xff,0xff,0xff,0xc0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }, + { 0xff,0xff,0xff,0xff, 0xe0,0,0,0, 0,0,0,0, 0,0,0,0, }, + { 0xff,0xff,0xff,0xff, 0xff,0xf0,0,0, 0,0,0,0, 0,0,0,0, }, + { 0xff,0xff,0xff,0xff, 0xff,0xff,0xf8,0, 0,0,0,0, 0,0,0,0, }, + { 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff, }, + { 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, }, + { 0,0,0,0, 0,0x11,0,0, 0,0,0,0, 0,0,0,0, }, + { 0,0,0,0x11, 0,0,0,0, 0,0,0,0, 0,0,0,0x12, }, +}; + +void +main(void) +{ + int i; + + fmtinstall('I', eipconv); + fmtinstall('M', eipconv); + for(i = 0; i < 11; i++) + print("%I\n%M\n", testvec[i], testvec[i]); + exits(0); +} diff --git a/src/9vx/a/ip/esp.c b/src/9vx/a/ip/esp.c @@ -0,0 +1,951 @@ +/* + * Encapsulating Security Payload for IPsec for IPv4, rfc1827. + * currently only implements tunnel mode. + * TODO: update to match rfc4303. 
+ */ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" +#include "ipv6.h" +#include "libsec.h" + +typedef struct Esphdr Esphdr; +typedef struct Esp4hdr Esp4hdr; +typedef struct Esp6hdr Esp6hdr; +typedef struct Esptail Esptail; +typedef struct Userhdr Userhdr; +typedef struct Esppriv Esppriv; +typedef struct Espcb Espcb; +typedef struct Algorithm Algorithm; + +enum +{ + IP_ESPPROTO = 50, /* IP v4 and v6 protocol number */ + Esp4hdrlen = IP4HDR + 8, + Esp6hdrlen = IP6HDR + 8, + + Esptaillen = 2, /* does not include pad or auth data */ + Userhdrlen = 4, /* user-visible header size - if enabled */ +}; + +struct Esphdr +{ + uchar espspi[4]; /* Security parameter index */ + uchar espseq[4]; /* Sequence number */ +}; + +/* + * tunnel-mode layout: IP | ESP | TCP/UDP | user data. + * transport-mode layout is: ESP | IP | TCP/UDP | user data. + */ +struct Esp4hdr +{ + /* ipv4 header */ + uchar vihl; /* Version and header length */ + uchar tos; /* Type of service */ + uchar length[2]; /* packet length */ + uchar id[2]; /* Identification */ + uchar frag[2]; /* Fragment information */ + uchar Unused; + uchar espproto; /* Protocol */ + uchar espplen[2]; /* Header plus data length */ + uchar espsrc[4]; /* Ip source */ + uchar espdst[4]; /* Ip destination */ + + /* Esphdr; */ + uchar espspi[4]; /* Security parameter index */ + uchar espseq[4]; /* Sequence number */ +}; + +/* tunnel-mode layout */ +struct Esp6hdr +{ + /* Ip6hdr; */ + uchar vcf[4]; /* version:4, traffic class:8, flow label:20 */ + uchar ploadlen[2]; /* payload length: packet length - 40 */ + uchar proto; /* next header type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; + + /* Esphdr; */ + uchar espspi[4]; /* Security parameter index */ + uchar espseq[4]; /* Sequence number */ +}; + +struct Esptail +{ + uchar pad; + uchar nexthdr; +}; + +/* header as seen by the user */ +struct Userhdr +{ + uchar nexthdr; /* 
next protocol */ + uchar unused[3]; +}; + +struct Esppriv +{ + ulong in; + ulong inerrors; +}; + +/* + * protocol specific part of Conv + */ +struct Espcb +{ + int incoming; + int header; /* user user level header */ + ulong spi; + ulong seq; /* last seq sent */ + ulong window; /* for replay attacks */ + char *espalg; + void *espstate; /* other state for esp */ + int espivlen; /* in bytes */ + int espblklen; + int (*cipher)(Espcb*, uchar *buf, int len); + char *ahalg; + void *ahstate; /* other state for esp */ + int ahlen; /* auth data length in bytes */ + int ahblklen; + int (*auth)(Espcb*, uchar *buf, int len, uchar *hash); +}; + +struct Algorithm +{ + char *name; + int keylen; /* in bits */ + void (*init)(Espcb*, char* name, uchar *key, int keylen); +}; + +static Conv* convlookup(Proto *esp, ulong spi); +static char *setalg(Espcb *ecb, char **f, int n, Algorithm *alg); +static void espkick(void *x); + +static void nullespinit(Espcb*, char*, uchar *key, int keylen); +static void desespinit(Espcb *ecb, char *name, uchar *k, int n); + +static void nullahinit(Espcb*, char*, uchar *key, int keylen); +static void shaahinit(Espcb*, char*, uchar *key, int keylen); +static void md5ahinit(Espcb*, char*, uchar *key, int keylen); + +static Algorithm espalg[] = +{ + "null", 0, nullespinit, +// "des3_cbc", 192, des3espinit, /* rfc2451 */ +// "aes_128_cbc", 128, aescbcespinit, /* rfc3602 */ +// "aes_ctr", 128, aesctrespinit, /* rfc3686 */ + "des_56_cbc", 64, desespinit, /* rfc2405, deprecated */ +// "rc4_128", 128, rc4espinit, /* gone in rfc4305 */ + nil, 0, nil, +}; + +static Algorithm ahalg[] = +{ + "null", 0, nullahinit, + "hmac_sha1_96", 128, shaahinit, /* rfc2404 */ +// "aes_xcbc_mac_96", 128, aesahinit, /* rfc3566 */ + "hmac_md5_96", 128, md5ahinit, /* rfc2403 */ + nil, 0, nil, +}; + +static char* +espconnect(Conv *c, char **argv, int argc) +{ + char *p, *pp; + char *e = nil; + ulong spi; + Espcb *ecb = (Espcb*)c->ptcl; + + switch(argc) { + default: + e = "bad args to 
connect"; + break; + case 2: + p = strchr(argv[1], '!'); + if(p == nil){ + e = "malformed address"; + break; + } + *p++ = 0; + parseip(c->raddr, argv[1]); + findlocalip(c->p->f, c->laddr, c->raddr); + ecb->incoming = 0; + ecb->seq = 0; + if(strcmp(p, "*") == 0) { + QLOCK(c->p); + for(;;) { + spi = nrand(1<<16) + 256; + if(convlookup(c->p, spi) == nil) + break; + } + QUNLOCK(c->p); + ecb->spi = spi; + ecb->incoming = 1; + qhangup(c->wq, nil); + } else { + spi = strtoul(p, &pp, 10); + if(pp == p) { + e = "malformed address"; + break; + } + ecb->spi = spi; + qhangup(c->rq, nil); + } + nullespinit(ecb, "null", nil, 0); + nullahinit(ecb, "null", nil, 0); + } + Fsconnected(c, e); + + return e; +} + + +static int +espstate(Conv *c, char *state, int n) +{ + return snprint(state, n, "%s", c->inuse?"Open\n":"Closed\n"); +} + +static void +espcreate(Conv *c) +{ + c->rq = qopen(64*1024, Qmsg, 0, 0); + c->wq = qopen(64*1024, Qkick, espkick, c); +} + +static void +espclose(Conv *c) +{ + Espcb *ecb; + + qclose(c->rq); + qclose(c->wq); + qclose(c->eq); + ipmove(c->laddr, IPnoaddr); + ipmove(c->raddr, IPnoaddr); + + ecb = (Espcb*)c->ptcl; + free(ecb->espstate); + free(ecb->ahstate); + memset(ecb, 0, sizeof(Espcb)); +} + +static int +ipvers(Conv *c) +{ + if((memcmp(c->raddr, v4prefix, IPv4off) == 0 && + memcmp(c->laddr, v4prefix, IPv4off) == 0) || + ipcmp(c->raddr, IPnoaddr) == 0) + return V4; + else + return V6; +} + +static void +espkick(void *x) +{ + Conv *c = x; + Esp4hdr *eh4; + Esp6hdr *eh6; + Esptail *et; + Userhdr *uh; + Espcb *ecb; + Block *bp; + int nexthdr, payload, pad, align, version, hdrlen, iphdrlen; + uchar *auth; + + version = ipvers(c); + iphdrlen = version == V4? IP4HDR: IP6HDR; + hdrlen = version == V4? 
Esp4hdrlen: Esp6hdrlen; + + bp = qget(c->wq); + if(bp == nil) + return; + + QLOCK(c); + ecb = c->ptcl; + + if(ecb->header) { + /* make sure the message has a User header */ + bp = pullupblock(bp, Userhdrlen); + if(bp == nil) { + QUNLOCK(c); + return; + } + uh = (Userhdr*)bp->rp; + nexthdr = uh->nexthdr; + bp->rp += Userhdrlen; + } else { + nexthdr = 0; /* what should this be? */ + } + + payload = BLEN(bp) + ecb->espivlen; + + /* Make space to fit ip header */ + bp = padblock(bp, hdrlen + ecb->espivlen); + + align = 4; + if(ecb->espblklen > align) + align = ecb->espblklen; + if(align % ecb->ahblklen != 0) + panic("espkick: ahblklen is important after all"); + pad = (align-1) - (payload + Esptaillen-1)%align; + + /* + * Make space for tail + * this is done by calling padblock with a negative size + * Padblock does not change bp->wp! + */ + bp = padblock(bp, -(pad+Esptaillen+ecb->ahlen)); + bp->wp += pad+Esptaillen+ecb->ahlen; + + eh4 = (Esp4hdr *)bp->rp; + eh6 = (Esp6hdr *)bp->rp; + et = (Esptail*)(bp->rp + hdrlen + payload + pad); + + /* fill in tail */ + et->pad = pad; + et->nexthdr = nexthdr; + + ecb->cipher(ecb, bp->rp + hdrlen, payload + pad + Esptaillen); + auth = bp->rp + hdrlen + payload + pad + Esptaillen; + + /* fill in head */ + if (version == V4) { + eh4->vihl = IP_VER4; + hnputl(eh4->espspi, ecb->spi); + hnputl(eh4->espseq, ++ecb->seq); + v6tov4(eh4->espsrc, c->laddr); + v6tov4(eh4->espdst, c->raddr); + eh4->espproto = IP_ESPPROTO; + eh4->frag[0] = 0; + eh4->frag[1] = 0; + } else { + eh6->vcf[0] = IP_VER6; + hnputl(eh6->espspi, ecb->spi); + hnputl(eh6->espseq, ++ecb->seq); + ipmove(eh6->src, c->laddr); + ipmove(eh6->dst, c->raddr); + eh6->proto = IP_ESPPROTO; + } + + ecb->auth(ecb, bp->rp + iphdrlen, (hdrlen - iphdrlen) + + payload + pad + Esptaillen, auth); + + QUNLOCK(c); + /* print("esp: pass down: %uld\n", BLEN(bp)); */ + if (version == V4) + ipoput4(c->p->f, bp, 0, c->ttl, c->tos, c); + else + ipoput6(c->p->f, bp, 0, c->ttl, c->tos, c); +} + +void 
+espiput(Proto *esp, Ipifc* _, Block *bp) +{ + Esp4hdr *eh4; + Esp6hdr *eh6; + Esptail *et; + Userhdr *uh; + Conv *c; + Espcb *ecb; + uchar raddr[IPaddrlen], laddr[IPaddrlen]; + Fs *f; + uchar *auth, *espspi; + ulong spi; + int payload, nexthdr, version, hdrlen; + + f = esp->f; + if (bp == nil || BLEN(bp) == 0) { + /* get enough to identify the IP version */ + bp = pullupblock(bp, IP4HDR); + if(bp == nil) { + netlog(f, Logesp, "esp: short packet\n"); + return; + } + } + eh4 = (Esp4hdr*)bp->rp; + version = ((eh4->vihl & 0xf0) == IP_VER4? V4: V6); + hdrlen = version == V4? Esp4hdrlen: Esp6hdrlen; + + bp = pullupblock(bp, hdrlen + Esptaillen); + if(bp == nil) { + netlog(f, Logesp, "esp: short packet\n"); + return; + } + + if (version == V4) { + eh4 = (Esp4hdr*)bp->rp; + spi = nhgetl(eh4->espspi); + v4tov6(raddr, eh4->espsrc); + v4tov6(laddr, eh4->espdst); + } else { + eh6 = (Esp6hdr*)bp->rp; + spi = nhgetl(eh6->espspi); + ipmove(raddr, eh6->src); + ipmove(laddr, eh6->dst); + } + + QLOCK(esp); + /* Look for a conversation structure for this port */ + c = convlookup(esp, spi); + if(c == nil) { + QUNLOCK(esp); + netlog(f, Logesp, "esp: no conv %I -> %I!%d\n", raddr, + laddr, spi); + icmpnoconv(f, bp); + freeblist(bp); + return; + } + + QLOCK(c); + QUNLOCK(esp); + + ecb = c->ptcl; + /* too hard to do decryption/authentication on block lists */ + if(bp->next) + bp = concatblock(bp); + + if(BLEN(bp) < hdrlen + ecb->espivlen + Esptaillen + ecb->ahlen) { + QUNLOCK(c); + netlog(f, Logesp, "esp: short block %I -> %I!%d\n", raddr, + laddr, spi); + freeb(bp); + return; + } + + auth = bp->wp - ecb->ahlen; + espspi = version == V4? 
((Esp4hdr*)bp->rp)->espspi: + ((Esp6hdr*)bp->rp)->espspi; + if(!ecb->auth(ecb, espspi, auth - espspi, auth)) { + QUNLOCK(c); +print("esp: bad auth %I -> %I!%ld\n", raddr, laddr, spi); + netlog(f, Logesp, "esp: bad auth %I -> %I!%d\n", raddr, + laddr, spi); + freeb(bp); + return; + } + + payload = BLEN(bp) - hdrlen - ecb->ahlen; + if(payload <= 0 || payload % 4 != 0 || payload % ecb->espblklen != 0) { + QUNLOCK(c); + netlog(f, Logesp, "esp: bad length %I -> %I!%d payload=%d BLEN=%d\n", + raddr, laddr, spi, payload, BLEN(bp)); + freeb(bp); + return; + } + if(!ecb->cipher(ecb, bp->rp + hdrlen, payload)) { + QUNLOCK(c); +print("esp: cipher failed %I -> %I!%ld: %s\n", raddr, laddr, spi, up->errstr); + netlog(f, Logesp, "esp: cipher failed %I -> %I!%d: %s\n", raddr, + laddr, spi, up->errstr); + freeb(bp); + return; + } + + payload -= Esptaillen; + et = (Esptail*)(bp->rp + hdrlen + payload); + payload -= et->pad + ecb->espivlen; + nexthdr = et->nexthdr; + if(payload <= 0) { + QUNLOCK(c); + netlog(f, Logesp, "esp: short packet after decrypt %I -> %I!%d\n", + raddr, laddr, spi); + freeb(bp); + return; + } + + /* trim packet */ + bp->rp += hdrlen + ecb->espivlen; + bp->wp = bp->rp + payload; + if(ecb->header) { + /* assume Userhdrlen < Esp4hdrlen < Esp6hdrlen */ + bp->rp -= Userhdrlen; + uh = (Userhdr*)bp->rp; + memset(uh, 0, Userhdrlen); + uh->nexthdr = nexthdr; + } + + if(qfull(c->rq)){ + netlog(f, Logesp, "esp: qfull %I -> %I.%uld\n", raddr, + laddr, spi); + freeblist(bp); + }else { +// print("esp: pass up: %uld\n", BLEN(bp)); + qpass(c->rq, bp); + } + + QUNLOCK(c); +} + +char* +espctl(Conv *c, char **f, int n) +{ + Espcb *ecb = c->ptcl; + char *e = nil; + + if(strcmp(f[0], "esp") == 0) + e = setalg(ecb, f, n, espalg); + else if(strcmp(f[0], "ah") == 0) + e = setalg(ecb, f, n, ahalg); + else if(strcmp(f[0], "header") == 0) + ecb->header = 1; + else if(strcmp(f[0], "noheader") == 0) + ecb->header = 0; + else + e = "unknown control request"; + return e; +} + +void 
+espadvise(Proto *esp, Block *bp, char *msg) +{ + Esp4hdr *h; + Conv *c; + ulong spi; + + h = (Esp4hdr*)(bp->rp); + + /* the SPI is a 4-byte field: espkick writes it with hnputl, espiput reads it with nhgetl */ + spi = nhgetl(h->espspi); + QLOCK(esp); + c = convlookup(esp, spi); + if(c != nil) { + qhangup(c->rq, msg); + qhangup(c->wq, msg); + } + QUNLOCK(esp); + freeblist(bp); +} + +int +espstats(Proto *esp, char *buf, int len) +{ + Esppriv *upriv; + + upriv = esp->priv; + return snprint(buf, len, "%lud %lud\n", + upriv->in, + upriv->inerrors); +} + +static int +esplocal(Conv *c, char *buf, int len) +{ + Espcb *ecb = c->ptcl; + int n; + + QLOCK(c); + if(ecb->incoming) + n = snprint(buf, len, "%I!%uld\n", c->laddr, ecb->spi); + else + n = snprint(buf, len, "%I\n", c->laddr); + QUNLOCK(c); + return n; +} + +static int +espremote(Conv *c, char *buf, int len) +{ + Espcb *ecb = c->ptcl; + int n; + + QLOCK(c); + if(ecb->incoming) + n = snprint(buf, len, "%I\n", c->raddr); + else + n = snprint(buf, len, "%I!%uld\n", c->raddr, ecb->spi); + QUNLOCK(c); + return n; +} + +static Conv* +convlookup(Proto *esp, ulong spi) +{ + Conv *c, **p; + Espcb *ecb; + + for(p=esp->conv; *p; p++){ + c = *p; + ecb = c->ptcl; + if(ecb->incoming && ecb->spi == spi) + return c; + } + return nil; +} + +static char * +setalg(Espcb *ecb, char **f, int n, Algorithm *alg) +{ + uchar *key; + int c, i, nbyte, nchar; + + if(n < 2) + return "bad format"; + for(; alg->name; alg++) + if(strcmp(f[1], alg->name) == 0) + break; + if(alg->name == nil) + return "unknown algorithm"; + + if(n != 3) + return "bad format"; + nbyte = (alg->keylen + 7) >> 3; + nchar = strlen(f[2]); + for(i=0; i<nchar; i++) { + c = f[2][i]; + if(c >= '0' && c <= '9') + f[2][i] -= '0'; + else if(c >= 'a' && c <= 'f') + f[2][i] -= 'a'-10; + else if(c >= 'A' && c <= 'F') + f[2][i] -= 'A'-10; + else + return "bad character in key"; + } + key = smalloc(nbyte); + /* i counts hex digits, two per key byte, so bound the byte index i/2 (i*2 stopped after nbyte/4 bytes) */ + for(i=0; i<nchar && i/2<nbyte; i++) { + c = f[2][nchar-i-1]; + if(i&1) + c <<= 4; + key[i>>1] |= c; + } + + alg->init(ecb, alg->name, key, alg->keylen); + free(key); + 
return nil; +} + +static int +nullcipher(Espcb* _, uchar* __, int ___) +{ + return 1; +} + +static void +nullespinit(Espcb *ecb, char *name, uchar* _, int __) +{ + ecb->espalg = name; + ecb->espblklen = 1; + ecb->espivlen = 0; + ecb->cipher = nullcipher; +} + +static int +nullauth(Espcb* _, uchar* __, int ___, uchar* ____) +{ + return 1; +} + +static void +nullahinit(Espcb *ecb, char *name, uchar* _, int __) +{ + ecb->ahalg = name; + ecb->ahblklen = 1; + ecb->ahlen = 0; + ecb->auth = nullauth; +} + +void +seanq_hmac_sha1(uchar hash[SHA1dlen], uchar *t, long tlen, uchar *key, long klen) +{ + uchar ipad[65], opad[65]; + int i; + DigestState *digest; + uchar innerhash[SHA1dlen]; + + for(i=0; i<64; i++){ + ipad[i] = 0x36; + opad[i] = 0x5c; + } + ipad[64] = opad[64] = 0; + for(i=0; i<klen; i++){ + ipad[i] ^= key[i]; + opad[i] ^= key[i]; + } + digest = sha1(ipad, 64, nil, nil); + sha1(t, tlen, innerhash, digest); + digest = sha1(opad, 64, nil, nil); + sha1(innerhash, SHA1dlen, hash, digest); +} + +static int +shaauth(Espcb *ecb, uchar *t, int tlen, uchar *auth) +{ + uchar hash[SHA1dlen]; + int r; + + memset(hash, 0, SHA1dlen); + seanq_hmac_sha1(hash, t, tlen, (uchar*)ecb->ahstate, 16); + r = memcmp(auth, hash, ecb->ahlen) == 0; + memmove(auth, hash, ecb->ahlen); + return r; +} + +static void +shaahinit(Espcb *ecb, char *name, uchar *key, int klen) +{ + if(klen != 128) + panic("shaahinit: bad keylen"); + klen >>= 3; /* convert bits to bytes: 128 -> 16, the key length shaauth hands to the hmac (>>8 gave 0, so no key was copied) */ + + ecb->ahalg = name; + ecb->ahblklen = 1; + ecb->ahlen = 12; + ecb->auth = shaauth; + ecb->ahstate = smalloc(klen); + memmove(ecb->ahstate, key, klen); +} + +void +seanq_hmac_md5(uchar hash[MD5dlen], uchar *t, long tlen, uchar *key, long klen) +{ + uchar ipad[65], opad[65]; + int i; + DigestState *digest; + uchar innerhash[MD5dlen]; + + for(i=0; i<64; i++){ + ipad[i] = 0x36; + opad[i] = 0x5c; + } + ipad[64] = opad[64] = 0; + for(i=0; i<klen; i++){ + ipad[i] ^= key[i]; + opad[i] ^= key[i]; + } + digest = md5(ipad, 64, nil, nil); + 
md5(t, tlen, innerhash, digest); + digest = md5(opad, 64, nil, nil); + md5(innerhash, MD5dlen, hash, digest); +} + +static int +md5auth(Espcb *ecb, uchar *t, int tlen, uchar *auth) +{ + uchar hash[MD5dlen]; + int r; + + memset(hash, 0, MD5dlen); + seanq_hmac_md5(hash, t, tlen, (uchar*)ecb->ahstate, 16); + r = memcmp(auth, hash, ecb->ahlen) == 0; + memmove(auth, hash, ecb->ahlen); + return r; +} + +static void +md5ahinit(Espcb *ecb, char *name, uchar *key, int klen) +{ + if(klen != 128) + panic("md5ahinit: bad keylen"); + klen >>= 3; /* convert to bytes */ + + ecb->ahalg = name; + ecb->ahblklen = 1; + ecb->ahlen = 12; + ecb->auth = md5auth; + ecb->ahstate = smalloc(klen); + memmove(ecb->ahstate, key, klen); +} + +static int +descipher(Espcb *ecb, uchar *p, int n) +{ + uchar tmp[8]; + uchar *pp, *tp, *ip, *eip, *ep; + DESstate *ds = ecb->espstate; + + ep = p + n; + if(ecb->incoming) { + memmove(ds->ivec, p, 8); + p += 8; + while(p < ep){ + memmove(tmp, p, 8); + block_cipher(ds->expanded, p, 1); + tp = tmp; + ip = ds->ivec; + for(eip = ip+8; ip < eip; ){ + *p++ ^= *ip; + *ip++ = *tp++; + } + } + } else { + memmove(p, ds->ivec, 8); + for(p += 8; p < ep; p += 8){ + pp = p; + ip = ds->ivec; + for(eip = ip+8; ip < eip; ) + *pp++ ^= *ip++; + block_cipher(ds->expanded, p, 0); + memmove(ds->ivec, p, 8); + } + } + return 1; +} + +static void +desespinit(Espcb *ecb, char *name, uchar *k, int n) +{ + uchar key[8], ivec[8]; + int i; + + /* bits to bytes */ + n = (n+7)>>3; + if(n > 8) + n = 8; + memset(key, 0, sizeof(key)); + memmove(key, k, n); + for(i=0; i<8; i++) + ivec[i] = nrand(256); + ecb->espalg = name; + ecb->espblklen = 8; + ecb->espivlen = 8; + ecb->cipher = descipher; + ecb->espstate = smalloc(sizeof(DESstate)); + setupDESstate(ecb->espstate, key, ivec); +} + +void +espinit(Fs *fs) +{ + Proto *esp; + + esp = smalloc(sizeof(Proto)); + esp->priv = smalloc(sizeof(Esppriv)); + esp->name = "esp"; + esp->connect = espconnect; + esp->announce = nil; + esp->ctl = espctl; + 
esp->state = espstate; + esp->create = espcreate; + esp->close = espclose; + esp->rcv = espiput; + esp->advise = espadvise; + esp->stats = espstats; + esp->local = esplocal; + esp->remote = espremote; + esp->ipproto = IP_ESPPROTO; + esp->nc = Nchans; + esp->ptclsize = sizeof(Espcb); + + Fsproto(fs, esp); +} + + +#ifdef notdef +enum { + RC4forward= 10*1024*1024, /* maximum skip forward */ + RC4back = 100*1024, /* maximum look back */ +}; + +typedef struct Esprc4 Esprc4; +struct Esprc4 +{ + ulong cseq; /* current byte sequence number */ + RC4state current; + + int ovalid; /* old is valid */ + ulong lgseq; /* last good sequence */ + ulong oseq; /* old byte sequence number */ + RC4state old; +}; + +static void rc4espinit(Espcb *ecb, char *name, uchar *k, int n); + +static int +rc4cipher(Espcb *ecb, uchar *p, int n) +{ + Esprc4 *esprc4; + RC4state tmpstate; + ulong seq; + long d, dd; + + if(n < 4) + return 0; + + esprc4 = ecb->espstate; + if(ecb->incoming) { + seq = nhgetl(p); + p += 4; + n -= 4; + d = seq-esprc4->cseq; + if(d == 0) { + rc4(&esprc4->current, p, n); + esprc4->cseq += n; + if(esprc4->ovalid) { + dd = esprc4->cseq - esprc4->lgseq; + if(dd > RC4back) + esprc4->ovalid = 0; + } + } else if(d > 0) { +print("esp rc4cipher: missing packet: %uld %ld\n", seq, d); /* this link is hosed */ + if(d > RC4forward) { + strcpy(up->errstr, "rc4cipher: skipped too much"); + return 0; + } + esprc4->lgseq = seq; + if(!esprc4->ovalid) { + esprc4->ovalid = 1; + esprc4->oseq = esprc4->cseq; + memmove(&esprc4->old, &esprc4->current, + sizeof(RC4state)); + } + rc4skip(&esprc4->current, d); + rc4(&esprc4->current, p, n); + esprc4->cseq = seq+n; + } else { +print("esp rc4cipher: reordered packet: %uld %ld\n", seq, d); + dd = seq - esprc4->oseq; + if(!esprc4->ovalid || -d > RC4back || dd < 0) { + strcpy(up->errstr, "rc4cipher: too far back"); + return 0; + } + memmove(&tmpstate, &esprc4->old, sizeof(RC4state)); + rc4skip(&tmpstate, dd); + rc4(&tmpstate, p, n); + return 1; + } + + /* 
move old state up */ + if(esprc4->ovalid) { + dd = esprc4->cseq - RC4back - esprc4->oseq; + if(dd > 0) { + rc4skip(&esprc4->old, dd); + esprc4->oseq += dd; + } + } + } else { + hnputl(p, esprc4->cseq); + p += 4; + n -= 4; + rc4(&esprc4->current, p, n); + esprc4->cseq += n; + } + return 1; +} + +static void +rc4espinit(Espcb *ecb, char *name, uchar *k, int n) +{ + Esprc4 *esprc4; + + /* bits to bytes */ + n = (n+7)>>3; + esprc4 = smalloc(sizeof(Esprc4)); + memset(esprc4, 0, sizeof(Esprc4)); + setupRC4state(&esprc4->current, k, n); + ecb->espalg = name; + ecb->espblklen = 4; + ecb->espivlen = 4; + ecb->cipher = rc4cipher; + ecb->espstate = esprc4; +} +#endif diff --git a/src/9vx/a/ip/ethermedium.c b/src/9vx/a/ip/ethermedium.c @@ -0,0 +1,766 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "netif.h" +#include "ip.h" +#include "ipv6.h" + +typedef struct Etherhdr Etherhdr; +struct Etherhdr +{ + uchar d[6]; + uchar s[6]; + uchar t[2]; +}; + +static uchar ipbroadcast[IPaddrlen] = { + 0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff, + 0xff,0xff,0xff,0xff, +}; + +static uchar etherbroadcast[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + +static void etherread4(void *a); +static void etherread6(void *a); +static void etherbind(Ipifc *ifc, int argc, char **argv); +static void etherunbind(Ipifc *ifc); +static void etherbwrite(Ipifc *ifc, Block *bp, int version, uchar *ip); +static void etheraddmulti(Ipifc *ifc, uchar *a, uchar *ia); +static void etherremmulti(Ipifc *ifc, uchar *a, uchar *ia); +static Block* multicastarp(Fs *f, Arpent *a, Medium*, uchar *mac); +static void sendarp(Ipifc *ifc, Arpent *a); +static void sendgarp(Ipifc *ifc, uchar*); +static int multicastea(uchar *ea, uchar *ip); +static void recvarpproc(void*); +static void resolveaddr6(Ipifc *ifc, Arpent *a); +static void etherpref2addr(uchar *pref, uchar *ea); + +Medium ethermedium = +{ +.name= "ether", +.hsize= 14, 
+.mintu= 60, +.maxtu= 1514, +.maclen= 6, +.bind= etherbind, +.unbind= etherunbind, +.bwrite= etherbwrite, +.addmulti= etheraddmulti, +.remmulti= etherremmulti, +.ares= arpenter, +.areg= sendgarp, +.pref2addr= etherpref2addr, +}; + +Medium gbemedium = +{ +.name= "gbe", +.hsize= 14, +.mintu= 60, +.maxtu= 9014, +.maclen= 6, +.bind= etherbind, +.unbind= etherunbind, +.bwrite= etherbwrite, +.addmulti= etheraddmulti, +.remmulti= etherremmulti, +.ares= arpenter, +.areg= sendgarp, +.pref2addr= etherpref2addr, +}; + +typedef struct Etherrock Etherrock; +struct Etherrock +{ + Fs *f; /* file system we belong to */ + Proc *arpp; /* arp process */ + Proc *read4p; /* reading process (v4)*/ + Proc *read6p; /* reading process (v6)*/ + Chan *mchan4; /* Data channel for v4 */ + Chan *achan; /* Arp channel */ + Chan *cchan4; /* Control channel for v4 */ + Chan *mchan6; /* Data channel for v6 */ + Chan *cchan6; /* Control channel for v6 */ +}; + +/* + * ethernet arp request + */ +enum +{ + ARPREQUEST = 1, + ARPREPLY = 2, +}; + +typedef struct Etherarp Etherarp; +struct Etherarp +{ + uchar d[6]; + uchar s[6]; + uchar type[2]; + uchar hrd[2]; + uchar pro[2]; + uchar hln; + uchar pln; + uchar op[2]; + uchar sha[6]; + uchar spa[4]; + uchar tha[6]; + uchar tpa[4]; +}; + +static char *nbmsg = "nonblocking"; + +/* + * called to bind an IP ifc to an ethernet device + * called with ifc wlock'd + */ + +static void +etherbind(Ipifc *ifc, int argc, char **argv) +{ + Chan *mchan4, *cchan4, *achan, *mchan6, *cchan6, *schan; + char addr[Maxpath]; //char addr[2*KNAMELEN]; + char dir[Maxpath]; //char dir[2*KNAMELEN]; + char *buf; + int n; + char *ptr; + Etherrock *er; + + if(argc < 2) + error(Ebadarg); + + mchan4 = cchan4 = achan = mchan6 = cchan6 = nil; + buf = nil; + if(waserror()){ + if(mchan4 != nil) + cclose(mchan4); + if(cchan4 != nil) + cclose(cchan4); + if(achan != nil) + cclose(achan); + if(mchan6 != nil) + cclose(mchan6); + if(cchan6 != nil) + cclose(cchan6); + if(buf != nil) + free(buf); + 
nexterror(); + } + + /* + * open ipv4 conversation + * + * the dial will fail if the type is already open on + * this device. + */ + snprint(addr, sizeof(addr), "%s!0x800", argv[2]); /* ETIP4 */ + mchan4 = chandial(addr, nil, dir, &cchan4); + + /* + * make it non-blocking + */ + devtab[cchan4->type]->write(cchan4, nbmsg, strlen(nbmsg), 0); + + /* + * get mac address and speed + */ + snprint(addr, sizeof(addr), "%s/stats", argv[2]); + buf = smalloc(512); + schan = namec(addr, Aopen, OREAD, 0); + if(waserror()){ + cclose(schan); + nexterror(); + } + n = devtab[schan->type]->read(schan, buf, 511, 0); + cclose(schan); + poperror(); + buf[n] = 0; + + ptr = strstr(buf, "addr: "); + if(!ptr) + error(Eio); + ptr += 6; + parsemac(ifc->mac, ptr, 6); + + ptr = strstr(buf, "mbps: "); + if(ptr){ + ptr += 6; + ifc->mbps = atoi(ptr); + } else + ifc->mbps = 100; + + /* + * open arp conversation + */ + snprint(addr, sizeof(addr), "%s!0x806", argv[2]); /* ETARP */ + achan = chandial(addr, nil, nil, nil); + + /* + * open ipv6 conversation + * + * the dial will fail if the type is already open on + * this device. 
+ */ + snprint(addr, sizeof(addr), "%s!0x86DD", argv[2]); /* ETIP6 */ + mchan6 = chandial(addr, nil, dir, &cchan6); + + /* + * make it non-blocking + */ + devtab[cchan6->type]->write(cchan6, nbmsg, strlen(nbmsg), 0); + + er = smalloc(sizeof(*er)); + er->mchan4 = mchan4; + er->cchan4 = cchan4; + er->achan = achan; + er->mchan6 = mchan6; + er->cchan6 = cchan6; + er->f = ifc->conv->p->f; + ifc->arg = er; + + free(buf); + poperror(); + + kproc("etherread4", etherread4, ifc); + kproc("recvarpproc", recvarpproc, ifc); + kproc("etherread6", etherread6, ifc); +} + +/* + * called with ifc wlock'd + */ +static void +etherunbind(Ipifc *ifc) +{ + Etherrock *er = ifc->arg; + + if(er->read4p) + postnote(er->read4p, 1, "unbind", 0); + if(er->read6p) + postnote(er->read6p, 1, "unbind", 0); + if(er->arpp) + postnote(er->arpp, 1, "unbind", 0); + + /* wait for readers to die */ + while(er->arpp != 0 || er->read4p != 0 || er->read6p != 0) + tsleep(&up->sleep, return0, 0, 300); + + if(er->mchan4 != nil) + cclose(er->mchan4); + if(er->achan != nil) + cclose(er->achan); + if(er->cchan4 != nil) + cclose(er->cchan4); + if(er->mchan6 != nil) + cclose(er->mchan6); + if(er->cchan6 != nil) + cclose(er->cchan6); + + free(er); +} + +/* + * called by ipoput with a single block to write with ifc RLOCK'd + */ +static void +etherbwrite(Ipifc *ifc, Block *bp, int version, uchar *ip) +{ + Etherhdr *eh; + Arpent *a; + uchar mac[6]; + Etherrock *er = ifc->arg; + + /* get mac address of destination */ + a = arpget(er->f->arp, bp, version, ifc, ip, mac); + if(a){ + /* check for broadcast or multicast */ + bp = multicastarp(er->f, a, ifc->m, mac); + if(bp==nil){ + switch(version){ + case V4: + sendarp(ifc, a); + break; + case V6: + resolveaddr6(ifc, a); + break; + default: + panic("etherbwrite: version %d", version); + } + return; + } + } + + /* make it a single block with space for the ether header */ + bp = padblock(bp, ifc->m->hsize); + if(bp->next) + bp = concatblock(bp); + if(BLEN(bp) < ifc->mintu) + 
bp = adjustblock(bp, ifc->mintu); + eh = (Etherhdr*)bp->rp; + + /* copy in mac addresses and ether type */ + memmove(eh->s, ifc->mac, sizeof(eh->s)); + memmove(eh->d, mac, sizeof(eh->d)); + + switch(version){ + case V4: + eh->t[0] = 0x08; + eh->t[1] = 0x00; + devtab[er->mchan4->type]->bwrite(er->mchan4, bp, 0); + break; + case V6: + eh->t[0] = 0x86; + eh->t[1] = 0xDD; + devtab[er->mchan6->type]->bwrite(er->mchan6, bp, 0); + break; + default: + panic("etherbwrite2: version %d", version); + } + ifc->out++; +} + + +/* + * process to read from the ethernet + */ +static void +etherread4(void *a) +{ + Ipifc *ifc; + Block *bp; + Etherrock *er; + + ifc = a; + er = ifc->arg; + er->read4p = up; /* hide identity under a rock for unbind */ + if(waserror()){ + er->read4p = 0; + pexit("hangup", 1); + } + for(;;){ + bp = devtab[er->mchan4->type]->bread(er->mchan4, ifc->maxtu, 0); + if(!CANRLOCK(ifc)){ + freeb(bp); + continue; + } + if(waserror()){ + RUNLOCK(ifc); + nexterror(); + } + ifc->in++; + bp->rp += ifc->m->hsize; + if(ifc->lifc == nil) + freeb(bp); + else + ipiput4(er->f, ifc, bp); + RUNLOCK(ifc); + poperror(); + } +} + + +/* + * process to read from the ethernet, IPv6 + */ +static void +etherread6(void *a) +{ + Ipifc *ifc; + Block *bp; + Etherrock *er; + + ifc = a; + er = ifc->arg; + er->read6p = up; /* hide identity under a rock for unbind */ + if(waserror()){ + er->read6p = 0; + pexit("hangup", 1); + } + for(;;){ + bp = devtab[er->mchan6->type]->bread(er->mchan6, ifc->maxtu, 0); + if(!CANRLOCK(ifc)){ + freeb(bp); + continue; + } + if(waserror()){ + RUNLOCK(ifc); + nexterror(); + } + ifc->in++; + bp->rp += ifc->m->hsize; + if(ifc->lifc == nil) + freeb(bp); + else + ipiput6(er->f, ifc, bp); + RUNLOCK(ifc); + poperror(); + } +} + +static void +etheraddmulti(Ipifc *ifc, uchar *a, uchar *_) +{ + uchar mac[6]; + char buf[64]; + Etherrock *er = ifc->arg; + int version; + + version = multicastea(mac, a); + sprint(buf, "addmulti %E", mac); + switch(version){ + case V4: + 
devtab[er->cchan4->type]->write(er->cchan4, buf, strlen(buf), 0); + break; + case V6: + devtab[er->cchan6->type]->write(er->cchan6, buf, strlen(buf), 0); + break; + default: + panic("etheraddmulti: version %d", version); + } +} + +static void +etherremmulti(Ipifc *ifc, uchar *a, uchar *_) +{ + uchar mac[6]; + char buf[64]; + Etherrock *er = ifc->arg; + int version; + + version = multicastea(mac, a); + sprint(buf, "remmulti %E", mac); + switch(version){ + case V4: + devtab[er->cchan4->type]->write(er->cchan4, buf, strlen(buf), 0); + break; + case V6: + devtab[er->cchan6->type]->write(er->cchan6, buf, strlen(buf), 0); + break; + default: + panic("etherremmulti: version %d", version); + } +} + +/* + * send an ethernet arp + * (only v4, v6 uses the neighbor discovery, rfc1970) + */ +static void +sendarp(Ipifc *ifc, Arpent *a) +{ + int n; + Block *bp; + Etherarp *e; + Etherrock *er = ifc->arg; + + /* don't do anything if it's been less than a second since the last */ + if(NOW - a->ctime < 1000){ + arprelease(er->f->arp, a); + return; + } + + /* remove all but the last message */ + while((bp = a->hold) != nil){ + if(bp == a->last) + break; + a->hold = bp->list; + freeblist(bp); + } + + /* try to keep it around for a second more */ + a->ctime = NOW; + arprelease(er->f->arp, a); + + n = sizeof(Etherarp); + if(n < a->type->mintu) + n = a->type->mintu; + bp = allocb(n); + memset(bp->rp, 0, n); + e = (Etherarp*)bp->rp; + memmove(e->tpa, a->ip+IPv4off, sizeof(e->tpa)); + ipv4local(ifc, e->spa); + memmove(e->sha, ifc->mac, sizeof(e->sha)); + memset(e->d, 0xff, sizeof(e->d)); /* ethernet broadcast */ + memmove(e->s, ifc->mac, sizeof(e->s)); + + hnputs(e->type, ETARP); + hnputs(e->hrd, 1); + hnputs(e->pro, ETIP4); + e->hln = sizeof(e->sha); + e->pln = sizeof(e->spa); + hnputs(e->op, ARPREQUEST); + bp->wp += n; + + devtab[er->achan->type]->bwrite(er->achan, bp, 0); +} + +static void +resolveaddr6(Ipifc *ifc, Arpent *a) +{ + int sflag; + Block *bp; + Etherrock *er = ifc->arg; + 
uchar ipsrc[IPaddrlen]; + + /* don't do anything if it's been less than a second since the last */ + if(NOW - a->ctime < ReTransTimer){ + arprelease(er->f->arp, a); + return; + } + + /* remove all but the last message */ + while((bp = a->hold) != nil){ + if(bp == a->last) + break; + a->hold = bp->list; + freeblist(bp); + } + + /* try to keep it around for a second more */ + a->ctime = NOW; + a->rtime = NOW + ReTransTimer; + if(a->rxtsrem <= 0) { + arprelease(er->f->arp, a); + return; + } + + a->rxtsrem--; + arprelease(er->f->arp, a); + + if((sflag = ipv6anylocal(ifc, ipsrc)) != 0) + icmpns(er->f, ipsrc, sflag, a->ip, TARG_MULTI, ifc->mac); +} + +/* + * send a gratuitous arp to refresh arp caches + */ +static void +sendgarp(Ipifc *ifc, uchar *ip) +{ + int n; + Block *bp; + Etherarp *e; + Etherrock *er = ifc->arg; + + /* don't arp for our initial non address */ + if(ipcmp(ip, IPnoaddr) == 0) + return; + + n = sizeof(Etherarp); + if(n < ifc->m->mintu) + n = ifc->m->mintu; + bp = allocb(n); + memset(bp->rp, 0, n); + e = (Etherarp*)bp->rp; + memmove(e->tpa, ip+IPv4off, sizeof(e->tpa)); + memmove(e->spa, ip+IPv4off, sizeof(e->spa)); + memmove(e->sha, ifc->mac, sizeof(e->sha)); + memset(e->d, 0xff, sizeof(e->d)); /* ethernet broadcast */ + memmove(e->s, ifc->mac, sizeof(e->s)); + + hnputs(e->type, ETARP); + hnputs(e->hrd, 1); + hnputs(e->pro, ETIP4); + e->hln = sizeof(e->sha); + e->pln = sizeof(e->spa); + hnputs(e->op, ARPREQUEST); + bp->wp += n; + + devtab[er->achan->type]->bwrite(er->achan, bp, 0); +} + +static void +recvarp(Ipifc *ifc) +{ + int n; + Block *ebp, *rbp; + Etherarp *e, *r; + uchar ip[IPaddrlen]; + static uchar eprinted[4]; + Etherrock *er = ifc->arg; + + ebp = devtab[er->achan->type]->bread(er->achan, ifc->maxtu, 0); + if(ebp == nil) + return; + + e = (Etherarp*)ebp->rp; + switch(nhgets(e->op)) { + default: + break; + + case ARPREPLY: + /* check for machine using my ip address */ + v4tov6(ip, e->spa); + if(iplocalonifc(ifc, ip) || ipproxyifc(er->f, ifc, 
ip)){ + if(memcmp(e->sha, ifc->mac, sizeof(e->sha)) != 0){ + print("arprep: 0x%E/0x%E also has ip addr %V\n", + e->s, e->sha, e->spa); + break; + } + } + + /* make sure we're not entering broadcast addresses */ + if(ipcmp(ip, ipbroadcast) == 0 || + !memcmp(e->sha, etherbroadcast, sizeof(e->sha))){ + /* %I takes a full IPaddrlen address: pass ip (e->spa widened by v4tov6 above), not the 4-byte e->spa */ + print("arprep: 0x%E/0x%E cannot register broadcast address %I\n", + e->s, e->sha, ip); + break; + } + + arpenter(er->f, V4, e->spa, e->sha, sizeof(e->sha), 0); + break; + + case ARPREQUEST: + /* don't answer arps till we know who we are */ + if(ifc->lifc == 0) + break; + + /* check for machine using my ip or ether address */ + v4tov6(ip, e->spa); + if(iplocalonifc(ifc, ip) || ipproxyifc(er->f, ifc, ip)){ + if(memcmp(e->sha, ifc->mac, sizeof(e->sha)) != 0){ + if (memcmp(eprinted, e->spa, sizeof(e->spa))){ + /* print only once */ + print("arpreq: 0x%E also has ip addr %V\n", e->sha, e->spa); + memmove(eprinted, e->spa, sizeof(e->spa)); + } + } + } else { + if(memcmp(e->sha, ifc->mac, sizeof(e->sha)) == 0){ + print("arpreq: %V also has ether addr %E\n", e->spa, e->sha); + break; + } + } + + /* refresh what we know about sender */ + arpenter(er->f, V4, e->spa, e->sha, sizeof(e->sha), 1); + + /* answer only requests for our address or systems we're proxying for */ + v4tov6(ip, e->tpa); + if(!iplocalonifc(ifc, ip)) + if(!ipproxyifc(er->f, ifc, ip)) + break; + + n = sizeof(Etherarp); + if(n < ifc->mintu) + n = ifc->mintu; + rbp = allocb(n); + r = (Etherarp*)rbp->rp; + /* zero all n bytes sent, padding up to mintu included, as sendarp and sendgarp do */ + memset(r, 0, n); + hnputs(r->type, ETARP); + hnputs(r->hrd, 1); + hnputs(r->pro, ETIP4); + r->hln = sizeof(r->sha); + r->pln = sizeof(r->spa); + hnputs(r->op, ARPREPLY); + memmove(r->tha, e->sha, sizeof(r->tha)); + memmove(r->tpa, e->spa, sizeof(r->tpa)); + memmove(r->sha, ifc->mac, sizeof(r->sha)); + memmove(r->spa, e->tpa, sizeof(r->spa)); + memmove(r->d, e->sha, sizeof(r->d)); + memmove(r->s, ifc->mac, sizeof(r->s)); + rbp->wp += n; + + devtab[er->achan->type]->bwrite(er->achan, rbp, 0); + } 
+ freeb(ebp); +} + +static void +recvarpproc(void *v) +{ + Ipifc *ifc = v; + Etherrock *er = ifc->arg; + + er->arpp = up; + if(waserror()){ + er->arpp = 0; + pexit("hangup", 1); + } + for(;;) + recvarp(ifc); +} + +static int +multicastea(uchar *ea, uchar *ip) +{ + int x; + + switch(x = ipismulticast(ip)){ + case V4: + ea[0] = 0x01; + ea[1] = 0x00; + ea[2] = 0x5e; + ea[3] = ip[13] & 0x7f; + ea[4] = ip[14]; + ea[5] = ip[15]; + break; + case V6: + ea[0] = 0x33; + ea[1] = 0x33; + ea[2] = ip[12]; + ea[3] = ip[13]; + ea[4] = ip[14]; + ea[5] = ip[15]; + break; + } + return x; +} + +/* + * fill in an arp entry for broadcast or multicast + * addresses. Return the first queued packet for the + * IP address. + */ +static Block* +multicastarp(Fs *f, Arpent *a, Medium *medium, uchar *mac) +{ + /* is it broadcast? */ + switch(ipforme(f, a->ip)){ + case Runi: + return nil; + case Rbcast: + memset(mac, 0xff, 6); + return arpresolve(f->arp, a, medium, mac); + default: + break; + } + + /* if multicast, fill in mac */ + switch(multicastea(mac, a->ip)){ + case V4: + case V6: + return arpresolve(f->arp, a, medium, mac); + } + + /* let arp take care of it */ + return nil; +} + +void +ethermediumlink(void) +{ + addipmedium(&ethermedium); + addipmedium(&gbemedium); +} + + +static void +etherpref2addr(uchar *pref, uchar *ea) +{ + pref[8] = ea[0] | 0x2; + pref[9] = ea[1]; + pref[10] = ea[2]; + pref[11] = 0xFF; + pref[12] = 0xFE; + pref[13] = ea[3]; + pref[14] = ea[4]; + pref[15] = ea[5]; +} diff --git a/src/9vx/a/ip/gre.c b/src/9vx/a/ip/gre.c @@ -0,0 +1,283 @@ +/* + * Generic Routing Encapsulation over IPv4, rfc1702 + */ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +enum +{ + GRE_IPONLY = 12, /* size of ip header */ + GRE_IPPLUSGRE = 12, /* minimum size of GRE header */ + IP_GREPROTO = 47, + + GRErxms = 200, + GREtickms = 100, + GREmaxxmit = 10, +}; + +typedef struct GREhdr +{ + /* ip header */ + uchar vihl; 
/* Version and header length */ + uchar tos; /* Type of service */ + uchar len[2]; /* packet length (including headers) */ + uchar id[2]; /* Identification */ + uchar frag[2]; /* Fragment information */ + uchar Unused; + uchar proto; /* Protocol */ + uchar cksum[2]; /* checksum */ + uchar src[4]; /* Ip source */ + uchar dst[4]; /* Ip destination */ + + /* gre header */ + uchar flags[2]; + uchar eproto[2]; /* encapsulation protocol */ +} GREhdr; + +typedef struct GREpriv GREpriv; +struct GREpriv +{ + int raw; /* Raw GRE mode */ + + /* non-MIB stats */ + ulong csumerr; /* checksum errors */ + ulong lenerr; /* short packet */ +}; + +static void grekick(void *x, Block *bp); + +static char* +greconnect(Conv *c, char **argv, int argc) +{ + Proto *p; + char *err; + Conv *tc, **cp, **ecp; + + err = Fsstdconnect(c, argv, argc); + if(err != nil) + return err; + + /* make sure noone's already connected to this other sys */ + p = c->p; + QLOCK(p); + ecp = &p->conv[p->nc]; + for(cp = p->conv; cp < ecp; cp++){ + tc = *cp; + if(tc == nil) + break; + if(tc == c) + continue; + if(tc->rport == c->rport && ipcmp(tc->raddr, c->raddr) == 0){ + err = "already connected to that addr/proto"; + ipmove(c->laddr, IPnoaddr); + ipmove(c->raddr, IPnoaddr); + break; + } + } + QUNLOCK(p); + + if(err != nil) + return err; + Fsconnected(c, nil); + + return nil; +} + +static void +grecreate(Conv *c) +{ + c->rq = qopen(64*1024, Qmsg, 0, c); + c->wq = qbypass(grekick, c); +} + +static int +grestate(Conv *c, char *state, int n) +{ + USED(c); + return snprint(state, n, "%s\n", "Datagram"); +} + +static char* +greannounce(Conv* _, char** __, int ___) +{ + return "pktifc does not support announce"; +} + +static void +greclose(Conv *c) +{ + qclose(c->rq); + qclose(c->wq); + qclose(c->eq); + ipmove(c->laddr, IPnoaddr); + ipmove(c->raddr, IPnoaddr); + c->lport = 0; + c->rport = 0; +} + +int drop; + +static void +grekick(void *x, Block *bp) +{ + Conv *c = x; + GREhdr *ghp; + uchar laddr[IPaddrlen], 
raddr[IPaddrlen]; + + if(bp == nil) + return; + + /* Make space to fit ip header (gre header already there) */ + bp = padblock(bp, GRE_IPONLY); + if(bp == nil) + return; + + /* make sure the message has a GRE header */ + bp = pullupblock(bp, GRE_IPONLY+GRE_IPPLUSGRE); + if(bp == nil) + return; + + ghp = (GREhdr *)(bp->rp); + ghp->vihl = IP_VER4; + + if(!((GREpriv*)c->p->priv)->raw){ + v4tov6(raddr, ghp->dst); + if(ipcmp(raddr, v4prefix) == 0) + memmove(ghp->dst, c->raddr + IPv4off, IPv4addrlen); + v4tov6(laddr, ghp->src); + if(ipcmp(laddr, v4prefix) == 0){ + if(ipcmp(c->laddr, IPnoaddr) == 0) + findlocalip(c->p->f, c->laddr, raddr); /* pick interface closest to dest */ + memmove(ghp->src, c->laddr + IPv4off, IPv4addrlen); + } + hnputs(ghp->eproto, c->rport); + } + + ghp->proto = IP_GREPROTO; + ghp->frag[0] = 0; + ghp->frag[1] = 0; + + ipoput4(c->p->f, bp, 0, c->ttl, c->tos, nil); +} + +static void +greiput(Proto *gre, Ipifc* __, Block *bp) +{ + int len; + GREhdr *ghp; + Conv *c, **p; + ushort eproto; + uchar raddr[IPaddrlen]; + GREpriv *gpriv; + + gpriv = gre->priv; + ghp = (GREhdr*)(bp->rp); + + v4tov6(raddr, ghp->src); + eproto = nhgets(ghp->eproto); + QLOCK(gre); + + /* Look for a conversation structure for this port and address */ + c = nil; + for(p = gre->conv; *p; p++) { + c = *p; + if(c->inuse == 0) + continue; + if(c->rport == eproto && + (gpriv->raw || ipcmp(c->raddr, raddr) == 0)) + break; + } + + if(*p == nil) { + QUNLOCK(gre); + freeblist(bp); + return; + } + + QUNLOCK(gre); + + /* + * Trim the packet down to data size + */ + len = nhgets(ghp->len) - GRE_IPONLY; + if(len < GRE_IPPLUSGRE){ + freeblist(bp); + return; + } + bp = trimblock(bp, GRE_IPONLY, len); + if(bp == nil){ + gpriv->lenerr++; + return; + } + + /* + * Can't delimit packet so pull it all into one block. 
+ */ + if(qlen(c->rq) > 64*1024) + freeblist(bp); + else{ + bp = concatblock(bp); + if(bp == 0) + panic("greiput"); + qpass(c->rq, bp); + } +} + +int +grestats(Proto *gre, char *buf, int len) +{ + GREpriv *gpriv; + + gpriv = gre->priv; + + return snprint(buf, len, "gre: len %lud\n", gpriv->lenerr); +} + +char* +grectl(Conv *c, char **f, int n) +{ + GREpriv *gpriv; + + gpriv = c->p->priv; + if(n == 1){ + if(strcmp(f[0], "raw") == 0){ + gpriv->raw = 1; + return nil; + } + else if(strcmp(f[0], "cooked") == 0){ + gpriv->raw = 0; + return nil; + } + } + return "unknown control request"; +} + +void +greinit(Fs *fs) +{ + Proto *gre; + + gre = smalloc(sizeof(Proto)); + gre->priv = smalloc(sizeof(GREpriv)); + gre->name = "gre"; + gre->connect = greconnect; + gre->announce = greannounce; + gre->state = grestate; + gre->create = grecreate; + gre->close = greclose; + gre->rcv = greiput; + gre->ctl = grectl; + gre->advise = nil; + gre->stats = grestats; + gre->ipproto = IP_GREPROTO; + gre->nc = 64; + gre->ptclsize = 0; + + Fsproto(fs, gre); +} diff --git a/src/9vx/a/ip/icmp.c b/src/9vx/a/ip/icmp.c @@ -0,0 +1,490 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +typedef struct Icmp { + uchar vihl; /* Version and header length */ + uchar tos; /* Type of service */ + uchar length[2]; /* packet length */ + uchar id[2]; /* Identification */ + uchar frag[2]; /* Fragment information */ + uchar ttl; /* Time to live */ + uchar proto; /* Protocol */ + uchar ipcksum[2]; /* Header checksum */ + uchar src[4]; /* Ip source */ + uchar dst[4]; /* Ip destination */ + uchar type; + uchar code; + uchar cksum[2]; + uchar icmpid[2]; + uchar seq[2]; + uchar data[1]; +} Icmp; + +enum { /* Packet Types */ + EchoReply = 0, + Unreachable = 3, + SrcQuench = 4, + Redirect = 5, + EchoRequest = 8, + TimeExceed = 11, + InParmProblem = 12, + Timestamp = 13, + TimestampReply = 14, + InfoRequest = 15, + InfoReply = 16, + 
AddrMaskRequest = 17, + AddrMaskReply = 18, + + Maxtype = 18, +}; + +enum +{ + MinAdvise = 24, /* minimum needed for us to advise another protocol */ +}; + +char *icmpnames[Maxtype+1] = +{ +[EchoReply] "EchoReply", +[Unreachable] "Unreachable", +[SrcQuench] "SrcQuench", +[Redirect] "Redirect", +[EchoRequest] "EchoRequest", +[TimeExceed] "TimeExceed", +[InParmProblem] "InParmProblem", +[Timestamp] "Timestamp", +[TimestampReply] "TimestampReply", +[InfoRequest] "InfoRequest", +[InfoReply] "InfoReply", +[AddrMaskRequest] "AddrMaskRequest", +[AddrMaskReply ] "AddrMaskReply ", +}; + +enum { + IP_ICMPPROTO = 1, + ICMP_IPSIZE = 20, + ICMP_HDRSIZE = 8, +}; + +enum +{ + InMsgs, + InErrors, + OutMsgs, + CsumErrs, + LenErrs, + HlenErrs, + + Nstats, +}; + +static char *statnames[Nstats] = +{ +[InMsgs] "InMsgs", +[InErrors] "InErrors", +[OutMsgs] "OutMsgs", +[CsumErrs] "CsumErrs", +[LenErrs] "LenErrs", +[HlenErrs] "HlenErrs", +}; + +typedef struct Icmppriv Icmppriv; +struct Icmppriv +{ + ulong stats[Nstats]; + + /* message counts */ + ulong in[Maxtype+1]; + ulong out[Maxtype+1]; +}; + +static void icmpkick(void *x, Block*); + +static void +icmpcreate(Conv *c) +{ + c->rq = qopen(64*1024, Qmsg, 0, c); + c->wq = qbypass(icmpkick, c); +} + +extern char* +icmpconnect(Conv *c, char **argv, int argc) +{ + char *e; + + e = Fsstdconnect(c, argv, argc); + if(e != nil) + return e; + Fsconnected(c, e); + + return nil; +} + +extern int +icmpstate(Conv *c, char *state, int n) +{ + USED(c); + return snprint(state, n, "%s qin %d qout %d\n", + "Datagram", + c->rq ? qlen(c->rq) : 0, + c->wq ? 
qlen(c->wq) : 0 + ); +} + +extern char* +icmpannounce(Conv *c, char **argv, int argc) +{ + char *e; + + e = Fsstdannounce(c, argv, argc); + if(e != nil) + return e; + Fsconnected(c, nil); + + return nil; +} + +extern void +icmpclose(Conv *c) +{ + qclose(c->rq); + qclose(c->wq); + ipmove(c->laddr, IPnoaddr); + ipmove(c->raddr, IPnoaddr); + c->lport = 0; +} + +static void +icmpkick(void *x, Block *bp) +{ + Conv *c = x; + Icmp *p; + Icmppriv *ipriv; + + if(bp == nil) + return; + + if(blocklen(bp) < ICMP_IPSIZE + ICMP_HDRSIZE){ + freeblist(bp); + return; + } + p = (Icmp *)(bp->rp); + p->vihl = IP_VER4; + ipriv = c->p->priv; + if(p->type <= Maxtype) + ipriv->out[p->type]++; + + v6tov4(p->dst, c->raddr); + v6tov4(p->src, c->laddr); + p->proto = IP_ICMPPROTO; + hnputs(p->icmpid, c->lport); + memset(p->cksum, 0, sizeof(p->cksum)); + hnputs(p->cksum, ptclcsum(bp, ICMP_IPSIZE, blocklen(bp) - ICMP_IPSIZE)); + ipriv->stats[OutMsgs]++; + ipoput4(c->p->f, bp, 0, c->ttl, c->tos, nil); +} + +extern void +icmpttlexceeded(Fs *f, uchar *ia, Block *bp) +{ + Block *nbp; + Icmp *p, *np; + + p = (Icmp *)bp->rp; + + netlog(f, Logicmp, "sending icmpttlexceeded -> %V\n", p->src); + nbp = allocb(ICMP_IPSIZE + ICMP_HDRSIZE + ICMP_IPSIZE + 8); + nbp->wp += ICMP_IPSIZE + ICMP_HDRSIZE + ICMP_IPSIZE + 8; + np = (Icmp *)nbp->rp; + np->vihl = IP_VER4; + memmove(np->dst, p->src, sizeof(np->dst)); + v6tov4(np->src, ia); + memmove(np->data, bp->rp, ICMP_IPSIZE + 8); + np->type = TimeExceed; + np->code = 0; + np->proto = IP_ICMPPROTO; + hnputs(np->icmpid, 0); + hnputs(np->seq, 0); + memset(np->cksum, 0, sizeof(np->cksum)); + hnputs(np->cksum, ptclcsum(nbp, ICMP_IPSIZE, blocklen(nbp) - ICMP_IPSIZE)); + ipoput4(f, nbp, 0, MAXTTL, DFLTTOS, nil); + +} + +static void +icmpunreachable(Fs *f, Block *bp, int code, int seq) +{ + Block *nbp; + Icmp *p, *np; + int i; + uchar addr[IPaddrlen]; + + p = (Icmp *)bp->rp; + + /* only do this for unicast sources and destinations */ + v4tov6(addr, p->dst); + i = 
ipforme(f, addr); + if((i&Runi) == 0) + return; + v4tov6(addr, p->src); + i = ipforme(f, addr); + if(i != 0 && (i&Runi) == 0) + return; + + netlog(f, Logicmp, "sending icmpnoconv -> %V\n", p->src); + nbp = allocb(ICMP_IPSIZE + ICMP_HDRSIZE + ICMP_IPSIZE + 8); + nbp->wp += ICMP_IPSIZE + ICMP_HDRSIZE + ICMP_IPSIZE + 8; + np = (Icmp *)nbp->rp; + np->vihl = IP_VER4; + memmove(np->dst, p->src, sizeof(np->dst)); + memmove(np->src, p->dst, sizeof(np->src)); + memmove(np->data, bp->rp, ICMP_IPSIZE + 8); + np->type = Unreachable; + np->code = code; + np->proto = IP_ICMPPROTO; + hnputs(np->icmpid, 0); + hnputs(np->seq, seq); + memset(np->cksum, 0, sizeof(np->cksum)); + hnputs(np->cksum, ptclcsum(nbp, ICMP_IPSIZE, blocklen(nbp) - ICMP_IPSIZE)); + ipoput4(f, nbp, 0, MAXTTL, DFLTTOS, nil); +} + +extern void +icmpnoconv(Fs *f, Block *bp) +{ + icmpunreachable(f, bp, 3, 0); +} + +extern void +icmpcantfrag(Fs *f, Block *bp, int mtu) +{ + icmpunreachable(f, bp, 4, mtu); +} + +static void +goticmpkt(Proto *icmp, Block *bp) +{ + Conv **c, *s; + Icmp *p; + uchar dst[IPaddrlen]; + ushort recid; + + p = (Icmp *) bp->rp; + v4tov6(dst, p->src); + recid = nhgets(p->icmpid); + + for(c = icmp->conv; *c; c++) { + s = *c; + if(s->lport == recid) + if(ipcmp(s->raddr, dst) == 0){ + bp = concatblock(bp); + if(bp != nil) + qpass(s->rq, bp); + return; + } + } + freeblist(bp); +} + +static Block * +mkechoreply(Block *bp) +{ + Icmp *q; + uchar ip[4]; + + q = (Icmp *)bp->rp; + q->vihl = IP_VER4; + memmove(ip, q->src, sizeof(q->dst)); + memmove(q->src, q->dst, sizeof(q->src)); + memmove(q->dst, ip, sizeof(q->dst)); + q->type = EchoReply; + memset(q->cksum, 0, sizeof(q->cksum)); + hnputs(q->cksum, ptclcsum(bp, ICMP_IPSIZE, blocklen(bp) - ICMP_IPSIZE)); + + return bp; +} + +static char *unreachcode[] = +{ +[0] "net unreachable", +[1] "host unreachable", +[2] "protocol unreachable", +[3] "port unreachable", +[4] "fragmentation needed and DF set", +[5] "source route failed", +}; + +static void 
+icmpiput(Proto *icmp, Ipifc* __, Block *bp) +{ + int n, iplen; + Icmp *p; + Block *r; + Proto *pr; + char *msg; + char m2[128]; + Icmppriv *ipriv; + + ipriv = icmp->priv; + + ipriv->stats[InMsgs]++; + + p = (Icmp *)bp->rp; + netlog(icmp->f, Logicmp, "icmpiput %d %d\n", p->type, p->code); + n = blocklen(bp); + if(n < ICMP_IPSIZE+ICMP_HDRSIZE){ + ipriv->stats[InErrors]++; + ipriv->stats[HlenErrs]++; + netlog(icmp->f, Logicmp, "icmp hlen %d\n", n); + goto raise; + } + iplen = nhgets(p->length); + if(iplen > n || ((uint)iplen % 1)){ + ipriv->stats[LenErrs]++; + ipriv->stats[InErrors]++; + netlog(icmp->f, Logicmp, "icmp length %d\n", iplen); + goto raise; + } + if(ptclcsum(bp, ICMP_IPSIZE, iplen - ICMP_IPSIZE)){ + ipriv->stats[InErrors]++; + ipriv->stats[CsumErrs]++; + netlog(icmp->f, Logicmp, "icmp checksum error\n"); + goto raise; + } + if(p->type <= Maxtype) + ipriv->in[p->type]++; + + switch(p->type) { + case EchoRequest: + if (iplen < n) + bp = trimblock(bp, 0, iplen); + r = mkechoreply(bp); + ipriv->out[EchoReply]++; + ipoput4(icmp->f, r, 0, MAXTTL, DFLTTOS, nil); + break; + case Unreachable: + if(p->code > 5) + msg = unreachcode[1]; + else + msg = unreachcode[p->code]; + + bp->rp += ICMP_IPSIZE+ICMP_HDRSIZE; + if(blocklen(bp) < MinAdvise){ + ipriv->stats[LenErrs]++; + goto raise; + } + p = (Icmp *)bp->rp; + pr = Fsrcvpcolx(icmp->f, p->proto); + if(pr != nil && pr->advise != nil) { + (*pr->advise)(pr, bp, msg); + return; + } + + bp->rp -= ICMP_IPSIZE+ICMP_HDRSIZE; + goticmpkt(icmp, bp); + break; + case TimeExceed: + if(p->code == 0){ + sprint(m2, "ttl exceeded at %V", p->src); + + bp->rp += ICMP_IPSIZE+ICMP_HDRSIZE; + if(blocklen(bp) < MinAdvise){ + ipriv->stats[LenErrs]++; + goto raise; + } + p = (Icmp *)bp->rp; + pr = Fsrcvpcolx(icmp->f, p->proto); + if(pr != nil && pr->advise != nil) { + (*pr->advise)(pr, bp, m2); + return; + } + bp->rp -= ICMP_IPSIZE+ICMP_HDRSIZE; + } + + goticmpkt(icmp, bp); + break; + default: + goticmpkt(icmp, bp); + break; + } + return; + 
+raise: + freeblist(bp); +} + +void +icmpadvise(Proto *icmp, Block *bp, char *msg) +{ + Conv **c, *s; + Icmp *p; + uchar dst[IPaddrlen]; + ushort recid; + + p = (Icmp *) bp->rp; + v4tov6(dst, p->dst); + recid = nhgets(p->icmpid); + + for(c = icmp->conv; *c; c++) { + s = *c; + if(s->lport == recid) + if(ipcmp(s->raddr, dst) == 0){ + qhangup(s->rq, msg); + qhangup(s->wq, msg); + break; + } + } + freeblist(bp); +} + +int +icmpstats(Proto *icmp, char *buf, int len) +{ + Icmppriv *priv; + char *p, *e; + int i; + + priv = icmp->priv; + p = buf; + e = p+len; + for(i = 0; i < Nstats; i++) + p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]); + for(i = 0; i <= Maxtype; i++){ + if(icmpnames[i]) + p = seprint(p, e, "%s: %lud %lud\n", icmpnames[i], priv->in[i], priv->out[i]); + else + p = seprint(p, e, "%d: %lud %lud\n", i, priv->in[i], priv->out[i]); + } + return p - buf; +} + +void +icmpinit(Fs *fs) +{ + Proto *icmp; + + icmp = smalloc(sizeof(Proto)); + icmp->priv = smalloc(sizeof(Icmppriv)); + icmp->name = "icmp"; + icmp->connect = icmpconnect; + icmp->announce = icmpannounce; + icmp->state = icmpstate; + icmp->create = icmpcreate; + icmp->close = icmpclose; + icmp->rcv = icmpiput; + icmp->stats = icmpstats; + icmp->ctl = nil; + icmp->advise = icmpadvise; + icmp->gc = nil; + icmp->ipproto = IP_ICMPPROTO; + icmp->nc = 128; + icmp->ptclsize = 0; + + Fsproto(fs, icmp); +} diff --git a/src/9vx/a/ip/icmp6.c b/src/9vx/a/ip/icmp6.c @@ -0,0 +1,946 @@ +/* + * Internet Control Message Protocol for IPv6 + */ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include "ip.h" +#include "ipv6.h" + +enum +{ + InMsgs6, + InErrors6, + OutMsgs6, + CsumErrs6, + LenErrs6, + HlenErrs6, + HoplimErrs6, + IcmpCodeErrs6, + TargetErrs6, + OptlenErrs6, + AddrmxpErrs6, + RouterAddrErrs6, + + Nstats6, +}; + +enum { + ICMP_USEAD6 = 40, +}; + +enum { + Oflag = 1<<5, + Sflag = 1<<6, + Rflag = 1<<7, +}; + +enum { + /* ICMPv6 types */ + 
EchoReply = 0, + UnreachableV6 = 1, + PacketTooBigV6 = 2, + TimeExceedV6 = 3, + SrcQuench = 4, + ParamProblemV6 = 4, + Redirect = 5, + EchoRequest = 8, + TimeExceed = 11, + InParmProblem = 12, + Timestamp = 13, + TimestampReply = 14, + InfoRequest = 15, + InfoReply = 16, + AddrMaskRequest = 17, + AddrMaskReply = 18, + EchoRequestV6 = 128, + EchoReplyV6 = 129, + RouterSolicit = 133, + RouterAdvert = 134, + NbrSolicit = 135, + NbrAdvert = 136, + RedirectV6 = 137, + + Maxtype6 = 137, +}; + +typedef struct ICMPpkt ICMPpkt; +typedef struct IPICMP IPICMP; +typedef struct Ndpkt Ndpkt; +typedef struct NdiscC NdiscC; + +struct ICMPpkt { + uchar type; + uchar code; + uchar cksum[2]; + uchar icmpid[2]; + uchar seq[2]; +}; + +struct IPICMP { + /* Ip6hdr; */ + uchar vcf[4]; /* version:4, traffic class:8, flow label:20 */ + uchar ploadlen[2]; /* payload length: packet length - 40 */ + uchar proto; /* next header type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; + + /* ICMPpkt; */ + uchar type; + uchar code; + uchar cksum[2]; + uchar icmpid[2]; + uchar seq[2]; +}; + +struct NdiscC +{ + /* IPICMP; */ + /* Ip6hdr; */ + uchar vcf[4]; /* version:4, traffic class:8, flow label:20 */ + uchar ploadlen[2]; /* payload length: packet length - 40 */ + uchar proto; /* next header type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; + + /* ICMPpkt; */ + uchar type; + uchar code; + uchar cksum[2]; + uchar icmpid[2]; + uchar seq[2]; + + uchar target[IPaddrlen]; +}; + +struct Ndpkt +{ + /* NdiscC; */ + /* IPICMP; */ + /* Ip6hdr; */ + uchar vcf[4]; /* version:4, traffic class:8, flow label:20 */ + uchar ploadlen[2]; /* payload length: packet length - 40 */ + uchar proto; /* next header type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; + + /* ICMPpkt; */ + uchar type; + uchar code; + uchar cksum[2]; + uchar icmpid[2]; + uchar seq[2]; + + uchar target[IPaddrlen]; + + uchar otype; + uchar olen; /* 
length in units of 8 octets(incl type, code), + * 1 for IEEE 802 addresses */ + uchar lnaddr[6]; /* link-layer address */ +}; + +typedef struct Icmppriv6 +{ + ulong stats[Nstats6]; + + /* message counts */ + ulong in[Maxtype6+1]; + ulong out[Maxtype6+1]; +} Icmppriv6; + +typedef struct Icmpcb6 +{ + QLock qlock; + uchar headers; +} Icmpcb6; + +char *icmpnames6[Maxtype6+1] = +{ +[EchoReply] "EchoReply", +[UnreachableV6] "UnreachableV6", +[PacketTooBigV6] "PacketTooBigV6", +[TimeExceedV6] "TimeExceedV6", +[SrcQuench] "SrcQuench", +[Redirect] "Redirect", +[EchoRequest] "EchoRequest", +[TimeExceed] "TimeExceed", +[InParmProblem] "InParmProblem", +[Timestamp] "Timestamp", +[TimestampReply] "TimestampReply", +[InfoRequest] "InfoRequest", +[InfoReply] "InfoReply", +[AddrMaskRequest] "AddrMaskRequest", +[AddrMaskReply] "AddrMaskReply", +[EchoRequestV6] "EchoRequestV6", +[EchoReplyV6] "EchoReplyV6", +[RouterSolicit] "RouterSolicit", +[RouterAdvert] "RouterAdvert", +[NbrSolicit] "NbrSolicit", +[NbrAdvert] "NbrAdvert", +[RedirectV6] "RedirectV6", +}; + +static char *statnames6[Nstats6] = +{ +[InMsgs6] "InMsgs", +[InErrors6] "InErrors", +[OutMsgs6] "OutMsgs", +[CsumErrs6] "CsumErrs", +[LenErrs6] "LenErrs", +[HlenErrs6] "HlenErrs", +[HoplimErrs6] "HoplimErrs", +[IcmpCodeErrs6] "IcmpCodeErrs", +[TargetErrs6] "TargetErrs", +[OptlenErrs6] "OptlenErrs", +[AddrmxpErrs6] "AddrmxpErrs", +[RouterAddrErrs6] "RouterAddrErrs", +}; + +static char *unreachcode[] = +{ +[Icmp6_no_route] "no route to destination", +[Icmp6_ad_prohib] "comm with destination administratively prohibited", +[Icmp6_out_src_scope] "beyond scope of source address", +[Icmp6_adr_unreach] "address unreachable", +[Icmp6_port_unreach] "port unreachable", +[Icmp6_gress_src_fail] "source address failed ingress/egress policy", +[Icmp6_rej_route] "reject route to destination", +[Icmp6_unknown] "icmp unreachable: unknown code", +}; + +static void icmpkick6(void *x, Block *bp); + +static void +icmpcreate6(Conv *c) +{ + c->rq = 
qopen(64*1024, Qmsg, 0, c); + c->wq = qbypass(icmpkick6, c); +} + +static void +set_cksum(Block *bp) +{ + IPICMP *p = (IPICMP *)(bp->rp); + + hnputl(p->vcf, 0); /* borrow IP header as pseudoheader */ + hnputs(p->ploadlen, blocklen(bp) - IP6HDR); + p->proto = 0; + p->ttl = ICMPv6; /* ttl gets set later */ + hnputs(p->cksum, 0); + hnputs(p->cksum, ptclcsum(bp, 0, blocklen(bp))); + p->proto = ICMPv6; +} + +static Block * +newIPICMP(int packetlen) +{ + Block *nbp; + + nbp = allocb(packetlen); + nbp->wp += packetlen; + memset(nbp->rp, 0, packetlen); + return nbp; +} + +void +icmpadvise6(Proto *icmp, Block *bp, char *msg) +{ + ushort recid; + Conv **c, *s; + IPICMP *p; + + p = (IPICMP *)bp->rp; + recid = nhgets(p->icmpid); + + for(c = icmp->conv; *c; c++) { + s = *c; + if(s->lport == recid && ipcmp(s->raddr, p->dst) == 0){ + qhangup(s->rq, msg); + qhangup(s->wq, msg); + break; + } + } + freeblist(bp); +} + +static void +icmpkick6(void *x, Block *bp) +{ + uchar laddr[IPaddrlen], raddr[IPaddrlen]; + Conv *c = x; + IPICMP *p; + Icmppriv6 *ipriv = c->p->priv; + Icmpcb6 *icb = (Icmpcb6*)c->ptcl; + + if(bp == nil) + return; + + if(icb->headers==6) { + /* get user specified addresses */ + bp = pullupblock(bp, ICMP_USEAD6); + if(bp == nil) + return; + bp->rp += 8; + ipmove(laddr, bp->rp); + bp->rp += IPaddrlen; + ipmove(raddr, bp->rp); + bp->rp += IPaddrlen; + bp = padblock(bp, sizeof(Ip6hdr)); + } + + if(blocklen(bp) < sizeof(IPICMP)){ + freeblist(bp); + return; + } + p = (IPICMP *)(bp->rp); + if(icb->headers == 6) { + ipmove(p->dst, raddr); + ipmove(p->src, laddr); + } else { + ipmove(p->dst, c->raddr); + ipmove(p->src, c->laddr); + hnputs(p->icmpid, c->lport); + } + + set_cksum(bp); + p->vcf[0] = 0x06 << 4; + if(p->type <= Maxtype6) + ipriv->out[p->type]++; + ipoput6(c->p->f, bp, 0, c->ttl, c->tos, nil); +} + +char* +icmpctl6(Conv *c, char **argv, int argc) +{ + Icmpcb6 *icb; + + icb = (Icmpcb6*) c->ptcl; + if(argc==1 && strcmp(argv[0], "headers")==0) { + icb->headers = 6; + 
return nil; + } + return "unknown control request"; +} + +static void +goticmpkt6(Proto *icmp, Block *bp, int muxkey) +{ + ushort recid; + uchar *addr; + Conv **c, *s; + IPICMP *p = (IPICMP *)bp->rp; + + if(muxkey == 0) { + recid = nhgets(p->icmpid); + addr = p->src; + } else { + recid = muxkey; + addr = p->dst; + } + + for(c = icmp->conv; *c; c++){ + s = *c; + if(s->lport == recid && ipcmp(s->raddr, addr) == 0){ + bp = concatblock(bp); + if(bp != nil) + qpass(s->rq, bp); + return; + } + } + + freeblist(bp); +} + +static Block * +mkechoreply6(Block *bp, Ipifc *ifc) +{ + uchar addr[IPaddrlen]; + IPICMP *p = (IPICMP *)(bp->rp); + + ipmove(addr, p->src); + if(!isv6mcast(p->dst)) + ipmove(p->src, p->dst); + else if (!ipv6anylocal(ifc, p->src)) + return nil; + ipmove(p->dst, addr); + p->type = EchoReplyV6; + set_cksum(bp); + return bp; +} + +/* + * sends out an ICMPv6 neighbor solicitation + * suni == SRC_UNSPEC or SRC_UNI, + * tuni == TARG_MULTI => multicast for address resolution, + * and tuni == TARG_UNI => neighbor reachability. 
+ */ +extern void +icmpns(Fs *f, uchar* src, int suni, uchar* targ, int tuni, uchar* mac) +{ + Block *nbp; + Ndpkt *np; + Proto *icmp = f->t2p[ICMPv6]; + Icmppriv6 *ipriv = icmp->priv; + + nbp = newIPICMP(sizeof(Ndpkt)); + np = (Ndpkt*) nbp->rp; + + if(suni == SRC_UNSPEC) + memmove(np->src, v6Unspecified, IPaddrlen); + else + memmove(np->src, src, IPaddrlen); + + if(tuni == TARG_UNI) + memmove(np->dst, targ, IPaddrlen); + else + ipv62smcast(np->dst, targ); + + np->type = NbrSolicit; + np->code = 0; + memmove(np->target, targ, IPaddrlen); + if(suni != SRC_UNSPEC) { + np->otype = SRC_LLADDR; + np->olen = 1; /* 1+1+6 = 8 = 1 8-octet */ + memmove(np->lnaddr, mac, sizeof(np->lnaddr)); + } else + nbp->wp -= sizeof(Ndpkt) - sizeof(NdiscC); + + set_cksum(nbp); + np = (Ndpkt*)nbp->rp; + np->ttl = HOP_LIMIT; + np->vcf[0] = 0x06 << 4; + ipriv->out[NbrSolicit]++; + netlog(f, Logicmp, "sending neighbor solicitation %I\n", targ); + ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil); +} + +/* + * sends out an ICMPv6 neighbor advertisement. pktflags == RSO flags. 
+ */ +extern void +icmpna(Fs *f, uchar* src, uchar* dst, uchar* targ, uchar* mac, uchar flags) +{ + Block *nbp; + Ndpkt *np; + Proto *icmp = f->t2p[ICMPv6]; + Icmppriv6 *ipriv = icmp->priv; + + nbp = newIPICMP(sizeof(Ndpkt)); + np = (Ndpkt*)nbp->rp; + + memmove(np->src, src, IPaddrlen); + memmove(np->dst, dst, IPaddrlen); + + np->type = NbrAdvert; + np->code = 0; + np->icmpid[0] = flags; + memmove(np->target, targ, IPaddrlen); + + np->otype = TARGET_LLADDR; + np->olen = 1; + memmove(np->lnaddr, mac, sizeof(np->lnaddr)); + + set_cksum(nbp); + np = (Ndpkt*) nbp->rp; + np->ttl = HOP_LIMIT; + np->vcf[0] = 0x06 << 4; + ipriv->out[NbrAdvert]++; + netlog(f, Logicmp, "sending neighbor advertisement %I\n", src); + ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil); +} + +extern void +icmphostunr(Fs *f, Ipifc *ifc, Block *bp, int code, int free) +{ + int osz = BLEN(bp); + int sz = MIN(sizeof(IPICMP) + osz, v6MINTU); + Block *nbp; + IPICMP *np; + Ip6hdr *p; + Proto *icmp = f->t2p[ICMPv6]; + Icmppriv6 *ipriv = icmp->priv; + + p = (Ip6hdr *)bp->rp; + + if(isv6mcast(p->src)) + goto clean; + + nbp = newIPICMP(sz); + np = (IPICMP *)nbp->rp; + + RLOCK(ifc); + if(ipv6anylocal(ifc, np->src)) + netlog(f, Logicmp, "send icmphostunr -> s%I d%I\n", + p->src, p->dst); + else { + netlog(f, Logicmp, "icmphostunr fail -> s%I d%I\n", + p->src, p->dst); + freeblist(nbp); + if(free) + goto clean; + else + return; + } + + memmove(np->dst, p->src, IPaddrlen); + np->type = UnreachableV6; + np->code = code; + memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP)); + set_cksum(nbp); + np->ttl = HOP_LIMIT; + np->vcf[0] = 0x06 << 4; + ipriv->out[UnreachableV6]++; + + if(free) + ipiput6(f, ifc, nbp); + else { + ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil); + return; + } + +clean: + RUNLOCK(ifc); + freeblist(bp); +} + +extern void +icmpttlexceeded6(Fs *f, Ipifc *ifc, Block *bp) +{ + int osz = BLEN(bp); + int sz = MIN(sizeof(IPICMP) + osz, v6MINTU); + Block *nbp; + IPICMP *np; + Ip6hdr *p; + Proto *icmp = 
f->t2p[ICMPv6]; + Icmppriv6 *ipriv = icmp->priv; + + p = (Ip6hdr *)bp->rp; + + if(isv6mcast(p->src)) + return; + + nbp = newIPICMP(sz); + np = (IPICMP *) nbp->rp; + + if(ipv6anylocal(ifc, np->src)) + netlog(f, Logicmp, "send icmpttlexceeded6 -> s%I d%I\n", + p->src, p->dst); + else { + netlog(f, Logicmp, "icmpttlexceeded6 fail -> s%I d%I\n", + p->src, p->dst); + return; + } + + memmove(np->dst, p->src, IPaddrlen); + np->type = TimeExceedV6; + np->code = 0; + memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP)); + set_cksum(nbp); + np->ttl = HOP_LIMIT; + np->vcf[0] = 0x06 << 4; + ipriv->out[TimeExceedV6]++; + ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil); +} + +extern void +icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp) +{ + int osz = BLEN(bp); + int sz = MIN(sizeof(IPICMP) + osz, v6MINTU); + Block *nbp; + IPICMP *np; + Ip6hdr *p; + Proto *icmp = f->t2p[ICMPv6]; + Icmppriv6 *ipriv = icmp->priv; + + p = (Ip6hdr *)bp->rp; + + if(isv6mcast(p->src)) + return; + + nbp = newIPICMP(sz); + np = (IPICMP *)nbp->rp; + + if(ipv6anylocal(ifc, np->src)) + netlog(f, Logicmp, "send icmppkttoobig6 -> s%I d%I\n", + p->src, p->dst); + else { + netlog(f, Logicmp, "icmppkttoobig6 fail -> s%I d%I\n", + p->src, p->dst); + return; + } + + memmove(np->dst, p->src, IPaddrlen); + np->type = PacketTooBigV6; + np->code = 0; + hnputl(np->icmpid, ifc->maxtu - ifc->m->hsize); + memmove(nbp->rp + sizeof(IPICMP), bp->rp, sz - sizeof(IPICMP)); + set_cksum(nbp); + np->ttl = HOP_LIMIT; + np->vcf[0] = 0x06 << 4; + ipriv->out[PacketTooBigV6]++; + ipoput6(f, nbp, 0, MAXTTL, DFLTTOS, nil); +} + +/* + * RFC 2461, pages 39-40, pages 57-58. 
+ */ +static int +valid(Proto *icmp, Ipifc *ifc, Block *bp, Icmppriv6 *ipriv) +{ + int sz, osz, unsp, n, ttl, iplen; + int pktsz = BLEN(bp); + uchar *packet = bp->rp; + IPICMP *p = (IPICMP *) packet; + Ndpkt *np; + + USED(ifc); + n = blocklen(bp); + if(n < sizeof(IPICMP)) { + ipriv->stats[HlenErrs6]++; + netlog(icmp->f, Logicmp, "icmp hlen %d\n", n); + goto err; + } + + iplen = nhgets(p->ploadlen); + if(iplen > n - IP6HDR || ((uint)iplen % 1) != 0) { + ipriv->stats[LenErrs6]++; + netlog(icmp->f, Logicmp, "icmp length %d\n", iplen); + goto err; + } + + /* Rather than construct explicit pseudoheader, overwrite IPv6 header */ + if(p->proto != ICMPv6) { + /* This code assumes no extension headers!!! */ + netlog(icmp->f, Logicmp, "icmp error: extension header\n"); + goto err; + } + memset(packet, 0, 4); + ttl = p->ttl; + p->ttl = p->proto; + p->proto = 0; + if(ptclcsum(bp, 0, iplen + IP6HDR)) { + ipriv->stats[CsumErrs6]++; + netlog(icmp->f, Logicmp, "icmp checksum error\n"); + goto err; + } + p->proto = p->ttl; + p->ttl = ttl; + + /* additional tests for some pkt types */ + if (p->type == NbrSolicit || p->type == NbrAdvert || + p->type == RouterAdvert || p->type == RouterSolicit || + p->type == RedirectV6) { + if(p->ttl != HOP_LIMIT) { + ipriv->stats[HoplimErrs6]++; + goto err; + } + if(p->code != 0) { + ipriv->stats[IcmpCodeErrs6]++; + goto err; + } + + switch (p->type) { + case NbrSolicit: + case NbrAdvert: + np = (Ndpkt*) p; + if(isv6mcast(np->target)) { + ipriv->stats[TargetErrs6]++; + goto err; + } + if(optexsts(np) && np->olen == 0) { + ipriv->stats[OptlenErrs6]++; + goto err; + } + + if (p->type == NbrSolicit && + ipcmp(np->src, v6Unspecified) == 0) + if(!issmcast(np->dst) || optexsts(np)) { + ipriv->stats[AddrmxpErrs6]++; + goto err; + } + + if(p->type == NbrAdvert) + if(isv6mcast(np->dst) && + (nhgets(np->icmpid) & Sflag)){ + ipriv->stats[AddrmxpErrs6]++; + goto err; + } + break; + + case RouterAdvert: + if(pktsz - sizeof(Ip6hdr) < 16) { + 
ipriv->stats[HlenErrs6]++; + goto err; + } + if(!islinklocal(p->src)) { + ipriv->stats[RouterAddrErrs6]++; + goto err; + } + sz = sizeof(IPICMP) + 8; + while (sz+1 < pktsz) { + osz = packet[sz+1]; + if(osz <= 0) { + ipriv->stats[OptlenErrs6]++; + goto err; + } + sz += 8*osz; + } + break; + + case RouterSolicit: + if(pktsz - sizeof(Ip6hdr) < 8) { + ipriv->stats[HlenErrs6]++; + goto err; + } + unsp = (ipcmp(p->src, v6Unspecified) == 0); + sz = sizeof(IPICMP) + 8; + while (sz+1 < pktsz) { + osz = packet[sz+1]; + if(osz <= 0 || + (unsp && packet[sz] == SRC_LLADDR)) { + ipriv->stats[OptlenErrs6]++; + goto err; + } + sz += 8*osz; + } + break; + + case RedirectV6: + /* to be filled in */ + break; + + default: + goto err; + } + } + return 1; +err: + ipriv->stats[InErrors6]++; + return 0; +} + +static int +targettype(Fs *f, Ipifc *ifc, uchar *target) +{ + Iplifc *lifc; + int t; + + RLOCK(ifc); + if(ipproxyifc(f, ifc, target)) { + RUNLOCK(ifc); + return Tuniproxy; + } + + for(lifc = ifc->lifc; lifc; lifc = lifc->next) + if(ipcmp(lifc->local, target) == 0) { + t = (lifc->tentative)? 
Tunitent: Tunirany; + RUNLOCK(ifc); + return t; + } + + RUNLOCK(ifc); + return 0; +} + +static void +icmpiput6(Proto *icmp, Ipifc *ipifc, Block *bp) +{ + int refresh = 1; + char *msg, m2[128]; + uchar pktflags; + uchar *packet = bp->rp; + uchar lsrc[IPaddrlen]; + Block *r; + IPICMP *p = (IPICMP *)packet; + Icmppriv6 *ipriv = icmp->priv; + Iplifc *lifc; + Ndpkt* np; + Proto *pr; + + if(!valid(icmp, ipifc, bp, ipriv) || p->type > Maxtype6) + goto raise; + + ipriv->in[p->type]++; + + switch(p->type) { + case EchoRequestV6: + r = mkechoreply6(bp, ipifc); + if(r == nil) + goto raise; + ipriv->out[EchoReply]++; + ipoput6(icmp->f, r, 0, MAXTTL, DFLTTOS, nil); + break; + + case UnreachableV6: + if(p->code >= nelem(unreachcode)) + msg = unreachcode[Icmp6_unknown]; + else + msg = unreachcode[p->code]; + + bp->rp += sizeof(IPICMP); + if(blocklen(bp) < 8){ + ipriv->stats[LenErrs6]++; + goto raise; + } + p = (IPICMP *)bp->rp; + pr = Fsrcvpcolx(icmp->f, p->proto); + if(pr != nil && pr->advise != nil) { + (*pr->advise)(pr, bp, msg); + return; + } + + bp->rp -= sizeof(IPICMP); + goticmpkt6(icmp, bp, 0); + break; + + case TimeExceedV6: + if(p->code == 0){ + sprint(m2, "ttl exceeded at %I", p->src); + + bp->rp += sizeof(IPICMP); + if(blocklen(bp) < 8){ + ipriv->stats[LenErrs6]++; + goto raise; + } + p = (IPICMP *)bp->rp; + pr = Fsrcvpcolx(icmp->f, p->proto); + if(pr && pr->advise) { + (*pr->advise)(pr, bp, m2); + return; + } + bp->rp -= sizeof(IPICMP); + } + + goticmpkt6(icmp, bp, 0); + break; + + case RouterAdvert: + case RouterSolicit: + /* using lsrc as a temp, munge hdr for goticmp6 */ + if (0) { + memmove(lsrc, p->src, IPaddrlen); + memmove(p->src, p->dst, IPaddrlen); + memmove(p->dst, lsrc, IPaddrlen); + } + goticmpkt6(icmp, bp, p->type); + break; + + case NbrSolicit: + np = (Ndpkt*) p; + pktflags = 0; + switch (targettype(icmp->f, ipifc, np->target)) { + case Tunirany: + pktflags |= Oflag; + /* fall through */ + + case Tuniproxy: + if(ipcmp(np->src, v6Unspecified) != 0) { + 
arpenter(icmp->f, V6, np->src, np->lnaddr, + 8*np->olen-2, 0); + pktflags |= Sflag; + } + if(ipv6local(ipifc, lsrc)) + icmpna(icmp->f, lsrc, + (ipcmp(np->src, v6Unspecified) == 0? + v6allnodesL: np->src), + np->target, ipifc->mac, pktflags); + else + freeblist(bp); + break; + + case Tunitent: + /* not clear what needs to be done. send up + * an icmp mesg saying don't use this address? */ + default: + freeblist(bp); + } + break; + + case NbrAdvert: + np = (Ndpkt*) p; + + /* + * if the target address matches one of the local interface + * addresses and the local interface address has tentative bit + * set, insert into ARP table. this is so the duplicate address + * detection part of ipconfig can discover duplication through + * the arp table. + */ + lifc = iplocalonifc(ipifc, np->target); + if(lifc && lifc->tentative) + refresh = 0; + arpenter(icmp->f, V6, np->target, np->lnaddr, 8*np->olen-2, + refresh); + freeblist(bp); + break; + + case PacketTooBigV6: + default: + goticmpkt6(icmp, bp, 0); + break; + } + return; +raise: + freeblist(bp); +} + +int +icmpstats6(Proto *icmp6, char *buf, int len) +{ + Icmppriv6 *priv; + char *p, *e; + int i; + + priv = icmp6->priv; + p = buf; + e = p+len; + for(i = 0; i < Nstats6; i++) + p = seprint(p, e, "%s: %lud\n", statnames6[i], priv->stats[i]); + for(i = 0; i <= Maxtype6; i++) + if(icmpnames6[i]) + p = seprint(p, e, "%s: %lud %lud\n", icmpnames6[i], + priv->in[i], priv->out[i]); +/* else + p = seprint(p, e, "%d: %lud %lud\n", i, priv->in[i], + priv->out[i]); + */ + return p - buf; +} + + +/* import from icmp.c */ +extern int icmpstate(Conv *c, char *state, int n); +extern char* icmpannounce(Conv *c, char **argv, int argc); +extern char* icmpconnect(Conv *c, char **argv, int argc); +extern void icmpclose(Conv *c); + +void +icmp6init(Fs *fs) +{ + Proto *icmp6 = smalloc(sizeof(Proto)); + + icmp6->priv = smalloc(sizeof(Icmppriv6)); + icmp6->name = "icmpv6"; + icmp6->connect = icmpconnect; + icmp6->announce = icmpannounce; + 
icmp6->state = icmpstate; + icmp6->create = icmpcreate6; + icmp6->close = icmpclose; + icmp6->rcv = icmpiput6; + icmp6->stats = icmpstats6; + icmp6->ctl = icmpctl6; + icmp6->advise = icmpadvise6; + icmp6->gc = nil; + icmp6->ipproto = ICMPv6; + icmp6->nc = 16; + icmp6->ptclsize = sizeof(Icmpcb6); + + Fsproto(fs, icmp6); +} diff --git a/src/9vx/a/ip/igmp.c b/src/9vx/a/ip/igmp.c @@ -0,0 +1,294 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +enum +{ + IGMP_IPHDRSIZE = 20, /* size of ip header */ + IGMP_HDRSIZE = 8, /* size of IGMP header */ + IP_IGMPPROTO = 2, + + IGMPquery = 1, + IGMPreport = 2, + + MSPTICK = 100, + MAXTIMEOUT = 10000/MSPTICK, /* at most 10 secs for a response */ +}; + +typedef struct IGMPpkt IGMPpkt; +typedef char byte; + +struct IGMPpkt +{ + /* ip header */ + byte vihl; /* Version and header length */ + byte tos; /* Type of service */ + byte len[2]; /* packet length (including headers) */ + byte id[2]; /* Identification */ + byte frag[2]; /* Fragment information */ + byte Unused; + byte proto; /* Protocol */ + byte cksum[2]; /* checksum of ip portion */ + byte src[IPaddrlen]; /* Ip source */ + byte dst[IPaddrlen]; /* Ip destination */ + + /* igmp header */ + byte vertype; /* version and type */ + byte unused; + byte igmpcksum[2]; /* checksum of igmp portion */ + byte group[IPaddrlen]; /* multicast group */ +}; + +/* + * lists for group reports + */ +typedef struct IGMPrep IGMPrep; +struct IGMPrep +{ + IGMPrep *next; + Media *m; + int ticks; + Multicast *multi; +}; + +typedef struct IGMP IGMP; +struct IGMP +{ + Lock lk; + + Rendez r; + IGMPrep *reports; +}; + +IGMP igmpalloc; + + Proto igmp; +extern Fs fs; + +static struct Stats +{ + ulong inqueries; + ulong outqueries; + ulong inreports; + ulong outreports; +} stats; + +void +igmpsendreport(Media *m, byte *addr) +{ + IGMPpkt *p; + Block *bp; + + bp = allocb(sizeof(IGMPpkt)); + if(bp == nil) + return; + p = 
(IGMPpkt*)bp->wp; + p->vihl = IP_VER4; + bp->wp += sizeof(IGMPpkt); + memset(bp->rp, 0, sizeof(IGMPpkt)); + hnputl(p->src, Mediagetaddr(m)); + hnputl(p->dst, Ipallsys); + p->vertype = (1<<4) | IGMPreport; + p->proto = IP_IGMPPROTO; + memmove(p->group, addr, IPaddrlen); + hnputs(p->igmpcksum, ptclcsum(bp, IGMP_IPHDRSIZE, IGMP_HDRSIZE)); + netlog(Logigmp, "igmpreport %I\n", p->group); + stats.outreports++; + ipoput4(bp, 0, 1, DFLTTOS, nil); /* TTL of 1 */ +} + +static int +isreport(void *a) +{ + USED(a); + return igmpalloc.reports != 0; +} + + +void +igmpproc(void *a) +{ + IGMPrep *rp, **lrp; + Multicast *mp, **lmp; + byte ip[IPaddrlen]; + + USED(a); + + for(;;){ + sleep(&igmpalloc.r, isreport, 0); + for(;;){ + lock(&igmpalloc); + + if(igmpalloc.reports == nil) + break; + + /* look for a single report */ + lrp = &igmpalloc.reports; + mp = nil; + for(rp = *lrp; rp; rp = *lrp){ + rp->ticks++; + lmp = &rp->multi; + for(mp = *lmp; mp; mp = *lmp){ + if(rp->ticks >= mp->timeout){ + *lmp = mp->next; + break; + } + lmp = &mp->next; + } + if(mp != nil) + break; + + if(rp->multi != nil){ + lrp = &rp->next; + continue; + } else { + *lrp = rp->next; + free(rp); + } + } + unlock(&igmpalloc); + + if(mp){ + /* do a single report and try again */ + hnputl(ip, mp->addr); + igmpsendreport(rp->m, ip); + free(mp); + continue; + } + + tsleep(&up->sleep, return0, 0, MSPTICK); + } + unlock(&igmpalloc); + } + +} + +void +igmpiput(Media *m, Ipifc *, Block *bp) +{ + int n; + IGMPpkt *ghp; + Ipaddr group; + IGMPrep *rp, **lrp; + Multicast *mp, **lmp; + + ghp = (IGMPpkt*)(bp->rp); + netlog(Logigmp, "igmpiput: %d %I\n", ghp->vertype, ghp->group); + + n = blocklen(bp); + if(n < IGMP_IPHDRSIZE+IGMP_HDRSIZE){ + netlog(Logigmp, "igmpiput: bad len\n"); + goto error; + } + if((ghp->vertype>>4) != 1){ + netlog(Logigmp, "igmpiput: bad igmp type\n"); + goto error; + } + if(ptclcsum(bp, IGMP_IPHDRSIZE, IGMP_HDRSIZE)){ + netlog(Logigmp, "igmpiput: checksum error %I\n", ghp->src); + goto error; + } + + 
group = nhgetl(ghp->group); + + lock(&igmpalloc); + switch(ghp->vertype & 0xf){ + case IGMPquery: + /* + * start reporting groups that we're a member of. + */ + stats.inqueries++; + for(rp = igmpalloc.reports; rp; rp = rp->next) + if(rp->m == m) + break; + if(rp != nil) + break; /* already reporting */ + + mp = Mediacopymulti(m); + if(mp == nil) + break; + + rp = malloc(sizeof(*rp)); + if(rp == nil) + break; + + rp->m = m; + rp->multi = mp; + rp->ticks = 0; + for(; mp; mp = mp->next) + mp->timeout = nrand(MAXTIMEOUT); + rp->next = igmpalloc.reports; + igmpalloc.reports = rp; + + wakeup(&igmpalloc.r); + + break; + case IGMPreport: + /* + * find report list for this medium + */ + stats.inreports++; + lrp = &igmpalloc.reports; + for(rp = *lrp; rp; rp = *lrp){ + if(rp->m == m) + break; + lrp = &rp->next; + } + if(rp == nil) + break; + + /* + * if someone else has reported a group, + * we don't have to. + */ + lmp = &rp->multi; + for(mp = *lmp; mp; mp = *lmp){ + if(mp->addr == group){ + *lmp = mp->next; + free(mp); + break; + } + lmp = &mp->next; + } + + break; + } + unlock(&igmpalloc); + +error: + freeb(bp); +} + +int +igmpstats(char *buf, int len) +{ + return snprint(buf, len, "\trcvd %d %d\n\tsent %d %d\n", + stats.inqueries, stats.inreports, + stats.outqueries, stats.outreports); +} + +void +igmpinit(Fs *fs) +{ + igmp.name = "igmp"; + igmp.connect = nil; + igmp.announce = nil; + igmp.ctl = nil; + igmp.state = nil; + igmp.close = nil; + igmp.rcv = igmpiput; + igmp.stats = igmpstats; + igmp.ipproto = IP_IGMPPROTO; + igmp.nc = 0; + igmp.ptclsize = 0; + + igmpreportfn = igmpsendreport; + kproc("igmpproc", igmpproc, 0); + + Fsproto(fs, &igmp); +} diff --git a/src/9vx/a/ip/il.c b/src/9vx/a/ip/il.c @@ -0,0 +1,1408 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +enum /* Connection state */ +{ + Ilclosed, + Ilsyncer, + Ilsyncee, + Ilestablished, + Illistening, + Ilclosing, + Ilopening, /* 
only for file server */ +}; + +char *ilstates[] = +{ + "Closed", + "Syncer", + "Syncee", + "Established", + "Listen", + "Closing", + "Opening", /* only for file server */ +}; + +enum /* Packet types */ +{ + Ilsync, + Ildata, + Ildataquery, + Ilack, + Ilquery, + Ilstate, + Ilclose, +}; + +char *iltype[] = +{ + "sync", + "data", + "dataquery", + "ack", + "query", + "state", + "close" +}; + +enum +{ + Seconds = 1000, + Iltickms = 50, /* time base */ + AckDelay = 2*Iltickms, /* max time twixt message rcvd & ack sent */ + MaxTimeout = 30*Seconds, /* max time between rexmit */ + QueryTime = 10*Seconds, /* time between subsequent queries */ + DeathTime = 30*QueryTime, + + MaxRexmit = 16, /* max retransmissions before hangup */ + Defaultwin = 20, + + LogAGain = 3, + AGain = 1<<LogAGain, + LogDGain = 2, + DGain = 1<<LogDGain, + + DefByteRate = 100, /* assume a megabit link */ + DefRtt = 50, /* cross country on a great day */ + + Maxrq = 64*1024, +}; + +enum +{ + Nqt= 8, +}; + +typedef struct Ilcb Ilcb; +struct Ilcb /* Control block */ +{ + int state; /* Connection state */ + Conv *conv; + QLock ackq; /* Unacknowledged queue */ + Block *unacked; + Block *unackedtail; + ulong unackedbytes; + QLock outo; /* Out of order packet queue */ + Block *outoforder; + ulong next; /* Id of next to send */ + ulong recvd; /* Last packet received */ + ulong acksent; /* Last packet acked */ + ulong start; /* Local start id */ + ulong rstart; /* Remote start id */ + int window; /* Maximum receive window */ + int rxquery; /* number of queries on this connection */ + int rxtot; /* number of retransmits on this connection */ + int rexmit; /* number of retransmits of *unacked */ + ulong qt[Nqt+1]; /* state table for query messages */ + int qtx; /* ... 
index into qt */ + + /* if set, fasttimeout causes a connection request to terminate after 4*Iltickms */ + int fasttimeout; + + /* timers */ + ulong lastxmit; /* time of last xmit */ + ulong lastrecv; /* time of last recv */ + ulong timeout; /* retransmission time for *unacked */ + ulong acktime; /* time to send next ack */ + ulong querytime; /* time to send next query */ + + /* adaptive measurements */ + int delay; /* Average of the fixed rtt delay */ + int rate; /* Average uchar rate */ + int mdev; /* Mean deviation of rtt */ + int maxrtt; /* largest rtt seen */ + ulong rttack; /* The ack we are waiting for */ + int rttlen; /* Length of rttack packet */ + uvlong rttstart; /* Time we issued rttack packet */ +}; + +enum +{ + IL_IPSIZE = 20, + IL_HDRSIZE = 18, + IL_LISTEN = 0, + IL_CONNECT = 1, + IP_ILPROTO = 40, +}; + +typedef struct Ilhdr Ilhdr; +struct Ilhdr +{ + uchar vihl; /* Version and header length */ + uchar tos; /* Type of service */ + uchar length[2]; /* packet length */ + uchar id[2]; /* Identification */ + uchar frag[2]; /* Fragment information */ + uchar ttl; /* Time to live */ + uchar proto; /* Protocol */ + uchar cksum[2]; /* Header checksum */ + uchar src[4]; /* Ip source */ + uchar dst[4]; /* Ip destination */ + uchar ilsum[2]; /* Checksum including header */ + uchar illen[2]; /* Packet length */ + uchar iltype; /* Packet type */ + uchar ilspec; /* Special */ + uchar ilsrc[2]; /* Src port */ + uchar ildst[2]; /* Dst port */ + uchar ilid[4]; /* Sequence id */ + uchar ilack[4]; /* Acked sequence */ +}; + +enum +{ + InMsgs, + OutMsgs, + CsumErrs, /* checksum errors */ + HlenErrs, /* header length error */ + LenErrs, /* short packet */ + OutOfOrder, /* out of order */ + Retrans, /* retransmissions */ + DupMsg, + DupBytes, + DroppedMsgs, + + Nstats, +}; + +static char *statnames[] = +{ +[InMsgs] "InMsgs", +[OutMsgs] "OutMsgs", +[CsumErrs] "CsumErrs", +[HlenErrs] "HlenErr", +[LenErrs] "LenErrs", +[OutOfOrder] "OutOfOrder", +[Retrans] "Retrans", +[DupMsg] 
"DupMsg", +[DupBytes] "DupBytes", +[DroppedMsgs] "DroppedMsgs", +}; + +typedef struct Ilpriv Ilpriv; +struct Ilpriv +{ + Ipht ht; + + ulong stats[Nstats]; + + ulong csumerr; /* checksum errors */ + ulong hlenerr; /* header length error */ + ulong lenerr; /* short packet */ + ulong order; /* out of order */ + ulong rexmit; /* retransmissions */ + ulong dup; + ulong dupb; + + /* keeping track of the ack kproc */ + int ackprocstarted; + QLock apl; +}; + +/* state for query/dataquery messages */ + + +void ilrcvmsg(Conv*, Block*); +void ilsendctl(Conv*, Ilhdr*, int, ulong, ulong, int); +void ilackq(Ilcb*, Block*); +void ilprocess(Conv*, Ilhdr*, Block*); +void ilpullup(Conv*); +void ilhangup(Conv*, char*); +void ilfreeq(Ilcb*); +void ilrexmit(Ilcb*); +void ilbackoff(Ilcb*); +void ilsettimeout(Ilcb*); +char* ilstart(Conv*, int, int); +void ilackproc(void*); +void iloutoforder(Conv*, Ilhdr*, Block*); +void iliput(Proto*, Ipifc*, Block*); +void iladvise(Proto*, Block*, char*); +int ilnextqt(Ilcb*); +void ilcbinit(Ilcb*); +int later(ulong, ulong, char*); +void ilreject(Fs*, Ilhdr*); +void illocalclose(Conv *c); + int ilcksum = 1; +static int initseq = 25001; +static ulong scalediv, scalemul; +static char *etime = "connection timed out"; + +static char* +ilconnect(Conv *c, char **argv, int argc) +{ + char *e, *p; + int fast; + + /* huge hack to quickly try an il connection */ + fast = 0; + if(argc > 1){ + p = strstr(argv[1], "!fasttimeout"); + if(p != nil){ + *p = 0; + fast = 1; + } + } + + e = Fsstdconnect(c, argv, argc); + if(e != nil) + return e; + return ilstart(c, IL_CONNECT, fast); +} + +static int +ilstate(Conv *c, char *state, int n) +{ + Ilcb *ic; + + ic = (Ilcb*)(c->ptcl); + return snprint(state, n, "%s qin %d qout %d del %5.5d Br %5.5d md %5.5d una %5.5lud rex %5.5d rxq %5.5d max %5.5d\n", + ilstates[ic->state], + c->rq ? qlen(c->rq) : 0, + c->wq ? 
qlen(c->wq) : 0, + ic->delay>>LogAGain, ic->rate>>LogAGain, ic->mdev>>LogDGain, + ic->unackedbytes, ic->rxtot, ic->rxquery, ic->maxrtt); +} + +static int +ilinuse(Conv *c) +{ + Ilcb *ic; + + ic = (Ilcb*)(c->ptcl); + return ic->state != Ilclosed; + +} + +/* called with c locked */ +static char* +ilannounce(Conv *c, char **argv, int argc) +{ + char *e; + + e = Fsstdannounce(c, argv, argc); + if(e != nil) + return e; + e = ilstart(c, IL_LISTEN, 0); + if(e != nil) + return e; + Fsconnected(c, nil); + + return nil; +} + +void +illocalclose(Conv *c) +{ + Ilcb *ic; + Ilpriv *ipriv; + + ipriv = c->p->priv; + ic = (Ilcb*)c->ptcl; + ic->state = Ilclosed; + iphtrem(&ipriv->ht, c); + ipmove(c->laddr, IPnoaddr); + c->lport = 0; +} + +static void +ilclose(Conv *c) +{ + Ilcb *ic; + + ic = (Ilcb*)c->ptcl; + + qclose(c->rq); + qclose(c->wq); + qclose(c->eq); + + switch(ic->state) { + case Ilclosing: + case Ilclosed: + break; + case Ilsyncer: + case Ilsyncee: + case Ilestablished: + ic->state = Ilclosing; + ilsettimeout(ic); + ilsendctl(c, nil, Ilclose, ic->next, ic->recvd, 0); + break; + case Illistening: + illocalclose(c); + break; + } + ilfreeq(ic); +} + +void +ilkick(void *x, Block *bp) +{ + Conv *c = x; + Ilhdr *ih; + Ilcb *ic; + int dlen; + ulong id, ack; + Fs *f; + Ilpriv *priv; + + f = c->p->f; + priv = c->p->priv; + ic = (Ilcb*)c->ptcl; + + if(bp == nil) + return; + + switch(ic->state) { + case Ilclosed: + case Illistening: + case Ilclosing: + freeblist(bp); + qhangup(c->rq, nil); + return; + } + + dlen = blocklen(bp); + + /* Make space to fit il & ip */ + bp = padblock(bp, IL_IPSIZE+IL_HDRSIZE); + ih = (Ilhdr *)(bp->rp); + ih->vihl = IP_VER4; + + /* Ip fields */ + ih->frag[0] = 0; + ih->frag[1] = 0; + v6tov4(ih->dst, c->raddr); + v6tov4(ih->src, c->laddr); + ih->proto = IP_ILPROTO; + + /* Il fields */ + hnputs(ih->illen, dlen+IL_HDRSIZE); + hnputs(ih->ilsrc, c->lport); + hnputs(ih->ildst, c->rport); + + qlock(&ic->ackq); + id = ic->next++; + hnputl(ih->ilid, id); + ack = 
ic->recvd; + hnputl(ih->ilack, ack); + ic->acksent = ack; + ic->acktime = NOW + AckDelay; + ih->iltype = Ildata; + ih->ilspec = 0; + ih->ilsum[0] = 0; + ih->ilsum[1] = 0; + + /* Checksum of ilheader plus data (not ip & no pseudo header) */ + if(ilcksum) + hnputs(ih->ilsum, ptclcsum(bp, IL_IPSIZE, dlen+IL_HDRSIZE)); + + ilackq(ic, bp); + qunlock(&ic->ackq); + + /* Start the round trip timer for this packet if the timer is free */ + if(ic->rttack == 0) { + ic->rttack = id; + ic->rttstart = fastticks(nil); + ic->rttlen = dlen + IL_IPSIZE + IL_HDRSIZE; + } + + if(later(NOW, ic->timeout, nil)) + ilsettimeout(ic); + ipoput4(f, bp, 0, c->ttl, c->tos, c); + priv->stats[OutMsgs]++; +} + +static void +ilcreate(Conv *c) +{ + c->rq = qopen(Maxrq, 0, 0, c); + c->wq = qbypass(ilkick, c); +} + +int +ilxstats(Proto *il, char *buf, int len) +{ + Ilpriv *priv; + char *p, *e; + int i; + + priv = il->priv; + p = buf; + e = p+len; + for(i = 0; i < Nstats; i++) + p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]); + return p - buf; +} + +void +ilackq(Ilcb *ic, Block *bp) +{ + Block *np; + int n; + + n = blocklen(bp); + + /* Enqueue a copy on the unacked queue in case this one gets lost */ + np = copyblock(bp, n); + if(ic->unacked) + ic->unackedtail->list = np; + else + ic->unacked = np; + ic->unackedtail = np; + np->list = nil; + ic->unackedbytes += n; +} + +static +void +ilrttcalc(Ilcb *ic, Block *bp) +{ + int rtt, tt, pt, delay, rate; + + rtt = fastticks(nil) - ic->rttstart; + rtt = (rtt*scalemul)/scalediv; + delay = ic->delay; + rate = ic->rate; + + /* Guard against zero wrap */ + if(rtt > 120000 || rtt < 0) + return; + + /* this block had to be transmitted after the one acked so count its size */ + ic->rttlen += blocklen(bp) + IL_IPSIZE + IL_HDRSIZE; + + if(ic->rttlen < 256){ + /* guess fixed delay as rtt of small packets */ + delay += rtt - (delay>>LogAGain); + if(delay < AGain) + delay = AGain; + ic->delay = delay; + } else { + /* if packet took longer than avg rtt 
delay, recalc rate */ + tt = rtt - (delay>>LogAGain); + if(tt > 0){ + rate += ic->rttlen/tt - (rate>>LogAGain); + if(rate < AGain) + rate = AGain; + ic->rate = rate; + } + } + + /* mdev */ + pt = ic->rttlen/(rate>>LogAGain) + (delay>>LogAGain); + ic->mdev += abs(rtt-pt) - (ic->mdev>>LogDGain); + + if(rtt > ic->maxrtt) + ic->maxrtt = rtt; +} + +void +ilackto(Ilcb *ic, ulong ackto, Block *bp) +{ + Ilhdr *h; + ulong id; + + if(ic->rttack == ackto) + ilrttcalc(ic, bp); + + /* Cancel if we've passed the packet we were interested in */ + if(ic->rttack <= ackto) + ic->rttack = 0; + + qlock(&ic->ackq); + while(ic->unacked) { + h = (Ilhdr *)ic->unacked->rp; + id = nhgetl(h->ilid); + if(ackto < id) + break; + + bp = ic->unacked; + ic->unacked = bp->list; + bp->list = nil; + ic->unackedbytes -= blocklen(bp); + freeblist(bp); + ic->rexmit = 0; + ilsettimeout(ic); + } + qunlock(&ic->ackq); +} + +void +iliput(Proto *il, Ipifc *dummy, Block *bp) +{ + char *st; + Ilcb *ic; + Ilhdr *ih; + uchar raddr[IPaddrlen]; + uchar laddr[IPaddrlen]; + ushort sp, dp, csum; + int plen, illen; + Conv *new, *s; + Ilpriv *ipriv; + + ipriv = il->priv; + + ih = (Ilhdr *)bp->rp; + plen = blocklen(bp); + if(plen < IL_IPSIZE+IL_HDRSIZE){ + netlog(il->f, Logil, "il: hlenerr\n"); + ipriv->stats[HlenErrs]++; + goto raise; + } + + illen = nhgets(ih->illen); + if(illen+IL_IPSIZE > plen){ + netlog(il->f, Logil, "il: lenerr\n"); + ipriv->stats[LenErrs]++; + goto raise; + } + + sp = nhgets(ih->ildst); + dp = nhgets(ih->ilsrc); + v4tov6(raddr, ih->src); + v4tov6(laddr, ih->dst); + + if((csum = ptclcsum(bp, IL_IPSIZE, illen)) != 0) { + if(ih->iltype > Ilclose) + st = "?"; + else + st = iltype[ih->iltype]; + ipriv->stats[CsumErrs]++; + netlog(il->f, Logil, "il: cksum %ux %ux, pkt(%s id %lud ack %lud %I/%d->%d)\n", + csum, st, nhgetl(ih->ilid), nhgetl(ih->ilack), raddr, sp, dp); + goto raise; + } + + QLOCK(il); + s = iphtlook(&ipriv->ht, raddr, dp, laddr, sp); + if(s == nil){ + if(ih->iltype == Ilsync) + 
ilreject(il->f, ih); /* no listener */ + QUNLOCK(il); + goto raise; + } + + ic = (Ilcb*)s->ptcl; + if(ic->state == Illistening){ + if(ih->iltype != Ilsync){ + QUNLOCK(il); + if(ih->iltype > Ilclose) + st = "?"; + else + st = iltype[ih->iltype]; + ilreject(il->f, ih); /* no channel and not sync */ + netlog(il->f, Logil, "il: no channel, pkt(%s id %lud ack %lud %I/%ud->%ud)\n", + st, nhgetl(ih->ilid), nhgetl(ih->ilack), raddr, sp, dp); + goto raise; + } + + new = Fsnewcall(s, raddr, dp, laddr, sp, V4); + if(new == nil){ + QUNLOCK(il); + netlog(il->f, Logil, "il: bad newcall %I/%ud->%ud\n", raddr, sp, dp); + ilsendctl(s, ih, Ilclose, 0, nhgetl(ih->ilid), 0); + goto raise; + } + s = new; + + ic = (Ilcb*)s->ptcl; + + ic->conv = s; + ic->state = Ilsyncee; + ilcbinit(ic); + ic->rstart = nhgetl(ih->ilid); + iphtadd(&ipriv->ht, s); + } + + QLOCK(s); + QUNLOCK(il); + if(waserror()){ + QUNLOCK(s); + nexterror(); + } + ilprocess(s, ih, bp); + QUNLOCK(s); + poperror(); + return; +raise: + freeblist(bp); +} + +void +_ilprocess(Conv *s, Ilhdr *h, Block *bp) +{ + Ilcb *ic; + ulong id, ack; + Ilpriv *priv; + + id = nhgetl(h->ilid); + ack = nhgetl(h->ilack); + + ic = (Ilcb*)s->ptcl; + + ic->lastrecv = NOW; + ic->querytime = NOW + QueryTime; + priv = s->p->priv; + priv->stats[InMsgs]++; + + switch(ic->state) { + default: + netlog(s->p->f, Logil, "il: unknown state %d\n", ic->state); + case Ilclosed: + freeblist(bp); + break; + case Ilsyncer: + switch(h->iltype) { + default: + break; + case Ilsync: + if(ack != ic->start) + ilhangup(s, "connection rejected"); + else { + ic->recvd = id; + ic->rstart = id; + ilsendctl(s, nil, Ilack, ic->next, ic->recvd, 0); + ic->state = Ilestablished; + ic->fasttimeout = 0; + ic->rexmit = 0; + Fsconnected(s, nil); + ilpullup(s); + } + break; + case Ilclose: + if(ack == ic->start) + ilhangup(s, "connection rejected"); + break; + } + freeblist(bp); + break; + case Ilsyncee: + switch(h->iltype) { + default: + break; + case Ilsync: + if(id != ic->rstart || 
ack != 0){ + illocalclose(s); + } else { + ic->recvd = id; + ilsendctl(s, nil, Ilsync, ic->start, ic->recvd, 0); + } + break; + case Ilack: + if(ack == ic->start) { + ic->state = Ilestablished; + ic->fasttimeout = 0; + ic->rexmit = 0; + ilpullup(s); + } + break; + case Ildata: + if(ack == ic->start) { + ic->state = Ilestablished; + ic->fasttimeout = 0; + ic->rexmit = 0; + goto established; + } + break; + case Ilclose: + if(ack == ic->start) + ilhangup(s, "remote close"); + break; + } + freeblist(bp); + break; + case Ilestablished: + established: + switch(h->iltype) { + case Ilsync: + if(id != ic->rstart) + ilhangup(s, "remote close"); + else + ilsendctl(s, nil, Ilack, ic->next, ic->rstart, 0); + freeblist(bp); + break; + case Ildata: + /* + * avoid consuming all the mount rpc buffers in the + * system. if the input queue is too long, drop this + * packet. + */ + if (s->rq && qlen(s->rq) >= Maxrq) { + priv->stats[DroppedMsgs]++; + freeblist(bp); + break; + } + + ilackto(ic, ack, bp); + iloutoforder(s, h, bp); + ilpullup(s); + break; + case Ildataquery: + ilackto(ic, ack, bp); + iloutoforder(s, h, bp); + ilpullup(s); + ilsendctl(s, nil, Ilstate, ic->next, ic->recvd, h->ilspec); + break; + case Ilack: + ilackto(ic, ack, bp); + freeblist(bp); + break; + case Ilquery: + ilackto(ic, ack, bp); + ilsendctl(s, nil, Ilstate, ic->next, ic->recvd, h->ilspec); + freeblist(bp); + break; + case Ilstate: + if(ack >= ic->rttack) + ic->rttack = 0; + ilackto(ic, ack, bp); + if(h->ilspec > Nqt) + h->ilspec = 0; + if(ic->qt[h->ilspec] > ack){ + ilrexmit(ic); + ilsettimeout(ic); + } + freeblist(bp); + break; + case Ilclose: + freeblist(bp); + if(ack < ic->start || ack > ic->next) + break; + ic->recvd = id; + ilsendctl(s, nil, Ilclose, ic->next, ic->recvd, 0); + ic->state = Ilclosing; + ilsettimeout(ic); + ilfreeq(ic); + break; + } + break; + case Illistening: + freeblist(bp); + break; + case Ilclosing: + switch(h->iltype) { + case Ilclose: + ic->recvd = id; + ilsendctl(s, nil, Ilclose, 
ic->next, ic->recvd, 0); + if(ack == ic->next) + ilhangup(s, nil); + break; + default: + break; + } + freeblist(bp); + break; + } +} + +void +ilrexmit(Ilcb *ic) +{ + Ilhdr *h; + Block *nb; + Conv *c; + ulong id; + Ilpriv *priv; + + nb = nil; + qlock(&ic->ackq); + if(ic->unacked) + nb = copyblock(ic->unacked, blocklen(ic->unacked)); + qunlock(&ic->ackq); + + if(nb == nil) + return; + + h = (Ilhdr*)nb->rp; + h->vihl = IP_VER4; + + h->iltype = Ildataquery; + hnputl(h->ilack, ic->recvd); + h->ilspec = ilnextqt(ic); + h->ilsum[0] = 0; + h->ilsum[1] = 0; + hnputs(h->ilsum, ptclcsum(nb, IL_IPSIZE, nhgets(h->illen))); + + c = ic->conv; + id = nhgetl(h->ilid); + netlog(c->p->f, Logil, "il: rexmit %d %ud: %d %d: %i %d/%d\n", id, ic->recvd, + ic->rexmit, ic->timeout, + c->raddr, c->lport, c->rport); + + ilbackoff(ic); + + ipoput4(c->p->f, nb, 0, c->ttl, c->tos, c); + + /* statistics */ + ic->rxtot++; + priv = c->p->priv; + priv->rexmit++; +} + +/* DEBUG */ +void +ilprocess(Conv *s, Ilhdr *h, Block *bp) +{ + Ilcb *ic; + + ic = (Ilcb*)s->ptcl; + + USED(ic); + netlog(s->p->f, Logilmsg, "%11s rcv %d/%d snt %d/%d pkt(%s id %d ack %d %d->%d) ", + ilstates[ic->state], ic->rstart, ic->recvd, ic->start, + ic->next, iltype[h->iltype], nhgetl(h->ilid), + nhgetl(h->ilack), nhgets(h->ilsrc), nhgets(h->ildst)); + + _ilprocess(s, h, bp); + + netlog(s->p->f, Logilmsg, "%11s rcv %d snt %d\n", ilstates[ic->state], ic->recvd, ic->next); +} + +void +ilhangup(Conv *s, char *msg) +{ + Ilcb *ic; + int callout; + + netlog(s->p->f, Logil, "il: hangup! 
%I %d/%d: %s\n", s->raddr, + s->lport, s->rport, msg?msg:"no reason"); + + ic = (Ilcb*)s->ptcl; + callout = ic->state == Ilsyncer; + illocalclose(s); + + qhangup(s->rq, msg); + qhangup(s->wq, msg); + + if(callout) + Fsconnected(s, msg); +} + +void +ilpullup(Conv *s) +{ + Ilcb *ic; + Ilhdr *oh; + Block *bp; + ulong oid, dlen; + Ilpriv *ipriv; + + ic = (Ilcb*)s->ptcl; + if(ic->state != Ilestablished) + return; + + qlock(&ic->outo); + while(ic->outoforder) { + bp = ic->outoforder; + oh = (Ilhdr*)bp->rp; + oid = nhgetl(oh->ilid); + if(oid <= ic->recvd) { + ic->outoforder = bp->list; + freeblist(bp); + continue; + } + if(oid != ic->recvd+1){ + ipriv = s->p->priv; + ipriv->stats[OutOfOrder]++; + break; + } + + ic->recvd = oid; + ic->outoforder = bp->list; + + bp->list = nil; + dlen = nhgets(oh->illen)-IL_HDRSIZE; + bp = trimblock(bp, IL_IPSIZE+IL_HDRSIZE, dlen); + /* + * Upper levels don't know about multiple-block + * messages so copy all into one (yick). + */ + bp = concatblock(bp); + if(bp == 0) + panic("ilpullup"); + bp = packblock(bp); + if(bp == 0) + panic("ilpullup2"); + qpass(s->rq, bp); + } + qunlock(&ic->outo); +} + +void +iloutoforder(Conv *s, Ilhdr *h, Block *bp) +{ + Ilcb *ic; + uchar *lid; + Block *f, **l; + ulong id, newid; + Ilpriv *ipriv; + + ipriv = s->p->priv; + ic = (Ilcb*)s->ptcl; + bp->list = nil; + + id = nhgetl(h->ilid); + /* Window checks */ + if(id <= ic->recvd || id > ic->recvd+ic->window) { + netlog(s->p->f, Logil, "il: message outside window %ud <%ud-%ud>: %i %d/%d\n", + id, ic->recvd, ic->recvd+ic->window, s->raddr, s->lport, s->rport); + freeblist(bp); + return; + } + + /* Packet is acceptable so sort onto receive queue for pullup */ + qlock(&ic->outo); + if(ic->outoforder == nil) + ic->outoforder = bp; + else { + l = &ic->outoforder; + for(f = *l; f; f = f->list) { + lid = ((Ilhdr*)(f->rp))->ilid; + newid = nhgetl(lid); + if(id <= newid) { + if(id == newid) { + ipriv->stats[DupMsg]++; + ipriv->stats[DupBytes] += blocklen(bp); + 
qunlock(&ic->outo); + freeblist(bp); + return; + } + bp->list = f; + *l = bp; + qunlock(&ic->outo); + return; + } + l = &f->list; + } + *l = bp; + } + qunlock(&ic->outo); +} + +void +ilsendctl(Conv *ipc, Ilhdr *inih, int type, ulong id, ulong ack, int ilspec) +{ + Ilhdr *ih; + Ilcb *ic; + Block *bp; + int ttl, tos; + + bp = allocb(IL_IPSIZE+IL_HDRSIZE); + bp->wp += IL_IPSIZE+IL_HDRSIZE; + + ih = (Ilhdr *)(bp->rp); + ih->vihl = IP_VER4; + + /* Ip fields */ + ih->proto = IP_ILPROTO; + hnputs(ih->illen, IL_HDRSIZE); + ih->frag[0] = 0; + ih->frag[1] = 0; + if(inih) { + hnputl(ih->dst, nhgetl(inih->src)); + hnputl(ih->src, nhgetl(inih->dst)); + hnputs(ih->ilsrc, nhgets(inih->ildst)); + hnputs(ih->ildst, nhgets(inih->ilsrc)); + hnputl(ih->ilid, nhgetl(inih->ilack)); + hnputl(ih->ilack, nhgetl(inih->ilid)); + ttl = MAXTTL; + tos = DFLTTOS; + } + else { + v6tov4(ih->dst, ipc->raddr); + v6tov4(ih->src, ipc->laddr); + hnputs(ih->ilsrc, ipc->lport); + hnputs(ih->ildst, ipc->rport); + hnputl(ih->ilid, id); + hnputl(ih->ilack, ack); + ic = (Ilcb*)ipc->ptcl; + ic->acksent = ack; + ic->acktime = NOW; + ttl = ipc->ttl; + tos = ipc->tos; + } + ih->iltype = type; + ih->ilspec = ilspec; + ih->ilsum[0] = 0; + ih->ilsum[1] = 0; + + if(ilcksum) + hnputs(ih->ilsum, ptclcsum(bp, IL_IPSIZE, IL_HDRSIZE)); + +if(ipc==nil) + panic("ipc is nil caller is %#p", getcallerpc(&ipc)); +if(ipc->p==nil) + panic("ipc->p is nil"); + + netlog(ipc->p->f, Logilmsg, "ctl(%s id %d ack %d %d->%d)\n", + iltype[ih->iltype], nhgetl(ih->ilid), nhgetl(ih->ilack), + nhgets(ih->ilsrc), nhgets(ih->ildst)); + + ipoput4(ipc->p->f, bp, 0, ttl, tos, ipc); +} + +void +ilreject(Fs *f, Ilhdr *inih) +{ + Ilhdr *ih; + Block *bp; + + bp = allocb(IL_IPSIZE+IL_HDRSIZE); + bp->wp += IL_IPSIZE+IL_HDRSIZE; + + ih = (Ilhdr *)(bp->rp); + ih->vihl = IP_VER4; + + /* Ip fields */ + ih->proto = IP_ILPROTO; + hnputs(ih->illen, IL_HDRSIZE); + ih->frag[0] = 0; + ih->frag[1] = 0; + hnputl(ih->dst, nhgetl(inih->src)); + hnputl(ih->src, 
nhgetl(inih->dst)); + hnputs(ih->ilsrc, nhgets(inih->ildst)); + hnputs(ih->ildst, nhgets(inih->ilsrc)); + hnputl(ih->ilid, nhgetl(inih->ilack)); + hnputl(ih->ilack, nhgetl(inih->ilid)); + ih->iltype = Ilclose; + ih->ilspec = 0; + ih->ilsum[0] = 0; + ih->ilsum[1] = 0; + + if(ilcksum) + hnputs(ih->ilsum, ptclcsum(bp, IL_IPSIZE, IL_HDRSIZE)); + + ipoput4(f, bp, 0, MAXTTL, DFLTTOS, nil); +} + +void +ilsettimeout(Ilcb *ic) +{ + ulong pt; + + pt = (ic->delay>>LogAGain) + + ic->unackedbytes/(ic->rate>>LogAGain) + + (ic->mdev>>(LogDGain-1)) + + AckDelay; + if(pt > MaxTimeout) + pt = MaxTimeout; + ic->timeout = NOW + pt; +} + +void +ilbackoff(Ilcb *ic) +{ + ulong pt; + int i; + + pt = (ic->delay>>LogAGain) + + ic->unackedbytes/(ic->rate>>LogAGain) + + (ic->mdev>>(LogDGain-1)) + + AckDelay; + for(i = 0; i < ic->rexmit; i++) + pt = pt + (pt>>1); + if(pt > MaxTimeout) + pt = MaxTimeout; + ic->timeout = NOW + pt; + + if(ic->fasttimeout) + ic->timeout = NOW+Iltickms; + + ic->rexmit++; +} + +// complain if two numbers not within an hour of each other +#define Tfuture (1000*60*60) +int +later(ulong t1, ulong t2, char *x) +{ + int dt; + + dt = t1 - t2; + if(dt > 0) { + if(x != nil && dt > Tfuture) + print("%s: way future %d\n", x, dt); + return 1; + } + if(dt < -Tfuture) { + if(x != nil) + print("%s: way past %d\n", x, -dt); + return 1; + } + return 0; +} + +void +ilackproc(void *x) +{ + Ilcb *ic; + Conv **s, *p; + Proto *il; + + il = x; + +loop: + tsleep(&up->sleep, return0, 0, Iltickms); + for(s = il->conv; s && *s; s++) { + p = *s; + ic = (Ilcb*)p->ptcl; + + switch(ic->state) { + case Ilclosed: + case Illistening: + break; + case Ilclosing: + if(later(NOW, ic->timeout, "timeout0")) { + if(ic->rexmit > MaxRexmit){ + ilhangup(p, nil); + break; + } + ilsendctl(p, nil, Ilclose, ic->next, ic->recvd, 0); + ilbackoff(ic); + } + break; + + case Ilsyncee: + case Ilsyncer: + if(later(NOW, ic->timeout, "timeout1")) { + if(ic->rexmit > MaxRexmit){ + ilhangup(p, etime); + break; + } + 
ilsendctl(p, nil, Ilsync, ic->start, ic->recvd, 0); + ilbackoff(ic); + } + break; + + case Ilestablished: + if(ic->recvd != ic->acksent) + if(later(NOW, ic->acktime, "acktime")) + ilsendctl(p, nil, Ilack, ic->next, ic->recvd, 0); + + if(later(NOW, ic->querytime, "querytime")){ + if(later(NOW, ic->lastrecv+DeathTime, "deathtime")){ + netlog(il->f, Logil, "il: hangup: deathtime\n"); + ilhangup(p, etime); + break; + } + ilsendctl(p, nil, Ilquery, ic->next, ic->recvd, ilnextqt(ic)); + ic->querytime = NOW + QueryTime; + } + + if(ic->unacked != nil) + if(later(NOW, ic->timeout, "timeout2")) { + if(ic->rexmit > MaxRexmit){ + netlog(il->f, Logil, "il: hangup: too many rexmits\n"); + ilhangup(p, etime); + break; + } + ilsendctl(p, nil, Ilquery, ic->next, ic->recvd, ilnextqt(ic)); + ic->rxquery++; + ilbackoff(ic); + } + break; + } + } + goto loop; +} + +void +ilcbinit(Ilcb *ic) +{ + ic->start = nrand(0x1000000); + ic->next = ic->start+1; + ic->recvd = 0; + ic->window = Defaultwin; + ic->unackedbytes = 0; + ic->unacked = nil; + ic->outoforder = nil; + ic->rexmit = 0; + ic->rxtot = 0; + ic->rxquery = 0; + ic->qtx = 1; + ic->fasttimeout = 0; + + /* timers */ + ic->delay = DefRtt<<LogAGain; + ic->mdev = DefRtt<<LogDGain; + ic->rate = DefByteRate<<LogAGain; + ic->querytime = NOW + QueryTime; + ic->lastrecv = NOW; /* or we'll timeout right away */ + ilsettimeout(ic); +} + +char* +ilstart(Conv *c, int type, int fasttimeout) +{ + Ilcb *ic; + Ilpriv *ipriv; + char kpname[KNAMELEN]; + + ipriv = c->p->priv; + + if(ipriv->ackprocstarted == 0){ + qlock(&ipriv->apl); + if(ipriv->ackprocstarted == 0){ + sprint(kpname, "#I%dilack", c->p->f->dev); + kproc(kpname, ilackproc, c->p); + ipriv->ackprocstarted = 1; + } + qunlock(&ipriv->apl); + } + + ic = (Ilcb*)c->ptcl; + ic->conv = c; + + if(ic->state != Ilclosed) + return nil; + + ilcbinit(ic); + + if(fasttimeout){ + /* timeout if we can't connect quickly */ + ic->fasttimeout = 1; + ic->timeout = NOW+Iltickms; + ic->rexmit = MaxRexmit - 4; + }; 
+ + switch(type) { + default: + netlog(c->p->f, Logil, "il: start: type %d\n", type); + break; + case IL_LISTEN: + ic->state = Illistening; + iphtadd(&ipriv->ht, c); + break; + case IL_CONNECT: + ic->state = Ilsyncer; + iphtadd(&ipriv->ht, c); + ilsendctl(c, nil, Ilsync, ic->start, ic->recvd, 0); + break; + } + + return nil; +} + +void +ilfreeq(Ilcb *ic) +{ + Block *bp, *next; + + qlock(&ic->ackq); + for(bp = ic->unacked; bp; bp = next) { + next = bp->list; + freeblist(bp); + } + ic->unacked = nil; + qunlock(&ic->ackq); + + qlock(&ic->outo); + for(bp = ic->outoforder; bp; bp = next) { + next = bp->list; + freeblist(bp); + } + ic->outoforder = nil; + qunlock(&ic->outo); +} + +void +iladvise(Proto *il, Block *bp, char *msg) +{ + Ilhdr *h; + Ilcb *ic; + uchar source[IPaddrlen], dest[IPaddrlen]; + ushort psource; + Conv *s, **p; + + h = (Ilhdr*)(bp->rp); + + v4tov6(dest, h->dst); + v4tov6(source, h->src); + psource = nhgets(h->ilsrc); + + + /* Look for a connection, unfortunately the destination port is missing */ + QLOCK(il); + for(p = il->conv; *p; p++) { + s = *p; + if(s->lport == psource) + if(ipcmp(s->laddr, source) == 0) + if(ipcmp(s->raddr, dest) == 0){ + QUNLOCK(il); + ic = (Ilcb*)s->ptcl; + switch(ic->state){ + case Ilsyncer: + ilhangup(s, msg); + break; + } + freeblist(bp); + return; + } + } + QUNLOCK(il); + freeblist(bp); +} + +int +ilnextqt(Ilcb *ic) +{ + int x; + + qlock(&ic->ackq); + x = ic->qtx; + if(++x > Nqt) + x = 1; + ic->qtx = x; + ic->qt[x] = ic->next-1; /* highest xmitted packet */ + ic->qt[0] = ic->qt[x]; /* compatibility with old implementations */ + qunlock(&ic->ackq); + + return x; +} + +/* calculate scale constants that converts fast ticks to ms (more or less) */ +static void +inittimescale(void) +{ + uvlong hz; + + fastticks(&hz); + if(hz > 1000){ + scalediv = hz/1000; + scalemul = 1; + } else { + scalediv = 1; + scalemul = 1000/hz; + } +} + +void +ilinit(Fs *f) +{ + Proto *il; + + inittimescale(); + + il = smalloc(sizeof(Proto)); + il->priv 
= smalloc(sizeof(Ilpriv)); + il->name = "il"; + il->connect = ilconnect; + il->announce = ilannounce; + il->state = ilstate; + il->create = ilcreate; + il->close = ilclose; + il->rcv = iliput; + il->ctl = nil; + il->advise = iladvise; + il->stats = ilxstats; + il->inuse = ilinuse; + il->gc = nil; + il->ipproto = IP_ILPROTO; + il->nc = scalednconv(); + il->ptclsize = sizeof(Ilcb); + Fsproto(f, il); +} diff --git a/src/9vx/a/ip/inferno.c b/src/9vx/a/ip/inferno.c @@ -0,0 +1,46 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include "ip.h" + +/* + * some hacks for commonality twixt inferno and plan9 + */ + +char* +commonuser(void) +{ + return up->user; +} + +Chan* +commonfdtochan(int fd, int mode, int a, int b) +{ + return fdtochan(fd, mode, a, b); +} + +char* +commonerror(void) +{ + return up->errstr; +} + +char* +bootp(Ipifc* _) +{ + return "unimplmented"; +} + +int +bootpread(char* _, ulong __, int ___) +{ + return 0; +} + +Medium tripmedium = +{ + "trip", +}; diff --git a/src/9vx/a/ip/ip.c b/src/9vx/a/ip/ip.c @@ -0,0 +1,776 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +typedef struct Fragment4 Fragment4; +typedef struct Fragment6 Fragment6; +typedef struct Ipfrag Ipfrag; + +#define BLKIPVER(xp) (((Ip4hdr*)((xp)->rp))->vihl&0xF0) + +/* MIB II counters */ +enum +{ + Forwarding, + DefaultTTL, + InReceives, + InHdrErrors, + InAddrErrors, + ForwDatagrams, + InUnknownProtos, + InDiscards, + InDelivers, + OutRequests, + OutDiscards, + OutNoRoutes, + ReasmTimeout, + ReasmReqds, + ReasmOKs, + ReasmFails, + FragOKs, + FragFails, + FragCreates, + + Nstats, +}; + +struct Fragment4 +{ + Block* blist; + Fragment4* next; + ulong src; + ulong dst; + ushort id; + ulong age; +}; + +struct Fragment6 +{ + Block* blist; + Fragment6* next; + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; + uint id; + ulong age; +}; + +struct Ipfrag +{ + 
ushort foff; + ushort flen; +}; + +/* an instance of IP */ +struct IP +{ + ulong stats[Nstats]; + + QLock fraglock4; + Fragment4* flisthead4; + Fragment4* fragfree4; + Ref id4; + + QLock fraglock6; + Fragment6* flisthead6; + Fragment6* fragfree6; + Ref id6; + + int iprouting; /* true if we route like a gateway */ +}; + +static char *statnames[] = +{ +[Forwarding] "Forwarding", +[DefaultTTL] "DefaultTTL", +[InReceives] "InReceives", +[InHdrErrors] "InHdrErrors", +[InAddrErrors] "InAddrErrors", +[ForwDatagrams] "ForwDatagrams", +[InUnknownProtos] "InUnknownProtos", +[InDiscards] "InDiscards", +[InDelivers] "InDelivers", +[OutRequests] "OutRequests", +[OutDiscards] "OutDiscards", +[OutNoRoutes] "OutNoRoutes", +[ReasmTimeout] "ReasmTimeout", +[ReasmReqds] "ReasmReqds", +[ReasmOKs] "ReasmOKs", +[ReasmFails] "ReasmFails", +[FragOKs] "FragOKs", +[FragFails] "FragFails", +[FragCreates] "FragCreates", +}; + +#define BLKIP(xp) ((Ip4hdr*)((xp)->rp)) +/* + * This sleazy macro relies on the media header size being + * larger than sizeof(Ipfrag). 
ipreassemble checks this is true + */ +#define BKFG(xp) ((Ipfrag*)((xp)->base)) + +ushort ipcsum(uchar*); +Block* ip4reassemble(IP*, int, Block*, Ip4hdr*); +void ipfragfree4(IP*, Fragment4*); +Fragment4* ipfragallo4(IP*); + +void +ip_init_6(Fs *f) +{ + v6params *v6p; + + v6p = smalloc(sizeof(v6params)); + + v6p->rp.mflag = 0; /* default not managed */ + v6p->rp.oflag = 0; + v6p->rp.maxraint = 600000; /* millisecs */ + v6p->rp.minraint = 200000; + v6p->rp.linkmtu = 0; /* no mtu sent */ + v6p->rp.reachtime = 0; + v6p->rp.rxmitra = 0; + v6p->rp.ttl = MAXTTL; + v6p->rp.routerlt = 3 * v6p->rp.maxraint; + + v6p->hp.rxmithost = 1000; /* v6 RETRANS_TIMER */ + + v6p->cdrouter = -1; + + f->v6p = v6p; +} + +void +initfrag(IP *ip, int size) +{ + Fragment4 *fq4, *eq4; + Fragment6 *fq6, *eq6; + + ip->fragfree4 = (Fragment4*)malloc(sizeof(Fragment4) * size); + if(ip->fragfree4 == nil) + panic("initfrag"); + + eq4 = &ip->fragfree4[size]; + for(fq4 = ip->fragfree4; fq4 < eq4; fq4++) + fq4->next = fq4+1; + + ip->fragfree4[size-1].next = nil; + + ip->fragfree6 = (Fragment6*)malloc(sizeof(Fragment6) * size); + if(ip->fragfree6 == nil) + panic("initfrag"); + + eq6 = &ip->fragfree6[size]; + for(fq6 = ip->fragfree6; fq6 < eq6; fq6++) + fq6->next = fq6+1; + + ip->fragfree6[size-1].next = nil; +} + +void +ip_init(Fs *f) +{ + IP *ip; + + ip = smalloc(sizeof(IP)); + initfrag(ip, 100); + f->ip = ip; + + ip_init_6(f); +} + +void +iprouting(Fs *f, int on) +{ + f->ip->iprouting = on; + if(f->ip->iprouting==0) + f->ip->stats[Forwarding] = 2; + else + f->ip->stats[Forwarding] = 1; +} + +int +ipoput4(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c) +{ + Ipifc *ifc; + uchar *gate; + ulong fragoff; + Block *xp, *nb; + Ip4hdr *eh, *feh; + int lid, len, seglen, chunk, dlen, blklen, offset, medialen; + Route *r, *sr; + IP *ip; + int rv = 0; + + ip = f->ip; + + /* Fill out the ip header */ + eh = (Ip4hdr*)(bp->rp); + + ip->stats[OutRequests]++; + + /* Number of uchars in data and ip header to 
write */ + len = blocklen(bp); + + if(gating){ + chunk = nhgets(eh->length); + if(chunk > len){ + ip->stats[OutDiscards]++; + netlog(f, Logip, "short gated packet\n"); + goto free; + } + if(chunk < len) + len = chunk; + } + if(len >= IP_MAX){ + ip->stats[OutDiscards]++; + netlog(f, Logip, "exceeded ip max size %V\n", eh->dst); + goto free; + } + + r = v4lookup(f, eh->dst, c); + if(r == nil){ + ip->stats[OutNoRoutes]++; + netlog(f, Logip, "no interface %V\n", eh->dst); + rv = -1; + goto free; + } + + ifc = r->ifc; + if(r->type & (Rifc|Runi)) + gate = eh->dst; + else + if(r->type & (Rbcast|Rmulti)) { + gate = eh->dst; + sr = v4lookup(f, eh->src, nil); + if(sr != nil && (sr->type & Runi)) + ifc = sr->ifc; + } + else + gate = r->v4.gate; + + if(!gating) + eh->vihl = IP_VER4|IP_HLEN4; + eh->ttl = ttl; + if(!gating) + eh->tos = tos; + + if(!CANRLOCK(ifc)) + goto free; + if(waserror()){ + RUNLOCK(ifc); + nexterror(); + } + if(ifc->m == nil) + goto raise; + + /* If we dont need to fragment just send it */ + medialen = ifc->maxtu - ifc->m->hsize; + if(len <= medialen) { + if(!gating) + hnputs(eh->id, incref(&ip->id4)); + hnputs(eh->length, len); + if(!gating){ + eh->frag[0] = 0; + eh->frag[1] = 0; + } + eh->cksum[0] = 0; + eh->cksum[1] = 0; + hnputs(eh->cksum, ipcsum(&eh->vihl)); + ifc->m->bwrite(ifc, bp, V4, gate); + RUNLOCK(ifc); + poperror(); + return 0; + } + +if((eh->frag[0] & (IP_DF>>8)) && !gating) print("%V: DF set\n", eh->dst); + + if(eh->frag[0] & (IP_DF>>8)){ + ip->stats[FragFails]++; + ip->stats[OutDiscards]++; + icmpcantfrag(f, bp, medialen); + netlog(f, Logip, "%V: eh->frag[0] & (IP_DF>>8)\n", eh->dst); + goto raise; + } + + seglen = (medialen - IP4HDR) & ~7; + if(seglen < 8){ + ip->stats[FragFails]++; + ip->stats[OutDiscards]++; + netlog(f, Logip, "%V seglen < 8\n", eh->dst); + goto raise; + } + + dlen = len - IP4HDR; + xp = bp; + if(gating) + lid = nhgets(eh->id); + else + lid = incref(&ip->id4); + + offset = IP4HDR; + while(xp != nil && offset && offset >= 
BLEN(xp)) { + offset -= BLEN(xp); + xp = xp->next; + } + xp->rp += offset; + + if(gating) + fragoff = nhgets(eh->frag)<<3; + else + fragoff = 0; + dlen += fragoff; + for(; fragoff < dlen; fragoff += seglen) { + nb = allocb(IP4HDR+seglen); + feh = (Ip4hdr*)(nb->rp); + + memmove(nb->wp, eh, IP4HDR); + nb->wp += IP4HDR; + + if((fragoff + seglen) >= dlen) { + seglen = dlen - fragoff; + hnputs(feh->frag, fragoff>>3); + } + else + hnputs(feh->frag, (fragoff>>3)|IP_MF); + + hnputs(feh->length, seglen + IP4HDR); + hnputs(feh->id, lid); + + /* Copy up the data area */ + chunk = seglen; + while(chunk) { + if(!xp) { + ip->stats[OutDiscards]++; + ip->stats[FragFails]++; + freeblist(nb); + netlog(f, Logip, "!xp: chunk %d\n", chunk); + goto raise; + } + blklen = chunk; + if(BLEN(xp) < chunk) + blklen = BLEN(xp); + memmove(nb->wp, xp->rp, blklen); + nb->wp += blklen; + xp->rp += blklen; + chunk -= blklen; + if(xp->rp == xp->wp) + xp = xp->next; + } + + feh->cksum[0] = 0; + feh->cksum[1] = 0; + hnputs(feh->cksum, ipcsum(&feh->vihl)); + ifc->m->bwrite(ifc, nb, V4, gate); + ip->stats[FragCreates]++; + } + ip->stats[FragOKs]++; +raise: + RUNLOCK(ifc); + poperror(); +free: + freeblist(bp); + return rv; +} + +void +ipiput4(Fs *f, Ipifc *ifc, Block *bp) +{ + int hl; + int hop, tos, proto, olen; + Ip4hdr *h; + Proto *p; + ushort frag; + int notforme; + uchar *dp, v6dst[IPaddrlen]; + IP *ip; + Route *r; + + if(BLKIPVER(bp) != IP_VER4) { + ipiput6(f, ifc, bp); + return; + } + + ip = f->ip; + ip->stats[InReceives]++; + + /* + * Ensure we have all the header info in the first + * block. Make life easier for other protocols by + * collecting up to the first 64 bytes in the first block. 
+ */ + if(BLEN(bp) < 64) { + hl = blocklen(bp); + if(hl < IP4HDR) + hl = IP4HDR; + if(hl > 64) + hl = 64; + bp = pullupblock(bp, hl); + if(bp == nil) + return; + } + + h = (Ip4hdr*)(bp->rp); + + /* dump anything that whose header doesn't checksum */ + if((bp->flag & Bipck) == 0 && ipcsum(&h->vihl)) { + ip->stats[InHdrErrors]++; + netlog(f, Logip, "ip: checksum error %V\n", h->src); + freeblist(bp); + return; + } + v4tov6(v6dst, h->dst); + notforme = ipforme(f, v6dst) == 0; + + /* Check header length and version */ + if((h->vihl&0x0F) != IP_HLEN4) { + hl = (h->vihl&0xF)<<2; + if(hl < (IP_HLEN4<<2)) { + ip->stats[InHdrErrors]++; + netlog(f, Logip, "ip: %V bad hivl %ux\n", h->src, h->vihl); + freeblist(bp); + return; + } + /* If this is not routed strip off the options */ + if(notforme == 0) { + olen = nhgets(h->length); + dp = bp->rp + (hl - (IP_HLEN4<<2)); + memmove(dp, h, IP_HLEN4<<2); + bp->rp = dp; + h = (Ip4hdr*)(bp->rp); + h->vihl = (IP_VER4|IP_HLEN4); + hnputs(h->length, olen-hl+(IP_HLEN4<<2)); + } + } + + /* route */ + if(notforme) { + Conv conv; + + if(!ip->iprouting){ + freeb(bp); + return; + } + + /* don't forward to source's network */ + conv.r = nil; + r = v4lookup(f, h->dst, &conv); + if(r == nil || r->ifc == ifc){ + ip->stats[OutDiscards]++; + freeblist(bp); + return; + } + + /* don't forward if packet has timed out */ + hop = h->ttl; + if(hop < 1) { + ip->stats[InHdrErrors]++; + icmpttlexceeded(f, ifc->lifc->local, bp); + freeblist(bp); + return; + } + + /* reassemble if the interface expects it */ +if(r->ifc == nil) panic("nil route rfc"); + if(r->ifc->reassemble){ + frag = nhgets(h->frag); + if(frag) { + h->tos = 0; + if(frag & IP_MF) + h->tos = 1; + bp = ip4reassemble(ip, frag, bp, h); + if(bp == nil) + return; + h = (Ip4hdr*)(bp->rp); + } + } + + ip->stats[ForwDatagrams]++; + tos = h->tos; + hop = h->ttl; + ipoput4(f, bp, 1, hop - 1, tos, &conv); + return; + } + + frag = nhgets(h->frag); + if(frag) { + h->tos = 0; + if(frag & IP_MF) + h->tos = 1; 
+ bp = ip4reassemble(ip, frag, bp, h); + if(bp == nil) + return; + h = (Ip4hdr*)(bp->rp); + } + + /* don't let any frag info go up the stack */ + h->frag[0] = 0; + h->frag[1] = 0; + + proto = h->proto; + p = Fsrcvpcol(f, proto); + if(p != nil && p->rcv != nil) { + ip->stats[InDelivers]++; + (*p->rcv)(p, ifc, bp); + return; + } + ip->stats[InDiscards]++; + ip->stats[InUnknownProtos]++; + freeblist(bp); +} + +int +ipstats(Fs *f, char *buf, int len) +{ + IP *ip; + char *p, *e; + int i; + + ip = f->ip; + ip->stats[DefaultTTL] = MAXTTL; + + p = buf; + e = p+len; + for(i = 0; i < Nstats; i++) + p = seprint(p, e, "%s: %lud\n", statnames[i], ip->stats[i]); + return p - buf; +} + +Block* +ip4reassemble(IP *ip, int offset, Block *bp, Ip4hdr *ih) +{ + int fend; + ushort id; + Fragment4 *f, *fnext; + ulong src, dst; + Block *bl, **l, *last, *prev; + int ovlap, len, fragsize, pktposn; + + src = nhgetl(ih->src); + dst = nhgetl(ih->dst); + id = nhgets(ih->id); + + /* + * block lists are too hard, pullupblock into a single block + */ + if(bp->next){ + bp = pullupblock(bp, blocklen(bp)); + ih = (Ip4hdr*)(bp->rp); + } + + qlock(&ip->fraglock4); + + /* + * find a reassembly queue for this fragment + */ + for(f = ip->flisthead4; f; f = fnext){ + fnext = f->next; /* because ipfragfree4 changes the list */ + if(f->src == src && f->dst == dst && f->id == id) + break; + if(f->age < NOW){ + ip->stats[ReasmTimeout]++; + ipfragfree4(ip, f); + } + } + + /* + * if this isn't a fragmented packet, accept it + * and get rid of any fragments that might go + * with it. 
+ */ + if(!ih->tos && (offset & ~(IP_MF|IP_DF)) == 0) { + if(f != nil) { + ipfragfree4(ip, f); + ip->stats[ReasmFails]++; + } + qunlock(&ip->fraglock4); + return bp; + } + + if(bp->base+sizeof(Ipfrag) >= bp->rp){ + bp = padblock(bp, sizeof(Ipfrag)); + bp->rp += sizeof(Ipfrag); + } + + BKFG(bp)->foff = offset<<3; + BKFG(bp)->flen = nhgets(ih->length)-IP4HDR; + + /* First fragment allocates a reassembly queue */ + if(f == nil) { + f = ipfragallo4(ip); + f->id = id; + f->src = src; + f->dst = dst; + + f->blist = bp; + + qunlock(&ip->fraglock4); + ip->stats[ReasmReqds]++; + return nil; + } + + /* + * find the new fragment's position in the queue + */ + prev = nil; + l = &f->blist; + bl = f->blist; + while(bl != nil && BKFG(bp)->foff > BKFG(bl)->foff) { + prev = bl; + l = &bl->next; + bl = bl->next; + } + + /* Check overlap of a previous fragment - trim away as necessary */ + if(prev) { + ovlap = BKFG(prev)->foff + BKFG(prev)->flen - BKFG(bp)->foff; + if(ovlap > 0) { + if(ovlap >= BKFG(bp)->flen) { + freeblist(bp); + qunlock(&ip->fraglock4); + return nil; + } + BKFG(prev)->flen -= ovlap; + } + } + + /* Link onto assembly queue */ + bp->next = *l; + *l = bp; + + /* Check to see if succeeding segments overlap */ + if(bp->next) { + l = &bp->next; + fend = BKFG(bp)->foff + BKFG(bp)->flen; + /* Take completely covered segments out */ + while(*l) { + ovlap = fend - BKFG(*l)->foff; + if(ovlap <= 0) + break; + if(ovlap < BKFG(*l)->flen) { + BKFG(*l)->flen -= ovlap; + BKFG(*l)->foff += ovlap; + /* move up ih hdrs */ + memmove((*l)->rp + ovlap, (*l)->rp, IP4HDR); + (*l)->rp += ovlap; + break; + } + last = (*l)->next; + (*l)->next = nil; + freeblist(*l); + *l = last; + } + } + + /* + * look for a complete packet. if we get to a fragment + * without IP_MF set, we're done. 
/*
 * ipcsum - IPv4 header checksum.
 *
 * addr points at the first byte of the header.  The low nibble of
 * that byte gives the header length in 32-bit words; that many
 * bytes are summed as big-endian 16-bit words, the carries folded
 * back in, and the one's complement of the result returned.
 * A header that already carries a correct checksum yields 0.
 */
ushort
ipcsum(uchar *addr)
{
	ulong acc;
	int i, nbytes;

	nbytes = (addr[0] & 0xf) << 2;

	acc = 0;
	for(i = 0; i < nbytes; i += 2)
		acc += addr[i]<<8 | addr[i+1];

	/* two folds are enough to absorb every carry */
	acc = (acc >> 16) + (acc & 0xffff);
	acc = (acc >> 16) + (acc & 0xffff);

	return acc ^ 0xffff;
}
+typedef struct IP IP; +typedef struct IPaux IPaux; +typedef struct Ipself Ipself; +typedef struct Ipselftab Ipselftab; +typedef struct Iplink Iplink; +typedef struct Iplifc Iplifc; +typedef struct Ipmulti Ipmulti; +typedef struct Ipifc Ipifc; +typedef struct Iphash Iphash; +typedef struct Ipht Ipht; +typedef struct Netlog Netlog; +typedef struct Medium Medium; +typedef struct Proto Proto; +typedef struct Arpent Arpent; +typedef struct Arp Arp; +typedef struct Route Route; + +typedef struct Routerparams Routerparams; +typedef struct Hostparams Hostparams; +typedef struct v6router v6router; +typedef struct v6params v6params; + +enum +{ + Addrlen= 64, + Maxproto= 20, + Nhash= 64, + Maxincall= 5, + Nchans= 1024, + MAClen= 16, /* longest mac address */ + + MAXTTL= 255, + DFLTTOS= 0, + + IPaddrlen= 16, + IPv4addrlen= 4, + IPv4off= 12, + IPllen= 4, + + /* ip versions */ + V4= 4, + V6= 6, + IP_VER4= 0x40, + IP_VER6= 0x60, + IP_HLEN4= 5, /* v4: Header length in words */ + IP_DF= 0x4000, /* v4: Don't fragment */ + IP_MF= 0x2000, /* v4: More fragments */ + IP4HDR= 20, /* sizeof(Ip4hdr) */ + IP_MAX= 64*1024, /* Max. 
/*
 * on the wire packet header
 *
 * Overlaid directly on packet data (code casts bp->rp to Ip4hdr*),
 * so every field is a uchar or uchar array; multi-byte fields are
 * read and written with nhgets/hnputs rather than accessed as
 * integers.  IP4HDR (20) is documented as sizeof(Ip4hdr).
 */
typedef struct Ip4hdr Ip4hdr;
struct Ip4hdr
{
	uchar	vihl;		/* Version (high nibble, see BLKIPVER) and header length in words (low nibble) */
	uchar	tos;		/* Type of service; ipiput4 also overwrites it with a more-fragments marker before reassembly */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* ip->identification */
	uchar	frag[2];	/* Fragment information: IP_DF/IP_MF flags and offset in 8 byte units */
	uchar	ttl;		/* Time to live */
	uchar	proto;		/* Protocol */
	uchar	cksum[2];	/* Header checksum */
	uchar	src[4];		/* IP source */
	uchar	dst[4];		/* IP destination */
};
/*
 * Description of a link medium: a name, its size parameters and
 * the operations the IP code calls through ifc->m.  Entries may
 * be left unset (tripmedium only supplies a name).
 */
struct Medium
{
	char	*name;
	int	hsize;		/* medium header size; ipoput4 subtracts it from ifc->maxtu */
	int	mintu;		/* default min mtu */
	int	maxtu;		/* default max mtu */
	int	maclen;		/* mac address length */
	void	(*bind)(Ipifc*, int, char**);
	void	(*unbind)(Ipifc*);
	/* transmit one packet; ipoput4 passes V4 and the next hop address */
	void	(*bwrite)(Ipifc *ifc, Block *b, int version, uchar *ip);

	/* for arming interfaces to receive multicast */
	void	(*addmulti)(Ipifc *ifc, uchar *a, uchar *ia);
	void	(*remmulti)(Ipifc *ifc, uchar *a, uchar *ia);

	/* process packets written to 'data' */
	void	(*pktin)(Fs *f, Ipifc *ifc, Block *bp);

	/* routes for router boards */
	void	(*addroute)(Ipifc *ifc, int, uchar*, uchar*, uchar*, int);
	void	(*remroute)(Ipifc *ifc, int, uchar*, uchar*);
	void	(*flushroutes)(Ipifc *ifc);

	/* for routing multicast groups */
	void	(*joinmulti)(Ipifc *ifc, uchar *a, uchar *ia);
	void	(*leavemulti)(Ipifc *ifc, uchar *a, uchar *ia);

	/* address resolution */
	void	(*ares)(Fs*, int, uchar*, uchar*, int, int);	/* resolve */
	void	(*areg)(Ipifc*, uchar*);			/* register */

	/* v6 address generation */
	void	(*pref2addr)(uchar *pref, uchar *ea);

	int	unbindonclose;	/* if non-zero, unbind on last close */
};
/*
 * IPv6 router parameters (rfc 2461, pp.40-43).
 * default values, one per stack
 *
 * ip_init_6() fills in the per-stack copy: mflag and oflag 0,
 * maxraint 600000, minraint 200000, linkmtu/reachtime/rxmitra 0,
 * ttl MAXTTL and routerlt 3*maxraint.  Each Ipifc and v6router
 * also embeds its own copy.
 */
struct Routerparams {
	int	mflag;		/* flag: managed address configuration */
	int	oflag;		/* flag: other stateful configuration */
	int	maxraint;	/* max. router adv interval (ms) */
	int	minraint;	/* min. router adv interval (ms) */
	int	linkmtu;	/* mtu options; 0 means no mtu sent */
	int	reachtime;	/* reachable time */
	int	rxmitra;	/* retransmit interval */
	int	ttl;		/* cur hop count limit */
	int	routerlt;	/* router lifetime */
};
/*
 *  one per multiplexed protocol
 *
 *  A protocol fills in its name, ipproto number, method pointers,
 *  nc and ptclsize, then registers the structure with Fsproto()
 *  (see ilinit).  Methods a protocol does not provide are left nil
 *  (il leaves ctl and gc nil); ipiput4 checks rcv for nil before
 *  calling it.
 */
struct Proto
{
	QLock	qlock;

	char*	name;		/* protocol name */
	int	x;		/* protocol index */
	int	ipproto;	/* ip protocol type */

	char*	(*connect)(Conv*, char**, int);
	char*	(*announce)(Conv*, char**, int);
	char*	(*bind)(Conv*, char**, int);
	int	(*state)(Conv*, char*, int);
	void	(*create)(Conv*);
	void	(*close)(Conv*);
	void	(*rcv)(Proto*, Ipifc*, Block*);		/* input: called by ipiput4 with the received packet */
	char*	(*ctl)(Conv*, char**, int);
	void	(*advise)(Proto*, Block*, char*);
	int	(*stats)(Proto*, char*, int);
	int	(*local)(Conv*, char*, int);
	int	(*remote)(Conv*, char*, int);
	int	(*inuse)(Conv*);
	int	(*gc)(Proto*);	/* returns true if any conversations are freed */

	Fs	*f;		/* file system this proto is part of */
	Conv	**conv;		/* array of conversations; iladvise walks it up to the first nil entry */
	int	ptclsize;	/* size of per protocol ctl block */
	int	nc;		/* number of conversations */
	int	ac;
	Qid	qid;		/* qid for protocol directory */
	ushort	nextrport;

	void	*priv;		/* protocol private data (il stores its Ilpriv here) */
};
/*
 * IPv6 parameters for one stack, reached through Fs.v6p.
 * ip_init_6() allocates it, sets the router and host defaults and
 * starts with cdrouter = -1.
 */
struct v6params
{
	Routerparams	rp;		/* v6 params, one copy per node now */
	Hostparams	hp;		/* ip_init_6 sets hp.rxmithost to 1000 (v6 RETRANS_TIMER) */
	v6router	v6rlist[3];	/* max 3 default routers, currently */
	int		cdrouter;	/* uses only v6rlist[cdrouter] if   */
					/* cdrouter >= 0; initialised to -1 */
};
/*
 * One routing table entry.  The leading fields repeat the layout of
 * struct RouteTree (kept as a commented-out embedded member below);
 * the anonymous union then carries the v4 or v6 address range and
 * gateway.  ipoput4 uses type (Rifc, Runi, Rbcast, Rmulti, ...) to
 * choose between the destination and v4.gate as next hop.
 */
struct Route
{
/*	RouteTree; */
	Route*	right;
	Route*	left;
	Route*	mid;
	uchar	depth;
	uchar	type;		/* type bits (Rv4, Rifc, Rptpt, Runi, Rbcast, Rmulti, Rproxy) */
	uchar	ifcid;		/* must match ifc->ifcid */
	Ipifc	*ifc;
	char	tag[4];
	int	ref;

	union {
		V6route	v6;
		V4route v4;
	};
};
/*
 * One address resolution cache entry (arp.c): an IP address, the
 * MAC address recorded for it, and retransmission bookkeeping.
 * NOTE(review): hold/last look like a queue of blocks waiting for
 * resolution -- confirm against arp.c.
 */
struct Arpent
{
	uchar	ip[IPaddrlen];
	uchar	mac[MAClen];
	Medium	*type;		/* media type */
	Arpent*	hash;
	Block*	hold;
	Block*	last;
	uint	ctime;		/* time entry was created or refreshed */
	uint	utime;		/* time entry was last used */
	uchar	state;
	Arpent	*nextrxt;	/* re-transmit chain */
	uint	rtime;		/* time for next retransmission */
	uchar	rxtsrem;
	Ipifc	*ifc;
	uchar	ifcid;		/* must match ifc->ifcid */
};
IPv4allrouter[IPaddrlen]; +extern uchar IPnoaddr[IPaddrlen]; +extern uchar v4prefix[IPaddrlen]; +extern uchar IPallbits[IPaddrlen]; + +#define NOW msec() + +/* + * media + */ +extern Medium ethermedium; +extern Medium nullmedium; +extern Medium pktmedium; +extern Medium tripmedium; + +/* + * ipifc.c + */ +extern Medium* ipfindmedium(char *name); +extern void addipmedium(Medium *med); +extern int ipforme(Fs*, uchar *addr); +extern int iptentative(Fs*, uchar *addr); +extern int ipisbm(uchar *); +extern int ipismulticast(uchar *); +extern Ipifc* findipifc(Fs*, uchar *remote, int type); +extern void findlocalip(Fs*, uchar *local, uchar *remote); +extern int ipv4local(Ipifc *ifc, uchar *addr); +extern int ipv6local(Ipifc *ifc, uchar *addr); +extern int ipv6anylocal(Ipifc *ifc, uchar *addr); +extern Iplifc* iplocalonifc(Ipifc *ifc, uchar *ip); +extern int ipproxyifc(Fs *f, Ipifc *ifc, uchar *ip); +extern int ipismulticast(uchar *ip); +extern int ipisbooting(void); +extern int ipifccheckin(Ipifc *ifc, Medium *med); +extern void ipifccheckout(Ipifc *ifc); +extern int ipifcgrab(Ipifc *ifc); +extern void ipifcaddroute(Fs*, int, uchar*, uchar*, uchar*, int); +extern void ipifcremroute(Fs*, int, uchar*, uchar*); +extern void ipifcremmulti(Conv *c, uchar *ma, uchar *ia); +extern void ipifcaddmulti(Conv *c, uchar *ma, uchar *ia); +extern char* ipifcrem(Ipifc *ifc, char **argv, int argc); +extern char* ipifcadd(Ipifc *ifc, char **argv, int argc, int tentative, Iplifc *lifcp); +extern long ipselftabread(Fs*, char *a, ulong offset, int n); +extern char* ipifcadd6(Ipifc *ifc, char**argv, int argc); +/* + * ip.c + */ +extern void iprouting(Fs*, int); +extern void icmpnoconv(Fs*, Block*); +extern void icmpcantfrag(Fs*, Block*, int); +extern void icmpttlexceeded(Fs*, uchar*, Block*); +extern ushort ipcsum(uchar*); +extern void ipiput4(Fs*, Ipifc*, Block*); +extern void ipiput6(Fs*, Ipifc*, Block*); +extern int ipoput4(Fs*, Block*, int, int, int, Conv*); +extern int ipoput6(Fs*, Block*, 
int, int, int, Conv*); +extern int ipstats(Fs*, char*, int); +extern ushort ptclbsum(uchar*, int); +extern ushort ptclcsum(Block*, int, int); +extern void ip_init(Fs*); +extern void update_mtucache(uchar*, ulong); +extern ulong restrict_mtu(uchar*, ulong); +/* + * bootp.c + */ +extern char* bootp(Ipifc*); +extern int bootpread(char*, ulong, int); + +/* + * resolving inferno/plan9 differences + */ +Chan* commonfdtochan(int, int, int, int); +char* commonuser(void); +char* commonerror(void); + +/* + * chandial.c + */ +extern Chan* chandial(char*, char*, char*, Chan**); + +/* + * global to all of the stack + */ +extern void (*igmpreportfn)(Ipifc*, uchar*); diff --git a/src/9vx/a/ip/ipaux.c b/src/9vx/a/ip/ipaux.c @@ -0,0 +1,368 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include "ip.h" +#include "ipv6.h" + +char *v6hdrtypes[Maxhdrtype] = +{ + [HBH] "HopbyHop", + [ICMP] "ICMP", + [IGMP] "IGMP", + [GGP] "GGP", + [IPINIP] "IP", + [ST] "ST", + [TCP] "TCP", + [UDP] "UDP", + [ISO_TP4] "ISO_TP4", + [RH] "Routinghdr", + [FH] "Fraghdr", + [IDRP] "IDRP", + [RSVP] "RSVP", + [AH] "Authhdr", + [ESP] "ESP", + [ICMPv6] "ICMPv6", + [NNH] "Nonexthdr", + [ISO_IP] "ISO_IP", + [IGRP] "IGRP", + [OSPF] "OSPF", +}; + +/* + * well known IPv6 addresses + */ +uchar v6Unspecified[IPaddrlen] = { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 +}; +uchar v6loopback[IPaddrlen] = { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0x01 +}; + +uchar v6linklocal[IPaddrlen] = { + 0xfe, 0x80, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 +}; +uchar v6linklocalmask[IPaddrlen] = { + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, + 0, 0, 0, 0 +}; +int v6llpreflen = 8; /* link-local prefix length in bytes */ + +uchar v6multicast[IPaddrlen] = { + 0xff, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 +}; +uchar v6multicastmask[IPaddrlen] = { + 0xff, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 +}; +int 
v6mcpreflen = 1; /* multicast prefix length */ + +uchar v6allnodesN[IPaddrlen] = { + 0xff, 0x01, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0x01 +}; +uchar v6allroutersN[IPaddrlen] = { + 0xff, 0x01, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0x02 +}; +uchar v6allnodesNmask[IPaddrlen] = { + 0xff, 0xff, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 +}; +int v6aNpreflen = 2; /* all nodes (N) prefix */ + +uchar v6allnodesL[IPaddrlen] = { + 0xff, 0x02, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0x01 +}; +uchar v6allroutersL[IPaddrlen] = { + 0xff, 0x02, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0x02 +}; +uchar v6allnodesLmask[IPaddrlen] = { + 0xff, 0xff, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 +}; +int v6aLpreflen = 2; /* all nodes (L) prefix */ + +uchar v6solicitednode[IPaddrlen] = { + 0xff, 0x02, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0x01, + 0xff, 0, 0, 0 +}; +uchar v6solicitednodemask[IPaddrlen] = { + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, + 0xff, 0x0, 0x0, 0x0 +}; +int v6snpreflen = 13; + +ushort +ptclcsum(Block *bp, int offset, int len) +{ + uchar *addr; + ulong losum, hisum; + ushort csum; + int odd, blocklen, x; + + /* Correct to front of data area */ + while(bp != nil && offset && offset >= BLEN(bp)) { + offset -= BLEN(bp); + bp = bp->next; + } + if(bp == nil) + return 0; + + addr = bp->rp + offset; + blocklen = BLEN(bp) - offset; + + if(bp->next == nil) { + if(blocklen < len) + len = blocklen; + return ~ptclbsum(addr, len) & 0xffff; + } + + losum = 0; + hisum = 0; + + odd = 0; + while(len) { + x = blocklen; + if(len < x) + x = len; + + csum = ptclbsum(addr, x); + if(odd) + hisum += csum; + else + losum += csum; + odd = (odd+x) & 1; + len -= x; + + bp = bp->next; + if(bp == nil) + break; + blocklen = BLEN(bp); + addr = bp->rp; + } + + losum += hisum>>8; + losum += (hisum&0xff)<<8; + while((csum = losum>>16) != 0) + losum = csum + (losum & 0xffff); + + return ~losum & 0xffff; +} + +enum +{ + Isprefix= 16, +}; + 
+#define CLASS(p) ((*(uchar*)(p))>>6) + +void +ipv62smcast(uchar *smcast, uchar *a) +{ + assert(IPaddrlen == 16); + memmove(smcast, v6solicitednode, IPaddrlen); + smcast[13] = a[13]; + smcast[14] = a[14]; + smcast[15] = a[15]; +} + + +/* + * parse a hex mac address + */ +int +parsemac(uchar *to, char *from, int len) +{ + char nip[4]; + char *p; + int i; + + p = from; + memset(to, 0, len); + for(i = 0; i < len; i++){ + if(p[0] == '\0' || p[1] == '\0') + break; + + nip[0] = p[0]; + nip[1] = p[1]; + nip[2] = '\0'; + p += 2; + + to[i] = strtoul(nip, 0, 16); + if(*p == ':') + p++; + } + return i; +} + +/* + * hashing tcp, udp, ... connections + */ +ulong +iphash(uchar *sa, ushort sp, uchar *da, ushort dp) +{ + return (ulong)(sa[IPaddrlen-1]<<24 ^ sp<< 16 ^ da[IPaddrlen-1]<<8 ^ dp) % Nhash; +} + +void +iphtadd(Ipht *ht, Conv *c) +{ + ulong hv; + Iphash *h; + + hv = iphash(c->raddr, c->rport, c->laddr, c->lport); + h = smalloc(sizeof(*h)); + if(ipcmp(c->raddr, IPnoaddr) != 0) + h->match = IPmatchexact; + else { + if(ipcmp(c->laddr, IPnoaddr) != 0){ + if(c->lport == 0) + h->match = IPmatchaddr; + else + h->match = IPmatchpa; + } else { + if(c->lport == 0) + h->match = IPmatchany; + else + h->match = IPmatchport; + } + } + h->c = c; + + LOCK(ht); + h->next = ht->tab[hv]; + ht->tab[hv] = h; + UNLOCK(ht); +} + +void +iphtrem(Ipht *ht, Conv *c) +{ + ulong hv; + Iphash **l, *h; + + hv = iphash(c->raddr, c->rport, c->laddr, c->lport); + LOCK(ht); + for(l = &ht->tab[hv]; (*l) != nil; l = &(*l)->next) + if((*l)->c == c){ + h = *l; + (*l) = h->next; + free(h); + break; + } + UNLOCK(ht); +} + +/* look for a matching conversation with the following precedence + * connected && raddr,rport,laddr,lport + * announced && laddr,lport + * announced && *,lport + * announced && laddr,* + * announced && *,* + */ +Conv* +iphtlook(Ipht *ht, uchar *sa, ushort sp, uchar *da, ushort dp) +{ + ulong hv; + Iphash *h; + Conv *c; + + /* exact 4 pair match (connection) */ + hv = iphash(sa, sp, da, dp); + 
LOCK(ht); + for(h = ht->tab[hv]; h != nil; h = h->next){ + if(h->match != IPmatchexact) + continue; + c = h->c; + if(sp == c->rport && dp == c->lport + && ipcmp(sa, c->raddr) == 0 && ipcmp(da, c->laddr) == 0){ + UNLOCK(ht); + return c; + } + } + + /* match local address and port */ + hv = iphash(IPnoaddr, 0, da, dp); + for(h = ht->tab[hv]; h != nil; h = h->next){ + if(h->match != IPmatchpa) + continue; + c = h->c; + if(dp == c->lport && ipcmp(da, c->laddr) == 0){ + UNLOCK(ht); + return c; + } + } + + /* match just port */ + hv = iphash(IPnoaddr, 0, IPnoaddr, dp); + for(h = ht->tab[hv]; h != nil; h = h->next){ + if(h->match != IPmatchport) + continue; + c = h->c; + if(dp == c->lport){ + UNLOCK(ht); + return c; + } + } + + /* match local address */ + hv = iphash(IPnoaddr, 0, da, 0); + for(h = ht->tab[hv]; h != nil; h = h->next){ + if(h->match != IPmatchaddr) + continue; + c = h->c; + if(ipcmp(da, c->laddr) == 0){ + UNLOCK(ht); + return c; + } + } + + /* look for something that matches anything */ + hv = iphash(IPnoaddr, 0, IPnoaddr, 0); + for(h = ht->tab[hv]; h != nil; h = h->next){ + if(h->match != IPmatchany) + continue; + c = h->c; + UNLOCK(ht); + return c; + } + UNLOCK(ht); + return nil; +} diff --git a/src/9vx/a/ip/ipifc.c b/src/9vx/a/ip/ipifc.c @@ -0,0 +1,1654 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" +#include "ipv6.h" + +#define DPRINT if(0)print + +enum { + Maxmedia = 32, + Nself = Maxmedia*5, + NHASH = 1<<6, + NCACHE = 256, + QMAX = 64*1024-1, +}; + +Medium *media[Maxmedia] = { 0 }; + +/* + * cache of local addresses (addresses we answer to) + */ +struct Ipself +{ + uchar a[IPaddrlen]; + Ipself *hnext; /* next address in the hash table */ + Iplink *link; /* binding twixt Ipself and Ipifc */ + ulong expire; + uchar type; /* type of address */ + int ref; + Ipself *next; /* free list */ +}; + +struct Ipselftab +{ + QLock qlock; + int inited; + int acceptall; /* true if an 
interface has the null address */ + Ipself *hash[NHASH]; /* hash chains */ +}; + +/* + * Multicast addresses are chained onto a Chan so that + * we can remove them when the Chan is closed. + */ +typedef struct Ipmcast Ipmcast; +struct Ipmcast +{ + Ipmcast *next; + uchar ma[IPaddrlen]; /* multicast address */ + uchar ia[IPaddrlen]; /* interface address */ +}; + +/* quick hash for ip addresses */ +#define hashipa(a) ( (ulong)(((a)[IPaddrlen-2]<<8) | (a)[IPaddrlen-1])%NHASH ) + +static char tifc[] = "ifc "; + +static void addselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uchar *a, int type); +static void remselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uchar *a); +static char* ipifcjoinmulti(Ipifc *ifc, char **argv, int argc); +static char* ipifcleavemulti(Ipifc *ifc, char **argv, int argc); +static void ipifcregisterproxy(Fs*, Ipifc*, uchar*); +static char* ipifcremlifc(Ipifc*, Iplifc*); + +/* + * link in a new medium + */ +void +addipmedium(Medium *med) +{ + int i; + + for(i = 0; i < nelem(media)-1; i++) + if(media[i] == nil){ + media[i] = med; + break; + } +} + +/* + * find the medium with this name + */ +Medium* +ipfindmedium(char *name) +{ + Medium **mp; + + for(mp = media; *mp != nil; mp++) + if(strcmp((*mp)->name, name) == 0) + break; + return *mp; +} + +/* + * attach a device (or pkt driver) to the interface. 
+ * called with c locked + */ +static char* +ipifcbind(Conv *c, char **argv, int argc) +{ + Ipifc *ifc; + Medium *m; + + if(argc < 2) + return Ebadarg; + + ifc = (Ipifc*)c->ptcl; + + /* bind the device to the interface */ + m = ipfindmedium(argv[1]); + if(m == nil) + return "unknown interface type"; + + WLOCK(ifc); + if(ifc->m != nil){ + WUNLOCK(ifc); + return "interface already bound"; + } + if(waserror()){ + WUNLOCK(ifc); + nexterror(); + } + + /* do medium specific binding */ + (*m->bind)(ifc, argc, argv); + + /* set the bound device name */ + if(argc > 2) + strncpy(ifc->dev, argv[2], sizeof(ifc->dev)); + else + snprint(ifc->dev, sizeof ifc->dev, "%s%d", m->name, c->x); + ifc->dev[sizeof(ifc->dev)-1] = 0; + + /* set up parameters */ + ifc->m = m; + ifc->mintu = ifc->m->mintu; + ifc->maxtu = ifc->m->maxtu; + if(ifc->m->unbindonclose == 0) + ifc->conv->inuse++; + ifc->rp.mflag = 0; /* default not managed */ + ifc->rp.oflag = 0; + ifc->rp.maxraint = 600000; /* millisecs */ + ifc->rp.minraint = 200000; + ifc->rp.linkmtu = 0; /* no mtu sent */ + ifc->rp.reachtime = 0; + ifc->rp.rxmitra = 0; + ifc->rp.ttl = MAXTTL; + ifc->rp.routerlt = 3 * ifc->rp.maxraint; + + /* any ancillary structures (like routes) no longer pertain */ + ifc->ifcid++; + + /* reopen all the queues closed by a previous unbind */ + qreopen(c->rq); + qreopen(c->eq); + qreopen(c->sq); + + WUNLOCK(ifc); + poperror(); + + return nil; +} + +/* + * detach a device from an interface, close the interface + * called with ifc->conv closed + */ +static char* +ipifcunbind(Ipifc *ifc) +{ + char *err; + + if(waserror()){ + WUNLOCK(ifc); + nexterror(); + } + WLOCK(ifc); + + /* dissociate routes */ + if(ifc->m != nil && ifc->m->unbindonclose == 0) + ifc->conv->inuse--; + ifc->ifcid++; + + /* disassociate logical interfaces (before zeroing ifc->arg) */ + while(ifc->lifc){ + err = ipifcremlifc(ifc, ifc->lifc); + /* + * note: err non-zero means lifc not found, + * which can't happen in this case. 
+ */ + if(err) + error(err); + } + + /* disassociate device */ + if(ifc->m && ifc->m->unbind) + (*ifc->m->unbind)(ifc); + memset(ifc->dev, 0, sizeof(ifc->dev)); + ifc->arg = nil; + ifc->reassemble = 0; + + /* close queues to stop queuing of packets */ + qclose(ifc->conv->rq); + qclose(ifc->conv->wq); + qclose(ifc->conv->sq); + + ifc->m = nil; + WUNLOCK(ifc); + poperror(); + return nil; +} + +char sfixedformat[] = "device %s maxtu %d sendra %d recvra %d mflag %d oflag" +" %d maxraint %d minraint %d linkmtu %d reachtime %d rxmitra %d ttl %d routerlt" +" %d pktin %lud pktout %lud errin %lud errout %lud\n"; + +char slineformat[] = " %-40I %-10M %-40I %-12lud %-12lud\n"; + +static int +ipifcstate(Conv *c, char *state, int n) +{ + Ipifc *ifc; + Iplifc *lifc; + int m; + + ifc = (Ipifc*)c->ptcl; + m = snprint(state, n, sfixedformat, + ifc->dev, ifc->maxtu, ifc->sendra6, ifc->recvra6, + ifc->rp.mflag, ifc->rp.oflag, ifc->rp.maxraint, + ifc->rp.minraint, ifc->rp.linkmtu, ifc->rp.reachtime, + ifc->rp.rxmitra, ifc->rp.ttl, ifc->rp.routerlt, + ifc->in, ifc->out, ifc->inerr, ifc->outerr); + + RLOCK(ifc); + for(lifc = ifc->lifc; lifc && n > m; lifc = lifc->next) + m += snprint(state+m, n - m, slineformat, lifc->local, + lifc->mask, lifc->remote, lifc->validlt, lifc->preflt); + if(ifc->lifc == nil) + m += snprint(state+m, n - m, "\n"); + RUNLOCK(ifc); + return m; +} + +static int +ipifclocal(Conv *c, char *state, int n) +{ + Ipifc *ifc; + Iplifc *lifc; + Iplink *link; + int m; + + ifc = (Ipifc*)c->ptcl; + m = 0; + + RLOCK(ifc); + for(lifc = ifc->lifc; lifc; lifc = lifc->next){ + m += snprint(state+m, n - m, "%-40.40I ->", lifc->local); + for(link = lifc->link; link; link = link->lifclink) + m += snprint(state+m, n - m, " %-40.40I", link->self->a); + m += snprint(state+m, n - m, "\n"); + } + RUNLOCK(ifc); + return m; +} + +static int +ipifcinuse(Conv *c) +{ + Ipifc *ifc; + + ifc = (Ipifc*)c->ptcl; + return ifc->m != nil; +} + +/* + * called when a process writes to an interface's 
'data' + */ +static void +ipifckick(void *x) +{ + Conv *c = x; + Block *bp; + Ipifc *ifc; + + bp = qget(c->wq); + if(bp == nil) + return; + + ifc = (Ipifc*)c->ptcl; + if(!CANRLOCK(ifc)){ + freeb(bp); + return; + } + if(waserror()){ + RUNLOCK(ifc); + nexterror(); + } + if(ifc->m == nil || ifc->m->pktin == nil) + freeb(bp); + else + (*ifc->m->pktin)(c->p->f, ifc, bp); + RUNLOCK(ifc); + poperror(); +} + +/* + * called when a new ipifc structure is created + */ +static void +ipifccreate(Conv *c) +{ + Ipifc *ifc; + + c->rq = qopen(QMAX, 0, 0, 0); + c->sq = qopen(2*QMAX, 0, 0, 0); + c->wq = qopen(QMAX, Qkick, ipifckick, c); + ifc = (Ipifc*)c->ptcl; + ifc->conv = c; + ifc->unbinding = 0; + ifc->m = nil; + ifc->reassemble = 0; +} + +/* + * called after last close of ipifc data or ctl + * called with c locked, we must unlock + */ +static void +ipifcclose(Conv *c) +{ + Ipifc *ifc; + Medium *m; + + ifc = (Ipifc*)c->ptcl; + m = ifc->m; + if(m && m->unbindonclose) + ipifcunbind(ifc); +} + +/* + * change an interface's mtu + */ +char* +ipifcsetmtu(Ipifc *ifc, char **argv, int argc) +{ + int mtu; + + if(argc < 2 || ifc->m == nil) + return Ebadarg; + mtu = strtoul(argv[1], 0, 0); + if(mtu < ifc->m->mintu || mtu > ifc->m->maxtu) + return Ebadarg; + ifc->maxtu = mtu; + return nil; +} + +/* + * add an address to an interface. 
+ */ +char* +ipifcadd(Ipifc *ifc, char **argv, int argc, int tentative, Iplifc *lifcp) +{ + int i, type, mtu, sendnbrdisc = 0; + uchar ip[IPaddrlen], mask[IPaddrlen], rem[IPaddrlen]; + uchar bcast[IPaddrlen], net[IPaddrlen]; + Iplifc *lifc, **l; + Fs *f; + + if(ifc->m == nil) + return "ipifc not yet bound to device"; + + f = ifc->conv->p->f; + + type = Rifc; + memset(ip, 0, IPaddrlen); + memset(mask, 0, IPaddrlen); + memset(rem, 0, IPaddrlen); + switch(argc){ + case 6: + if(strcmp(argv[5], "proxy") == 0) + type |= Rproxy; + /* fall through */ + case 5: + mtu = strtoul(argv[4], 0, 0); + if(mtu >= ifc->m->mintu && mtu <= ifc->m->maxtu) + ifc->maxtu = mtu; + /* fall through */ + case 4: + if (parseip(ip, argv[1]) == -1 || parseip(rem, argv[3]) == -1) + return Ebadip; + parseipmask(mask, argv[2]); + maskip(rem, mask, net); + break; + case 3: + if (parseip(ip, argv[1]) == -1) + return Ebadip; + parseipmask(mask, argv[2]); + maskip(ip, mask, rem); + maskip(rem, mask, net); + break; + case 2: + if (parseip(ip, argv[1]) == -1) + return Ebadip; + memmove(mask, defmask(ip), IPaddrlen); + maskip(ip, mask, rem); + maskip(rem, mask, net); + break; + default: + return Ebadarg; + } + if(isv4(ip)) + tentative = 0; + WLOCK(ifc); + + /* ignore if this is already a local address for this ifc */ + for(lifc = ifc->lifc; lifc; lifc = lifc->next) { + if(ipcmp(lifc->local, ip) == 0) { + if(lifc->tentative != tentative) + lifc->tentative = tentative; + if(lifcp) { + lifc->onlink = lifcp->onlink; + lifc->autoflag = lifcp->autoflag; + lifc->validlt = lifcp->validlt; + lifc->preflt = lifcp->preflt; + lifc->origint = lifcp->origint; + } + goto out; + } + } + + /* add the address to the list of logical ifc's for this ifc */ + lifc = smalloc(sizeof(Iplifc)); + ipmove(lifc->local, ip); + ipmove(lifc->mask, mask); + ipmove(lifc->remote, rem); + ipmove(lifc->net, net); + lifc->tentative = tentative; + if(lifcp) { + lifc->onlink = lifcp->onlink; + lifc->autoflag = lifcp->autoflag; + lifc->validlt = 
lifcp->validlt; + lifc->preflt = lifcp->preflt; + lifc->origint = lifcp->origint; + } else { /* default values */ + lifc->onlink = lifc->autoflag = 1; + lifc->validlt = lifc->preflt = ~0L; + lifc->origint = NOW / 1000; + } + lifc->next = nil; + + for(l = &ifc->lifc; *l; l = &(*l)->next) + ; + *l = lifc; + + /* check for point-to-point interface */ + if(ipcmp(ip, v6loopback)) /* skip v6 loopback, it's a special address */ + if(ipcmp(mask, IPallbits) == 0) + type |= Rptpt; + + /* add local routes */ + if(isv4(ip)) + v4addroute(f, tifc, rem+IPv4off, mask+IPv4off, rem+IPv4off, type); + else + v6addroute(f, tifc, rem, mask, rem, type); + + addselfcache(f, ifc, lifc, ip, Runi); + + if((type & (Rproxy|Rptpt)) == (Rproxy|Rptpt)){ + ipifcregisterproxy(f, ifc, rem); + goto out; + } + + if(isv4(ip) || ipcmp(ip, IPnoaddr) == 0) { + /* add subnet directed broadcast address to the self cache */ + for(i = 0; i < IPaddrlen; i++) + bcast[i] = (ip[i] & mask[i]) | ~mask[i]; + addselfcache(f, ifc, lifc, bcast, Rbcast); + + /* add subnet directed network address to the self cache */ + for(i = 0; i < IPaddrlen; i++) + bcast[i] = (ip[i] & mask[i]) & mask[i]; + addselfcache(f, ifc, lifc, bcast, Rbcast); + + /* add network directed broadcast address to the self cache */ + memmove(mask, defmask(ip), IPaddrlen); + for(i = 0; i < IPaddrlen; i++) + bcast[i] = (ip[i] & mask[i]) | ~mask[i]; + addselfcache(f, ifc, lifc, bcast, Rbcast); + + /* add network directed network address to the self cache */ + memmove(mask, defmask(ip), IPaddrlen); + for(i = 0; i < IPaddrlen; i++) + bcast[i] = (ip[i] & mask[i]) & mask[i]; + addselfcache(f, ifc, lifc, bcast, Rbcast); + + addselfcache(f, ifc, lifc, IPv4bcast, Rbcast); + } + else { + if(ipcmp(ip, v6loopback) == 0) { + /* add node-local mcast address */ + addselfcache(f, ifc, lifc, v6allnodesN, Rmulti); + + /* add route for all node multicast */ + v6addroute(f, tifc, v6allnodesN, v6allnodesNmask, + v6allnodesN, Rmulti); + } + + /* add all nodes multicast 
address */ + addselfcache(f, ifc, lifc, v6allnodesL, Rmulti); + + /* add route for all nodes multicast */ + v6addroute(f, tifc, v6allnodesL, v6allnodesLmask, v6allnodesL, + Rmulti); + + /* add solicited-node multicast address */ + ipv62smcast(bcast, ip); + addselfcache(f, ifc, lifc, bcast, Rmulti); + + sendnbrdisc = 1; + } + + /* register the address on this network for address resolution */ + if(isv4(ip) && ifc->m->areg != nil) + (*ifc->m->areg)(ifc, ip); + +out: + WUNLOCK(ifc); + if(tentative && sendnbrdisc) + icmpns(f, 0, SRC_UNSPEC, ip, TARG_MULTI, ifc->mac); + return nil; +} + +/* + * remove a logical interface from an ifc + * always called with ifc WLOCK'd + */ +static char* +ipifcremlifc(Ipifc *ifc, Iplifc *lifc) +{ + Iplifc **l; + Fs *f; + + f = ifc->conv->p->f; + + /* + * find address on this interface and remove from chain. + * for pt to pt we actually specify the remote address as the + * addresss to remove. + */ + for(l = &ifc->lifc; *l != nil && *l != lifc; l = &(*l)->next) + ; + if(*l == nil) + return "address not on this interface"; + *l = lifc->next; + + /* disassociate any addresses */ + while(lifc->link) + remselfcache(f, ifc, lifc, lifc->link->self->a); + + /* remove the route for this logical interface */ + if(isv4(lifc->local)) + v4delroute(f, lifc->remote+IPv4off, lifc->mask+IPv4off, 1); + else { + v6delroute(f, lifc->remote, lifc->mask, 1); + if(ipcmp(lifc->local, v6loopback) == 0) + /* remove route for all node multicast */ + v6delroute(f, v6allnodesN, v6allnodesNmask, 1); + else if(memcmp(lifc->local, v6linklocal, v6llpreflen) == 0) + /* remove route for all link multicast */ + v6delroute(f, v6allnodesL, v6allnodesLmask, 1); + } + + free(lifc); + return nil; +} + +/* + * remove an address from an interface. 
+ * called with c->car locked + */ +char* +ipifcrem(Ipifc *ifc, char **argv, int argc) +{ + char *rv; + uchar ip[IPaddrlen], mask[IPaddrlen], rem[IPaddrlen]; + Iplifc *lifc; + + if(argc < 3) + return Ebadarg; + + if (parseip(ip, argv[1]) == -1) + return Ebadip; + parseipmask(mask, argv[2]); + if(argc < 4) + maskip(ip, mask, rem); + else + if (parseip(rem, argv[3]) == -1) + return Ebadip; + + WLOCK(ifc); + + /* + * find address on this interface and remove from chain. + * for pt to pt we actually specify the remote address as the + * addresss to remove. + */ + for(lifc = ifc->lifc; lifc != nil; lifc = lifc->next) { + if (memcmp(ip, lifc->local, IPaddrlen) == 0 + && memcmp(mask, lifc->mask, IPaddrlen) == 0 + && memcmp(rem, lifc->remote, IPaddrlen) == 0) + break; + } + + rv = ipifcremlifc(ifc, lifc); + WUNLOCK(ifc); + return rv; +} + +/* + * distribute routes to active interfaces like the + * TRIP linecards + */ +void +ipifcaddroute(Fs *f, int vers, uchar *addr, uchar *mask, uchar *gate, int type) +{ + Medium *m; + Conv **cp, **e; + Ipifc *ifc; + + e = &f->ipifc->conv[f->ipifc->nc]; + for(cp = f->ipifc->conv; cp < e; cp++){ + if(*cp != nil) { + ifc = (Ipifc*)(*cp)->ptcl; + m = ifc->m; + if(m && m->addroute) + m->addroute(ifc, vers, addr, mask, gate, type); + } + } +} + +void +ipifcremroute(Fs *f, int vers, uchar *addr, uchar *mask) +{ + Medium *m; + Conv **cp, **e; + Ipifc *ifc; + + e = &f->ipifc->conv[f->ipifc->nc]; + for(cp = f->ipifc->conv; cp < e; cp++){ + if(*cp != nil) { + ifc = (Ipifc*)(*cp)->ptcl; + m = ifc->m; + if(m && m->remroute) + m->remroute(ifc, vers, addr, mask); + } + } +} + +/* + * associate an address with the interface. This wipes out any previous + * addresses. This is a macro that means, remove all the old interfaces + * and add a new one. 
+ */ +static char* +ipifcconnect(Conv* c, char **argv, int argc) +{ + char *err; + Ipifc *ifc; + + ifc = (Ipifc*)c->ptcl; + + if(ifc->m == nil) + return "ipifc not yet bound to device"; + + if(waserror()){ + WUNLOCK(ifc); + nexterror(); + } + WLOCK(ifc); + while(ifc->lifc){ + err = ipifcremlifc(ifc, ifc->lifc); + if(err) + error(err); + } + WUNLOCK(ifc); + poperror(); + + err = ipifcadd(ifc, argv, argc, 0, nil); + if(err) + return err; + + Fsconnected(c, nil); + return nil; +} + +char* +ipifcra6(Ipifc *ifc, char **argv, int argc) +{ + int i, argsleft, vmax = ifc->rp.maxraint, vmin = ifc->rp.minraint; + + argsleft = argc - 1; + i = 1; + + if(argsleft % 2 != 0) + return Ebadarg; + + while (argsleft > 1) { + if(strcmp(argv[i], "recvra") == 0) + ifc->recvra6 = (atoi(argv[i+1]) != 0); + else if(strcmp(argv[i], "sendra") == 0) + ifc->sendra6 = (atoi(argv[i+1]) != 0); + else if(strcmp(argv[i], "mflag") == 0) + ifc->rp.mflag = (atoi(argv[i+1]) != 0); + else if(strcmp(argv[i], "oflag") == 0) + ifc->rp.oflag = (atoi(argv[i+1]) != 0); + else if(strcmp(argv[i], "maxraint") == 0) + ifc->rp.maxraint = atoi(argv[i+1]); + else if(strcmp(argv[i], "minraint") == 0) + ifc->rp.minraint = atoi(argv[i+1]); + else if(strcmp(argv[i], "linkmtu") == 0) + ifc->rp.linkmtu = atoi(argv[i+1]); + else if(strcmp(argv[i], "reachtime") == 0) + ifc->rp.reachtime = atoi(argv[i+1]); + else if(strcmp(argv[i], "rxmitra") == 0) + ifc->rp.rxmitra = atoi(argv[i+1]); + else if(strcmp(argv[i], "ttl") == 0) + ifc->rp.ttl = atoi(argv[i+1]); + else if(strcmp(argv[i], "routerlt") == 0) + ifc->rp.routerlt = atoi(argv[i+1]); + else + return Ebadarg; + + argsleft -= 2; + i += 2; + } + + /* consistency check */ + if(ifc->rp.maxraint < ifc->rp.minraint) { + ifc->rp.maxraint = vmax; + ifc->rp.minraint = vmin; + return Ebadarg; + } + return nil; +} + +/* + * non-standard control messages. + * called with c->car locked. 
+ */ +static char* +ipifcctl(Conv* c, char**argv, int argc) +{ + Ipifc *ifc; + int i; + + ifc = (Ipifc*)c->ptcl; + if(strcmp(argv[0], "add") == 0) + return ipifcadd(ifc, argv, argc, 0, nil); + else if(strcmp(argv[0], "try") == 0) + return ipifcadd(ifc, argv, argc, 1, nil); + else if(strcmp(argv[0], "remove") == 0) + return ipifcrem(ifc, argv, argc); + else if(strcmp(argv[0], "unbind") == 0) + return ipifcunbind(ifc); + else if(strcmp(argv[0], "joinmulti") == 0) + return ipifcjoinmulti(ifc, argv, argc); + else if(strcmp(argv[0], "leavemulti") == 0) + return ipifcleavemulti(ifc, argv, argc); + else if(strcmp(argv[0], "mtu") == 0) + return ipifcsetmtu(ifc, argv, argc); + else if(strcmp(argv[0], "reassemble") == 0){ + ifc->reassemble = 1; + return nil; + } + else if(strcmp(argv[0], "iprouting") == 0){ + i = 1; + if(argc > 1) + i = atoi(argv[1]); + iprouting(c->p->f, i); + return nil; + } + else if(strcmp(argv[0], "add6") == 0) + return ipifcadd6(ifc, argv, argc); + else if(strcmp(argv[0], "ra6") == 0) + return ipifcra6(ifc, argv, argc); + return "unsupported ctl"; +} + +int +ipifcstats(Proto *ipifc, char *buf, int len) +{ + return ipstats(ipifc->f, buf, len); +} + +void +ipifcinit(Fs *f) +{ + Proto *ipifc; + + ipifc = smalloc(sizeof(Proto)); + ipifc->name = "ipifc"; + ipifc->connect = ipifcconnect; + ipifc->announce = nil; + ipifc->bind = ipifcbind; + ipifc->state = ipifcstate; + ipifc->create = ipifccreate; + ipifc->close = ipifcclose; + ipifc->rcv = nil; + ipifc->ctl = ipifcctl; + ipifc->advise = nil; + ipifc->stats = ipifcstats; + ipifc->inuse = ipifcinuse; + ipifc->local = ipifclocal; + ipifc->ipproto = -1; + ipifc->nc = Maxmedia; + ipifc->ptclsize = sizeof(Ipifc); + + f->ipifc = ipifc; /* hack for ipifcremroute, findipifc, ... 
*/ + f->self = smalloc(sizeof(Ipselftab)); /* hack for ipforme */ + + Fsproto(f, ipifc); +} + +/* + * add to self routing cache + * called with c->car locked + */ +static void +addselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uchar *a, int type) +{ + Ipself *p; + Iplink *lp; + int h; + + QLOCK(f->self); + + /* see if the address already exists */ + h = hashipa(a); + for(p = f->self->hash[h]; p; p = p->next) + if(memcmp(a, p->a, IPaddrlen) == 0) + break; + + /* allocate a local address and add to hash chain */ + if(p == nil){ + p = smalloc(sizeof(*p)); + ipmove(p->a, a); + p->type = type; + p->next = f->self->hash[h]; + f->self->hash[h] = p; + + /* if the null address, accept all packets */ + if(ipcmp(a, v4prefix) == 0 || ipcmp(a, IPnoaddr) == 0) + f->self->acceptall = 1; + } + + /* look for a link for this lifc */ + for(lp = p->link; lp; lp = lp->selflink) + if(lp->lifc == lifc) + break; + + /* allocate a lifc-to-local link and link to both */ + if(lp == nil){ + lp = smalloc(sizeof(*lp)); + lp->ref = 1; + lp->lifc = lifc; + lp->self = p; + lp->selflink = p->link; + p->link = lp; + lp->lifclink = lifc->link; + lifc->link = lp; + + /* add to routing table */ + if(isv4(a)) + v4addroute(f, tifc, a+IPv4off, IPallbits+IPv4off, + a+IPv4off, type); + else + v6addroute(f, tifc, a, IPallbits, a, type); + + if((type & Rmulti) && ifc->m->addmulti != nil) + (*ifc->m->addmulti)(ifc, a, lifc->local); + } else + lp->ref++; + + QUNLOCK(f->self); +} + +/* + * These structures are unlinked from their chains while + * other threads may be using them. To avoid excessive locking, + * just put them aside for a while before freeing them. 
+ * called with f->self locked + */ +static Iplink *freeiplink; +static Ipself *freeipself; + +static void +iplinkfree(Iplink *p) +{ + Iplink **l, *np; + ulong now = NOW; + + l = &freeiplink; + for(np = *l; np; np = *l){ + if(np->expire > now){ + *l = np->next; + free(np); + continue; + } + l = &np->next; + } + p->expire = now + 5000; /* give other threads 5 secs to get out */ + p->next = nil; + *l = p; +} + +static void +ipselffree(Ipself *p) +{ + Ipself **l, *np; + ulong now = NOW; + + l = &freeipself; + for(np = *l; np; np = *l){ + if(np->expire > now){ + *l = np->next; + free(np); + continue; + } + l = &np->next; + } + p->expire = now + 5000; /* give other threads 5 secs to get out */ + p->next = nil; + *l = p; +} + +/* + * Decrement reference for this address on this link. + * Unlink from selftab if this is the last ref. + * called with c->car locked + */ +static void +remselfcache(Fs *f, Ipifc *ifc, Iplifc *lifc, uchar *a) +{ + Ipself *p, **l; + Iplink *link, **l_self, **l_lifc; + + QLOCK(f->self); + + /* find the unique selftab entry */ + l = &f->self->hash[hashipa(a)]; + for(p = *l; p; p = *l){ + if(ipcmp(p->a, a) == 0) + break; + l = &p->next; + } + + if(p == nil) + goto out; + + /* + * walk down links from an ifc looking for one + * that matches the selftab entry + */ + l_lifc = &lifc->link; + for(link = *l_lifc; link; link = *l_lifc){ + if(link->self == p) + break; + l_lifc = &link->lifclink; + } + + if(link == nil) + goto out; + + /* + * walk down the links from the selftab looking for + * the one we just found + */ + l_self = &p->link; + for(link = *l_self; link; link = *l_self){ + if(link == *l_lifc) + break; + l_self = &link->selflink; + } + + if(link == nil) + panic("remselfcache"); + + if(--(link->ref) != 0) + goto out; + + if((p->type & Rmulti) && ifc->m->remmulti != nil) + (*ifc->m->remmulti)(ifc, a, lifc->local); + + /* ref == 0, remove from both chains and free the link */ + *l_lifc = link->lifclink; + *l_self = link->selflink; + 
iplinkfree(link); + + if(p->link != nil) + goto out; + + /* remove from routing table */ + if(isv4(a)) + v4delroute(f, a+IPv4off, IPallbits+IPv4off, 1); + else + v6delroute(f, a, IPallbits, 1); + + /* no more links, remove from hash and free */ + *l = p->next; + ipselffree(p); + + /* if IPnoaddr, forget */ + if(ipcmp(a, v4prefix) == 0 || ipcmp(a, IPnoaddr) == 0) + f->self->acceptall = 0; + +out: + QUNLOCK(f->self); +} + +static char *stformat = "%-44.44I %2.2d %4.4s\n"; +enum +{ + Nstformat= 41, +}; + +long +ipselftabread(Fs *f, char *cp, ulong offset, int n) +{ + int i, m, nifc, off; + Ipself *p; + Iplink *link; + char state[8]; + + m = 0; + off = offset; + QLOCK(f->self); + for(i = 0; i < NHASH && m < n; i++){ + for(p = f->self->hash[i]; p != nil && m < n; p = p->next){ + nifc = 0; + for(link = p->link; link; link = link->selflink) + nifc++; + routetype(p->type, state); + m += snprint(cp + m, n - m, stformat, p->a, nifc, state); + if(off > 0){ + off -= m; + m = 0; + } + } + } + QUNLOCK(f->self); + return m; +} + +int +iptentative(Fs *f, uchar *addr) +{ + Ipself *p; + + p = f->self->hash[hashipa(addr)]; + for(; p; p = p->next){ + if(ipcmp(addr, p->a) == 0) + return p->link->lifc->tentative; + } + return 0; +} + +/* + * returns + * 0 - no match + * Runi + * Rbcast + * Rmcast + */ +int +ipforme(Fs *f, uchar *addr) +{ + Ipself *p; + + p = f->self->hash[hashipa(addr)]; + for(; p; p = p->next){ + if(ipcmp(addr, p->a) == 0) + return p->type; + } + + /* hack to say accept anything */ + if(f->self->acceptall) + return Runi; + return 0; +} + +/* + * find the ifc on same net as the remote system. If none, + * return nil. 
+ */ +Ipifc* +findipifc(Fs *f, uchar *remote, int type) +{ + Ipifc *ifc, *x; + Iplifc *lifc; + Conv **cp, **e; + uchar gnet[IPaddrlen], xmask[IPaddrlen]; + + x = nil; + memset(xmask, 0, IPaddrlen); + + /* find most specific match */ + e = &f->ipifc->conv[f->ipifc->nc]; + for(cp = f->ipifc->conv; cp < e; cp++){ + if(*cp == 0) + continue; + ifc = (Ipifc*)(*cp)->ptcl; + for(lifc = ifc->lifc; lifc; lifc = lifc->next){ + maskip(remote, lifc->mask, gnet); + if(ipcmp(gnet, lifc->net) == 0){ + if(x == nil || ipcmp(lifc->mask, xmask) > 0){ + x = ifc; + ipmove(xmask, lifc->mask); + } + } + } + } + if(x != nil) + return x; + + /* for now for broadcast and multicast, just use first interface */ + if(type & (Rbcast|Rmulti)){ + for(cp = f->ipifc->conv; cp < e; cp++){ + if(*cp == 0) + continue; + ifc = (Ipifc*)(*cp)->ptcl; + if(ifc->lifc != nil) + return ifc; + } + } + return nil; +} + +enum { + unknownv6, /* UGH */ +// multicastv6, + unspecifiedv6, + linklocalv6, + globalv6, +}; + +int +v6addrtype(uchar *addr) +{ + if(islinklocal(addr) || + (isv6mcast(addr) && (addr[1] & 0xF) <= Link_local_scop)) + return linklocalv6; + else + return globalv6; +} + +#define v6addrcurr(lifc) ((lifc)->preflt == ~0L || \ + (lifc)->origint + (lifc)->preflt >= NOW/1000) + +static void +findprimaryipv6(Fs *f, uchar *local) +{ + int atype, atypel; + Conv **cp, **e; + Ipifc *ifc; + Iplifc *lifc; + + ipmove(local, v6Unspecified); + atype = unspecifiedv6; + + /* + * find "best" (global > link local > unspecified) + * local address; address must be current. 
+ */ + e = &f->ipifc->conv[f->ipifc->nc]; + for(cp = f->ipifc->conv; cp < e; cp++){ + if(*cp == 0) + continue; + ifc = (Ipifc*)(*cp)->ptcl; + for(lifc = ifc->lifc; lifc; lifc = lifc->next){ + atypel = v6addrtype(lifc->local); + if(atypel > atype && v6addrcurr(lifc)) { + ipmove(local, lifc->local); + atype = atypel; + if(atype == globalv6) + return; + } + } + } +} + +/* + * returns first ip address configured + */ +static void +findprimaryipv4(Fs *f, uchar *local) +{ + Conv **cp, **e; + Ipifc *ifc; + Iplifc *lifc; + + /* find first ifc local address */ + e = &f->ipifc->conv[f->ipifc->nc]; + for(cp = f->ipifc->conv; cp < e; cp++){ + if(*cp == 0) + continue; + ifc = (Ipifc*)(*cp)->ptcl; + if((lifc = ifc->lifc) != nil){ + ipmove(local, lifc->local); + return; + } + } +} + +/* + * find the local address 'closest' to the remote system, copy it to + * local and return the ifc for that address + */ +void +findlocalip(Fs *f, uchar *local, uchar *remote) +{ + int version, atype = unspecifiedv6, atypel = unknownv6; + int atyper, deprecated; + uchar gate[IPaddrlen], gnet[IPaddrlen]; + Ipifc *ifc; + Iplifc *lifc; + Route *r; + + QLOCK(f->ipifc); + r = v6lookup(f, remote, nil); + version = (memcmp(remote, v4prefix, IPv4off) == 0)? 
V4: V6; + + if(r != nil){ + ifc = r->ifc; + if(r->type & Rv4) + v4tov6(gate, r->v4.gate); + else { + ipmove(gate, r->v6.gate); + ipmove(local, v6Unspecified); + } + + switch(version) { + case V4: + /* find ifc address closest to the gateway to use */ + for(lifc = ifc->lifc; lifc; lifc = lifc->next){ + maskip(gate, lifc->mask, gnet); + if(ipcmp(gnet, lifc->net) == 0){ + ipmove(local, lifc->local); + goto out; + } + } + break; + case V6: + /* find ifc address with scope matching the destination */ + atyper = v6addrtype(remote); + deprecated = 0; + for(lifc = ifc->lifc; lifc; lifc = lifc->next){ + atypel = v6addrtype(lifc->local); + /* prefer appropriate scope */ + if((atypel > atype && atype < atyper) || + (atypel < atype && atype > atyper)){ + ipmove(local, lifc->local); + deprecated = !v6addrcurr(lifc); + atype = atypel; + } else if(atypel == atype){ + /* avoid deprecated addresses */ + if(deprecated && v6addrcurr(lifc)){ + ipmove(local, lifc->local); + atype = atypel; + deprecated = 0; + } + } + if(atype == atyper && !deprecated) + goto out; + } + if(atype >= atyper) + goto out; + break; + default: + panic("findlocalip: version %d", version); + } + } + + switch(version){ + case V4: + findprimaryipv4(f, local); + break; + case V6: + findprimaryipv6(f, local); + break; + default: + panic("findlocalip2: version %d", version); + } + +out: + QUNLOCK(f->ipifc); +} + +/* + * return first v4 address associated with an interface + */ +int +ipv4local(Ipifc *ifc, uchar *addr) +{ + Iplifc *lifc; + + for(lifc = ifc->lifc; lifc; lifc = lifc->next){ + if(isv4(lifc->local)){ + memmove(addr, lifc->local+IPv4off, IPv4addrlen); + return 1; + } + } + return 0; +} + +/* + * return first v6 address associated with an interface + */ +int +ipv6local(Ipifc *ifc, uchar *addr) +{ + Iplifc *lifc; + + for(lifc = ifc->lifc; lifc; lifc = lifc->next){ + if(!isv4(lifc->local) && !(lifc->tentative)){ + ipmove(addr, lifc->local); + return 1; + } + } + return 0; +} + +int +ipv6anylocal(Ipifc *ifc, 
uchar *addr) +{ + Iplifc *lifc; + + for(lifc = ifc->lifc; lifc; lifc = lifc->next){ + if(!isv4(lifc->local)){ + ipmove(addr, lifc->local); + return SRC_UNI; + } + } + return SRC_UNSPEC; +} + +/* + * see if this address is bound to the interface + */ +Iplifc* +iplocalonifc(Ipifc *ifc, uchar *ip) +{ + Iplifc *lifc; + + for(lifc = ifc->lifc; lifc; lifc = lifc->next) + if(ipcmp(ip, lifc->local) == 0) + return lifc; + return nil; +} + + +/* + * See if we're proxying for this address on this interface + */ +int +ipproxyifc(Fs *f, Ipifc *ifc, uchar *ip) +{ + Route *r; + uchar net[IPaddrlen]; + Iplifc *lifc; + + /* see if this is a direct connected pt to pt address */ + r = v6lookup(f, ip, nil); + if(r == nil || (r->type & (Rifc|Rproxy)) != (Rifc|Rproxy)) + return 0; + + /* see if this is on the right interface */ + for(lifc = ifc->lifc; lifc; lifc = lifc->next){ + maskip(ip, lifc->mask, net); + if(ipcmp(net, lifc->remote) == 0) + return 1; + } + return 0; +} + +/* + * return multicast version if any + */ +int +ipismulticast(uchar *ip) +{ + if(isv4(ip)){ + if(ip[IPv4off] >= 0xe0 && ip[IPv4off] < 0xf0) + return V4; + } + else if(ip[0] == 0xff) + return V6; + return 0; +} +int +ipisbm(uchar *ip) +{ + if(isv4(ip)){ + if(ip[IPv4off] >= 0xe0 && ip[IPv4off] < 0xf0) + return V4; + else if(ipcmp(ip, IPv4bcast) == 0) + return V4; + } + else if(ip[0] == 0xff) + return V6; + return 0; +} + + +/* + * add a multicast address to an interface, called with c->car locked + */ +void +ipifcaddmulti(Conv *c, uchar *ma, uchar *ia) +{ + Ipifc *ifc; + Iplifc *lifc; + Conv **p; + Ipmulti *multi, **l; + Fs *f; + + f = c->p->f; + + for(l = &c->multi; *l; l = &(*l)->next) + if(ipcmp(ma, (*l)->ma) == 0 && ipcmp(ia, (*l)->ia) == 0) + return; /* it's already there */ + + multi = *l = smalloc(sizeof(*multi)); + ipmove(multi->ma, ma); + ipmove(multi->ia, ia); + multi->next = nil; + + for(p = f->ipifc->conv; *p; p++){ + if((*p)->inuse == 0) + continue; + ifc = (Ipifc*)(*p)->ptcl; + if(waserror()){ + 
WUNLOCK(ifc); + nexterror(); + } + WLOCK(ifc); + for(lifc = ifc->lifc; lifc; lifc = lifc->next) + if(ipcmp(ia, lifc->local) == 0) + addselfcache(f, ifc, lifc, ma, Rmulti); + WUNLOCK(ifc); + poperror(); + } +} + + +/* + * remove a multicast address from an interface, called with c->car locked + */ +void +ipifcremmulti(Conv *c, uchar *ma, uchar *ia) +{ + Ipmulti *multi, **l; + Iplifc *lifc; + Conv **p; + Ipifc *ifc; + Fs *f; + + f = c->p->f; + + for(l = &c->multi; *l; l = &(*l)->next) + if(ipcmp(ma, (*l)->ma) == 0 && ipcmp(ia, (*l)->ia) == 0) + break; + + multi = *l; + if(multi == nil) + return; /* we don't have it open */ + + *l = multi->next; + + for(p = f->ipifc->conv; *p; p++){ + if((*p)->inuse == 0) + continue; + + ifc = (Ipifc*)(*p)->ptcl; + if(waserror()){ + WUNLOCK(ifc); + nexterror(); + } + WLOCK(ifc); + for(lifc = ifc->lifc; lifc; lifc = lifc->next) + if(ipcmp(ia, lifc->local) == 0) + remselfcache(f, ifc, lifc, ma); + WUNLOCK(ifc); + poperror(); + } + + free(multi); +} + +/* + * make lifc's join and leave multicast groups + */ +static char* +ipifcjoinmulti(Ipifc *ifc, char **argv, int argc) +{ + return nil; +} + +static char* +ipifcleavemulti(Ipifc *ifc, char **argv, int argc) +{ + return nil; +} + +static void +ipifcregisterproxy(Fs *f, Ipifc *ifc, uchar *ip) +{ + Conv **cp, **e; + Ipifc *nifc; + Iplifc *lifc; + Medium *m; + uchar net[IPaddrlen]; + + /* register the address on any network that will proxy for us */ + e = &f->ipifc->conv[f->ipifc->nc]; + + if(!isv4(ip)) { /* V6 */ + for(cp = f->ipifc->conv; cp < e; cp++){ + if(*cp == nil || (nifc = (Ipifc*)(*cp)->ptcl) == ifc) + continue; + RLOCK(nifc); + m = nifc->m; + if(m == nil || m->addmulti == nil) { + RUNLOCK(nifc); + continue; + } + for(lifc = nifc->lifc; lifc; lifc = lifc->next){ + maskip(ip, lifc->mask, net); + if(ipcmp(net, lifc->remote) == 0) { + /* add solicited-node multicast addr */ + ipv62smcast(net, ip); + addselfcache(f, nifc, lifc, net, Rmulti); + arpenter(f, V6, ip, nifc->mac, 6, 0); + // 
(*m->addmulti)(nifc, net, ip); + break; + } + } + RUNLOCK(nifc); + } + } + else { /* V4 */ + for(cp = f->ipifc->conv; cp < e; cp++){ + if(*cp == nil || (nifc = (Ipifc*)(*cp)->ptcl) == ifc) + continue; + RLOCK(nifc); + m = nifc->m; + if(m == nil || m->areg == nil){ + RUNLOCK(nifc); + continue; + } + for(lifc = nifc->lifc; lifc; lifc = lifc->next){ + maskip(ip, lifc->mask, net); + if(ipcmp(net, lifc->remote) == 0){ + (*m->areg)(nifc, ip); + break; + } + } + RUNLOCK(nifc); + } + } +} + + +/* added for new v6 mesg types */ +static void +adddefroute6(Fs *f, uchar *gate, int force) +{ + Route *r; + + r = v6lookup(f, v6Unspecified, nil); + /* + * route entries generated by all other means take precedence + * over router announcements. + */ + if (r && !force && strcmp(r->tag, "ra") != 0) + return; + + v6delroute(f, v6Unspecified, v6Unspecified, 1); + v6addroute(f, "ra", v6Unspecified, v6Unspecified, gate, 0); +} + +enum { + Ngates = 3, +}; + +char* +ipifcadd6(Ipifc *ifc, char**argv, int argc) +{ + int plen = 64; + long origint = NOW / 1000, preflt = ~0L, validlt = ~0L; + char addr[40], preflen[6]; + char *params[3]; + uchar autoflag = 1, onlink = 1; + uchar prefix[IPaddrlen]; + Iplifc *lifc; + + switch(argc) { + case 7: + preflt = atoi(argv[6]); + /* fall through */ + case 6: + validlt = atoi(argv[5]); + /* fall through */ + case 5: + autoflag = atoi(argv[4]); + /* fall through */ + case 4: + onlink = atoi(argv[3]); + /* fall through */ + case 3: + plen = atoi(argv[2]); + /* fall through */ + case 2: + break; + default: + return Ebadarg; + } + + if (parseip(prefix, argv[1]) != 6 || validlt < preflt || plen < 0 || + plen > 64 || islinklocal(prefix)) + return Ebadarg; + + lifc = smalloc(sizeof(Iplifc)); + lifc->onlink = (onlink != 0); + lifc->autoflag = (autoflag != 0); + lifc->validlt = validlt; + lifc->preflt = preflt; + lifc->origint = origint; + + /* issue "add" ctl msg for v6 link-local addr and prefix len */ + if(!ifc->m->pref2addr) + return Ebadarg; + 
ifc->m->pref2addr(prefix, ifc->mac); /* mac → v6 link-local addr */ + sprint(addr, "%I", prefix); + sprint(preflen, "/%d", plen); + params[0] = "add"; + params[1] = addr; + params[2] = preflen; + + return ipifcadd(ifc, params, 3, 0, lifc); +} diff --git a/src/9vx/a/ip/ipmux.c b/src/9vx/a/ip/ipmux.c @@ -0,0 +1,842 @@ +/* + * IP packet filter + */ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" +#include "ipv6.h" + +typedef struct Ipmuxrock Ipmuxrock; +typedef struct Ipmux Ipmux; + +typedef struct Myip4hdr Myip4hdr; +struct Myip4hdr +{ + uchar vihl; /* Version and header length */ + uchar tos; /* Type of service */ + uchar length[2]; /* packet length */ + uchar id[2]; /* ip->identification */ + uchar frag[2]; /* Fragment information */ + uchar ttl; /* Time to live */ + uchar proto; /* Protocol */ + uchar cksum[2]; /* Header checksum */ + uchar src[4]; /* IP source */ + uchar dst[4]; /* IP destination */ + + uchar data[1]; /* start of data */ +}; +Myip4hdr *ipoff = 0; + +enum +{ + Tproto, + Tdata, + Tiph, + Tdst, + Tsrc, + Tifc, + + Cother = 0, + Cbyte, /* single byte */ + Cmbyte, /* single byte with mask */ + Cshort, /* single short */ + Cmshort, /* single short with mask */ + Clong, /* single long */ + Cmlong, /* single long with mask */ + Cifc, + Cmifc, +}; + +char *ftname[] = +{ +[Tproto] "proto", +[Tdata] "data", +[Tiph] "iph", +[Tdst] "dst", +[Tsrc] "src", +[Tifc] "ifc", +}; + +/* + * a node in the decision tree + */ +struct Ipmux +{ + Ipmux *yes; + Ipmux *no; + uchar type; /* type of field(Txxxx) */ + uchar ctype; /* tupe of comparison(Cxxxx) */ + uchar len; /* length in bytes of item to compare */ + uchar n; /* number of items val points to */ + short off; /* offset of comparison */ + short eoff; /* end offset of comparison */ + uchar skiphdr; /* should offset start after ipheader */ + uchar *val; + uchar *mask; + uchar *e; /* val+n*len*/ + + int ref; /* so we can garbage collect */ 
+ Conv *conv; +}; + +/* + * someplace to hold per conversation data + */ +struct Ipmuxrock +{ + Ipmux *chain; +}; + +static int ipmuxsprint(Ipmux*, int, char*, int); +static void ipmuxkick(void *x); + +static char* +skipwhite(char *p) +{ + while(*p == ' ' || *p == '\t') + p++; + return p; +} + +static char* +follows(char *p, char c) +{ + char *f; + + f = strchr(p, c); + if(f == nil) + return nil; + *f++ = 0; + f = skipwhite(f); + if(*f == 0) + return nil; + return f; +} + +static Ipmux* +parseop(char **pp) +{ + char *p = *pp; + int type, off, end, len; + Ipmux *f; + + p = skipwhite(p); + if(strncmp(p, "dst", 3) == 0){ + type = Tdst; + off = (ulong)(ipoff->dst); + len = IPv4addrlen; + p += 3; + } + else if(strncmp(p, "src", 3) == 0){ + type = Tsrc; + off = (ulong)(ipoff->src); + len = IPv4addrlen; + p += 3; + } + else if(strncmp(p, "ifc", 3) == 0){ + type = Tifc; + off = -IPv4addrlen; + len = IPv4addrlen; + p += 3; + } + else if(strncmp(p, "proto", 5) == 0){ + type = Tproto; + off = (ulong)&(ipoff->proto); + len = 1; + p += 5; + } + else if(strncmp(p, "data", 4) == 0 || strncmp(p, "iph", 3) == 0){ + if(strncmp(p, "data", 4) == 0) { + type = Tdata; + p += 4; + } + else { + type = Tiph; + p += 3; + } + p = skipwhite(p); + if(*p != '[') + return nil; + p++; + off = strtoul(p, &p, 0); + if(off < 0 || off > (64-IP4HDR)) + return nil; + p = skipwhite(p); + if(*p != ':') + end = off; + else { + p++; + p = skipwhite(p); + end = strtoul(p, &p, 0); + if(end < off) + return nil; + p = skipwhite(p); + } + if(*p != ']') + return nil; + p++; + len = end - off + 1; + } + else + return nil; + + f = smalloc(sizeof(*f)); + f->type = type; + f->len = len; + f->off = off; + f->val = nil; + f->mask = nil; + f->n = 1; + f->ref = 1; + if(type == Tdata) + f->skiphdr = 1; + else + f->skiphdr = 0; + + return f; +} + +static int +htoi(char x) +{ + if(x >= '0' && x <= '9') + x -= '0'; + else if(x >= 'a' && x <= 'f') + x -= 'a' - 10; + else if(x >= 'A' && x <= 'F') + x -= 'A' - 10; + else + x = 
0; + return x; +} + +static int +hextoi(char *p) +{ + return (htoi(p[0])<<4) | htoi(p[1]); +} + +static void +parseval(uchar *v, char *p, int len) +{ + while(*p && len-- > 0){ + *v++ = hextoi(p); + p += 2; + } +} + +static Ipmux* +parsemux(char *p) +{ + int n, nomask; + Ipmux *f; + char *val; + char *mask; + char *vals[20]; + uchar *v; + + /* parse operand */ + f = parseop(&p); + if(f == nil) + return nil; + + /* find value */ + val = follows(p, '='); + if(val == nil) + goto parseerror; + + /* parse mask */ + mask = follows(p, '&'); + if(mask != nil){ + switch(f->type){ + case Tsrc: + case Tdst: + case Tifc: + f->mask = smalloc(f->len); + v4parseip(f->mask, mask); + break; + case Tdata: + case Tiph: + f->mask = smalloc(f->len); + parseval(f->mask, mask, f->len); + break; + default: + goto parseerror; + } + nomask = 0; + } else { + nomask = 1; + f->mask = smalloc(f->len); + memset(f->mask, 0xff, f->len); + } + + /* parse vals */ + f->n = getfields(val, vals, sizeof(vals)/sizeof(char*), 1, "|"); + if(f->n == 0) + goto parseerror; + f->val = smalloc(f->n*f->len); + v = f->val; + for(n = 0; n < f->n; n++){ + switch(f->type){ + case Tsrc: + case Tdst: + case Tifc: + v4parseip(v, vals[n]); + break; + case Tproto: + case Tdata: + case Tiph: + parseval(v, vals[n], f->len); + break; + } + v += f->len; + } + + f->eoff = f->off + f->len; + f->e = f->val + f->n*f->len; + f->ctype = Cother; + if(f->n == 1){ + switch(f->len){ + case 1: + f->ctype = nomask ? Cbyte : Cmbyte; + break; + case 2: + f->ctype = nomask ? Cshort : Cmshort; + break; + case 4: + if(f->type == Tifc) + f->ctype = nomask ? Cifc : Cmifc; + else + f->ctype = nomask ? Clong : Cmlong; + break; + } + } + return f; + +parseerror: + if(f->mask) + free(f->mask); + if(f->val) + free(f->val); + free(f); + return nil; +} + +/* + * Compare relative ordering of two ipmuxs. This doesn't compare the + * values, just the fields being looked at. 
+ * + * returns: <0 if a is a more specific match + * 0 if a and b are matching on the same fields + * >0 if b is a more specific match + */ +static int +ipmuxcmp(Ipmux *a, Ipmux *b) +{ + int n; + + /* compare types, lesser ones are more important */ + n = a->type - b->type; + if(n != 0) + return n; + + /* compare offsets, call earlier ones more specific */ + n = (a->off+((int)a->skiphdr)*(ulong)ipoff->data) - + (b->off+((int)b->skiphdr)*(ulong)ipoff->data); + if(n != 0) + return n; + + /* compare match lengths, longer ones are more specific */ + n = b->len - a->len; + if(n != 0) + return n; + + /* + * if we get here we have two entries matching + * the same bytes of the record. Now check + * the mask for equality. Longer masks are + * more specific. + */ + if(a->mask != nil && b->mask == nil) + return -1; + if(a->mask == nil && b->mask != nil) + return 1; + if(a->mask != nil && b->mask != nil){ + n = memcmp(b->mask, a->mask, a->len); + if(n != 0) + return n; + } + return 0; +} + +/* + * Compare the values of two ipmuxs. We're assuming that ipmuxcmp + * returned 0 comparing them. 
+ */ +static int +ipmuxvalcmp(Ipmux *a, Ipmux *b) +{ + int n; + + n = b->len*b->n - a->len*a->n; + if(n != 0) + return n; + return memcmp(a->val, b->val, a->len*a->n); +} + +/* + * add onto an existing ipmux chain in the canonical comparison + * order + */ +static void +ipmuxchain(Ipmux **l, Ipmux *f) +{ + for(; *l; l = &(*l)->yes) + if(ipmuxcmp(f, *l) < 0) + break; + f->yes = *l; + *l = f; +} + +/* + * copy a tree + */ +static Ipmux* +ipmuxcopy(Ipmux *f) +{ + Ipmux *nf; + + if(f == nil) + return nil; + nf = smalloc(sizeof *nf); + *nf = *f; + nf->no = ipmuxcopy(f->no); + nf->yes = ipmuxcopy(f->yes); + nf->val = smalloc(f->n*f->len); + nf->e = nf->val + f->len*f->n; + memmove(nf->val, f->val, f->n*f->len); + return nf; +} + +static void +ipmuxfree(Ipmux *f) +{ + if(f->val != nil) + free(f->val); + free(f); +} + +static void +ipmuxtreefree(Ipmux *f) +{ + if(f == nil) + return; + if(f->no != nil) + ipmuxfree(f->no); + if(f->yes != nil) + ipmuxfree(f->yes); + ipmuxfree(f); +} + +/* + * merge two trees + */ +static Ipmux* +ipmuxmerge(Ipmux *a, Ipmux *b) +{ + int n; + Ipmux *f; + + if(a == nil) + return b; + if(b == nil) + return a; + n = ipmuxcmp(a, b); + if(n < 0){ + f = ipmuxcopy(b); + a->yes = ipmuxmerge(a->yes, b); + a->no = ipmuxmerge(a->no, f); + return a; + } + if(n > 0){ + f = ipmuxcopy(a); + b->yes = ipmuxmerge(b->yes, a); + b->no = ipmuxmerge(b->no, f); + return b; + } + if(ipmuxvalcmp(a, b) == 0){ + a->yes = ipmuxmerge(a->yes, b->yes); + a->no = ipmuxmerge(a->no, b->no); + a->ref++; + ipmuxfree(b); + return a; + } + a->no = ipmuxmerge(a->no, b); + return a; +} + +/* + * remove a chain from a demux tree. This is like merging accept that + * we remove instead of insert. 
+ */ +static int +ipmuxremove(Ipmux **l, Ipmux *f) +{ + int n, rv; + Ipmux *ft; + + if(f == nil) + return 0; /* we've removed it all */ + if(*l == nil) + return -1; + + ft = *l; + n = ipmuxcmp(ft, f); + if(n < 0){ + /* *l is maching an earlier field, descend both paths */ + rv = ipmuxremove(&ft->yes, f); + rv += ipmuxremove(&ft->no, f); + return rv; + } + if(n > 0){ + /* f represents an earlier field than *l, this should be impossible */ + return -1; + } + + /* if we get here f and *l are comparing the same fields */ + if(ipmuxvalcmp(ft, f) != 0){ + /* different values mean mutually exclusive */ + return ipmuxremove(&ft->no, f); + } + + /* we found a match */ + if(--(ft->ref) == 0){ + /* + * a dead node implies the whole yes side is also dead. + * since our chain is constrained to be on that side, + * we're done. + */ + ipmuxtreefree(ft->yes); + *l = ft->no; + ipmuxfree(ft); + return 0; + } + + /* + * free the rest of the chain. it is constrained to match the + * yes side. + */ + return ipmuxremove(&ft->yes, f->yes); +} + +/* + * connection request is a semi separated list of filters + * e.g. proto=17;data[0:4]=11aa22bb;ifc=135.104.9.2&255.255.255.0 + * + * there's no protection against overlapping specs. 
+ */ +static char* +ipmuxconnect(Conv *c, char **argv, int argc) +{ + int i, n; + char *field[10]; + Ipmux *mux, *chain; + Ipmuxrock *r; + Fs *f; + + f = c->p->f; + + if(argc != 2) + return Ebadarg; + + n = getfields(argv[1], field, nelem(field), 1, ";"); + if(n <= 0) + return Ebadarg; + + chain = nil; + mux = nil; + for(i = 0; i < n; i++){ + mux = parsemux(field[i]); + if(mux == nil){ + ipmuxtreefree(chain); + return Ebadarg; + } + ipmuxchain(&chain, mux); + } + if(chain == nil) + return Ebadarg; + mux->conv = c; + + /* save a copy of the chain so we can later remove it */ + mux = ipmuxcopy(chain); + r = (Ipmuxrock*)(c->ptcl); + r->chain = chain; + + /* add the chain to the protocol demultiplexor tree */ + WLOCK(f); + f->ipmux->priv = ipmuxmerge(f->ipmux->priv, mux); + WUNLOCK(f); + + Fsconnected(c, nil); + return nil; +} + +static int +ipmuxstate(Conv *c, char *state, int n) +{ + Ipmuxrock *r; + + r = (Ipmuxrock*)(c->ptcl); + return ipmuxsprint(r->chain, 0, state, n); +} + +static void +ipmuxcreate(Conv *c) +{ + Ipmuxrock *r; + + c->rq = qopen(64*1024, Qmsg, 0, c); + c->wq = qopen(64*1024, Qkick, ipmuxkick, c); + r = (Ipmuxrock*)(c->ptcl); + r->chain = nil; +} + +static char* +ipmuxannounce(Conv* _, char** __, int ___) +{ + return "ipmux does not support announce"; +} + +static void +ipmuxclose(Conv *c) +{ + Ipmux *i; + Ipmuxrock *r; + Fs *f = c->p->f; + + r = (Ipmuxrock*)(c->ptcl); + + qclose(c->rq); + qclose(c->wq); + qclose(c->eq); + ipmove(c->laddr, IPnoaddr); + ipmove(c->raddr, IPnoaddr); + c->lport = 0; + c->rport = 0; + + WLOCK(f); + i = (Ipmux *)c->p->priv; + ipmuxremove(&i, r->chain); + WUNLOCK(f); + ipmuxtreefree(r->chain); + r->chain = nil; +} + +/* + * takes a fully formed ip packet and just passes it down + * the stack + */ +static void +ipmuxkick(void *x) +{ + Conv *c = x; + Block *bp; + + bp = qget(c->wq); + if(bp != nil) { + Myip4hdr *ih4 = (Myip4hdr*)(bp->rp); + + if((ih4->vihl & 0xF0) != IP_VER6) + ipoput4(c->p->f, bp, 0, ih4->ttl, ih4->tos, 
nil); + else + ipoput6(c->p->f, bp, 0, ((Ip6hdr*)ih4)->ttl, 0, nil); + } +} + +static void +ipmuxiput(Proto *p, Ipifc *ifc, Block *bp) +{ + int len, hl; + Fs *f = p->f; + uchar *m, *h, *v, *e, *ve, *hp; + Conv *c; + Ipmux *mux; + Myip4hdr *ip; + Ip6hdr *ip6; + + ip = (Myip4hdr*)bp->rp; + hl = (ip->vihl&0x0F)<<2; + + if(p->priv == nil) + goto nomatch; + + h = bp->rp; + len = BLEN(bp); + + /* run the v4 filter */ + RLOCK(f); + c = nil; + mux = f->ipmux->priv; + while(mux != nil){ + if(mux->eoff > len){ + mux = mux->no; + continue; + } + hp = h + mux->off + ((int)mux->skiphdr)*hl; + switch(mux->ctype){ + case Cbyte: + if(*mux->val == *hp) + goto yes; + break; + case Cmbyte: + if((*hp & *mux->mask) == *mux->val) + goto yes; + break; + case Cshort: + if(*((ushort*)mux->val) == *(ushort*)hp) + goto yes; + break; + case Cmshort: + if((*(ushort*)hp & (*((ushort*)mux->mask))) == *((ushort*)mux->val)) + goto yes; + break; + case Clong: + if(*((ulong*)mux->val) == *(ulong*)hp) + goto yes; + break; + case Cmlong: + if((*(ulong*)hp & (*((ulong*)mux->mask))) == *((ulong*)mux->val)) + goto yes; + break; + case Cifc: + if(*((ulong*)mux->val) == *(ulong*)(ifc->lifc->local + IPv4off)) + goto yes; + break; + case Cmifc: + if((*(ulong*)(ifc->lifc->local + IPv4off) & (*((ulong*)mux->mask))) == *((ulong*)mux->val)) + goto yes; + break; + default: + v = mux->val; + for(e = mux->e; v < e; v = ve){ + m = mux->mask; + hp = h + mux->off; + for(ve = v + mux->len; v < ve; v++){ + if((*hp++ & *m++) != *v) + break; + } + if(v == ve) + goto yes; + } + } + mux = mux->no; + continue; +yes: + if(mux->conv != nil) + c = mux->conv; + mux = mux->yes; + } + RUNLOCK(f); + + if(c != nil){ + /* tack on interface address */ + bp = padblock(bp, IPaddrlen); + ipmove(bp->rp, ifc->lifc->local); + bp = concatblock(bp); + if(bp != nil) + if(qpass(c->rq, bp) < 0) + print("Q"); + return; + } + +nomatch: + /* doesn't match any filter, hand it to the specific protocol handler */ + ip = (Myip4hdr*)bp->rp; + 
if((ip->vihl & 0xF0) == IP_VER4) { + p = f->t2p[ip->proto]; + } else { + ip6 = (Ip6hdr*)bp->rp; + p = f->t2p[ip6->proto]; + } + if(p && p->rcv) + (*p->rcv)(p, ifc, bp); + else + freeblist(bp); + return; +} + +static int +ipmuxsprint(Ipmux *mux, int level, char *buf, int len) +{ + int i, j, n; + uchar *v; + + n = 0; + for(i = 0; i < level; i++) + n += snprint(buf+n, len-n, " "); + if(mux == nil){ + n += snprint(buf+n, len-n, "\n"); + return n; + } + n += snprint(buf+n, len-n, "h[%d:%d]&", + mux->off+((int)mux->skiphdr)*((int)ipoff->data), + mux->off+(((int)mux->skiphdr)*((int)ipoff->data))+mux->len-1); + for(i = 0; i < mux->len; i++) + n += snprint(buf+n, len - n, "%2.2ux", mux->mask[i]); + n += snprint(buf+n, len-n, "="); + v = mux->val; + for(j = 0; j < mux->n; j++){ + for(i = 0; i < mux->len; i++) + n += snprint(buf+n, len - n, "%2.2ux", *v++); + n += snprint(buf+n, len-n, "|"); + } + n += snprint(buf+n, len-n, "\n"); + level++; + n += ipmuxsprint(mux->no, level, buf+n, len-n); + n += ipmuxsprint(mux->yes, level, buf+n, len-n); + return n; +} + +static int +ipmuxstats(Proto *p, char *buf, int len) +{ + int n; + Fs *f = p->f; + + RLOCK(f); + n = ipmuxsprint(p->priv, 0, buf, len); + RUNLOCK(f); + + return n; +} + +void +ipmuxinit(Fs *f) +{ + Proto *ipmux; + + ipmux = smalloc(sizeof(Proto)); + ipmux->priv = nil; + ipmux->name = "ipmux"; + ipmux->connect = ipmuxconnect; + ipmux->announce = ipmuxannounce; + ipmux->state = ipmuxstate; + ipmux->create = ipmuxcreate; + ipmux->close = ipmuxclose; + ipmux->rcv = ipmuxiput; + ipmux->ctl = nil; + ipmux->advise = nil; + ipmux->stats = ipmuxstats; + ipmux->ipproto = -1; + ipmux->nc = 64; + ipmux->ptclsize = sizeof(Ipmuxrock); + + f->ipmux = ipmux; /* hack for Fsrcvpcol */ + + Fsproto(f, ipmux); +} diff --git a/src/9vx/a/ip/iproute.c b/src/9vx/a/ip/iproute.c @@ -0,0 +1,854 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +static void walkadd(Fs*, 
Route**, Route*); +static void addnode(Fs*, Route**, Route*); +static void calcd(Route*); + +/* these are used for all instances of IP */ +static Route* v4freelist; +static Route* v6freelist; +static RWlock routelock; +static ulong v4routegeneration, v6routegeneration; + +static void +freeroute(Route *r) +{ + Route **l; + + r->left = nil; + r->right = nil; + if(r->type & Rv4) + l = &v4freelist; + else + l = &v6freelist; + r->mid = *l; + *l = r; +} + +static Route* +allocroute(int type) +{ + Route *r; + int n; + Route **l; + + if(type & Rv4){ + n = sizeof(RouteTree) + sizeof(V4route); + l = &v4freelist; + } else { + n = sizeof(RouteTree) + sizeof(V6route); + l = &v6freelist; + } + + r = *l; + if(r != nil){ + *l = r->mid; + } else { + r = malloc(n); + if(r == nil) + panic("out of routing nodes"); + } + memset(r, 0, n); + r->type = type; + r->ifc = nil; + r->ref = 1; + + return r; +} + +static void +addqueue(Route **q, Route *r) +{ + Route *l; + + if(r == nil) + return; + + l = allocroute(r->type); + l->mid = *q; + *q = l; + l->left = r; +} + +/* + * compare 2 v6 addresses + */ +static int +lcmp(ulong *a, ulong *b) +{ + int i; + + for(i = 0; i < IPllen; i++){ + if(a[i] > b[i]) + return 1; + if(a[i] < b[i]) + return -1; + } + return 0; +} + +/* + * compare 2 v4 or v6 ranges + */ +enum +{ + Rpreceeds, + Rfollows, + Requals, + Rcontains, + Rcontained, +}; + +static int +rangecompare(Route *a, Route *b) +{ + if(a->type & Rv4){ + if(a->v4.endaddress < b->v4.address) + return Rpreceeds; + + if(a->v4.address > b->v4.endaddress) + return Rfollows; + + if(a->v4.address <= b->v4.address + && a->v4.endaddress >= b->v4.endaddress){ + if(a->v4.address == b->v4.address + && a->v4.endaddress == b->v4.endaddress) + return Requals; + return Rcontains; + } + return Rcontained; + } + + if(lcmp(a->v6.endaddress, b->v6.address) < 0) + return Rpreceeds; + + if(lcmp(a->v6.address, b->v6.endaddress) > 0) + return Rfollows; + + if(lcmp(a->v6.address, b->v6.address) <= 0 + && 
lcmp(a->v6.endaddress, b->v6.endaddress) >= 0){ + if(lcmp(a->v6.address, b->v6.address) == 0 + && lcmp(a->v6.endaddress, b->v6.endaddress) == 0) + return Requals; + return Rcontains; + } + + return Rcontained; +} + +static void +copygate(Route *old, Route *new) +{ + if(new->type & Rv4) + memmove(old->v4.gate, new->v4.gate, IPv4addrlen); + else + memmove(old->v6.gate, new->v6.gate, IPaddrlen); +} + +/* + * walk down a tree adding nodes back in + */ +static void +walkadd(Fs *f, Route **root, Route *p) +{ + Route *l, *r; + + l = p->left; + r = p->right; + p->left = 0; + p->right = 0; + addnode(f, root, p); + if(l) + walkadd(f, root, l); + if(r) + walkadd(f, root, r); +} + +/* + * calculate depth + */ +static void +calcd(Route *p) +{ + Route *q; + int d; + + if(p) { + d = 0; + q = p->left; + if(q) + d = q->depth; + q = p->right; + if(q && q->depth > d) + d = q->depth; + q = p->mid; + if(q && q->depth > d) + d = q->depth; + p->depth = d+1; + } +} + +/* + * balance the tree at the current node + */ +static void +balancetree(Route **cur) +{ + Route *p, *l, *r; + int dl, dr; + + /* + * if left and right are + * too out of balance, + * rotate tree node + */ + p = *cur; + dl = 0; if((l = p->left) != nil) dl = l->depth; + dr = 0; if((r = p->right) != nil) dr = r->depth; + + if(dl > dr+1) { + p->left = l->right; + l->right = p; + *cur = l; + calcd(p); + calcd(l); + } else + if(dr > dl+1) { + p->right = r->left; + r->left = p; + *cur = r; + calcd(p); + calcd(r); + } else + calcd(p); +} + +/* + * add a new node to the tree + */ +static void +addnode(Fs *f, Route **cur, Route *new) +{ + Route *p; + + p = *cur; + if(p == 0) { + *cur = new; + new->depth = 1; + return; + } + + switch(rangecompare(new, p)){ + case Rpreceeds: + addnode(f, &p->left, new); + break; + case Rfollows: + addnode(f, &p->right, new); + break; + case Rcontains: + /* + * if new node is superset + * of tree node, + * replace tree node and + * queue tree node to be + * merged into root. 
+ */ + *cur = new; + new->depth = 1; + addqueue(&f->queue, p); + break; + case Requals: + /* + * supercede the old entry if the old one isn't + * a local interface. + */ + if((p->type & Rifc) == 0){ + p->type = new->type; + p->ifcid = -1; + copygate(p, new); + } else if(new->type & Rifc) + p->ref++; + freeroute(new); + break; + case Rcontained: + addnode(f, &p->mid, new); + break; + } + + balancetree(cur); +} + +#define V4H(a) ((a&0x07ffffff)>>(32-Lroot-5)) + +void +v4addroute(Fs *f, char *tag, uchar *a, uchar *mask, uchar *gate, int type) +{ + Route *p; + ulong sa; + ulong m; + ulong ea; + int h, eh; + + m = nhgetl(mask); + sa = nhgetl(a) & m; + ea = sa | ~m; + + eh = V4H(ea); + for(h=V4H(sa); h<=eh; h++) { + p = allocroute(Rv4 | type); + p->v4.address = sa; + p->v4.endaddress = ea; + memmove(p->v4.gate, gate, sizeof(p->v4.gate)); + memmove(p->tag, tag, sizeof(p->tag)); + + wlock(&routelock); + addnode(f, &f->v4root[h], p); + while((p = f->queue) != nil) { + f->queue = p->mid; + walkadd(f, &f->v4root[h], p->left); + freeroute(p); + } + wunlock(&routelock); + } + v4routegeneration++; + + ipifcaddroute(f, Rv4, a, mask, gate, type); +} + +#define V6H(a) (((a)[IPllen-1] & 0x07ffffff)>>(32-Lroot-5)) +#define ISDFLT(a, mask, tag) ((ipcmp((a),v6Unspecified)==0) && (ipcmp((mask),v6Unspecified)==0) && (strcmp((tag), "ra")!=0)) + +void +v6addroute(Fs *f, char *tag, uchar *a, uchar *mask, uchar *gate, int type) +{ + Route *p; + ulong sa[IPllen], ea[IPllen]; + ulong x, y; + int h, eh; + + /* + if(ISDFLT(a, mask, tag)) + f->v6p->cdrouter = -1; + */ + + + for(h = 0; h < IPllen; h++){ + x = nhgetl(a+4*h); + y = nhgetl(mask+4*h); + sa[h] = x & y; + ea[h] = x | ~y; + } + + eh = V6H(ea); + for(h = V6H(sa); h <= eh; h++) { + p = allocroute(type); + memmove(p->v6.address, sa, IPaddrlen); + memmove(p->v6.endaddress, ea, IPaddrlen); + memmove(p->v6.gate, gate, IPaddrlen); + memmove(p->tag, tag, sizeof(p->tag)); + + wlock(&routelock); + addnode(f, &f->v6root[h], p); + while((p = 
f->queue) != nil) { + f->queue = p->mid; + walkadd(f, &f->v6root[h], p->left); + freeroute(p); + } + wunlock(&routelock); + } + v6routegeneration++; + + ipifcaddroute(f, 0, a, mask, gate, type); +} + +Route** +looknode(Route **cur, Route *r) +{ + Route *p; + + for(;;){ + p = *cur; + if(p == 0) + return 0; + + switch(rangecompare(r, p)){ + case Rcontains: + return 0; + case Rpreceeds: + cur = &p->left; + break; + case Rfollows: + cur = &p->right; + break; + case Rcontained: + cur = &p->mid; + break; + case Requals: + return cur; + } + } +} + +void +v4delroute(Fs *f, uchar *a, uchar *mask, int dolock) +{ + Route **r, *p; + Route rt; + int h, eh; + ulong m; + + m = nhgetl(mask); + rt.v4.address = nhgetl(a) & m; + rt.v4.endaddress = rt.v4.address | ~m; + rt.type = Rv4; + + eh = V4H(rt.v4.endaddress); + for(h=V4H(rt.v4.address); h<=eh; h++) { + if(dolock) + wlock(&routelock); + r = looknode(&f->v4root[h], &rt); + if(r) { + p = *r; + if(--(p->ref) == 0){ + *r = 0; + addqueue(&f->queue, p->left); + addqueue(&f->queue, p->mid); + addqueue(&f->queue, p->right); + freeroute(p); + while((p = f->queue) != nil) { + f->queue = p->mid; + walkadd(f, &f->v4root[h], p->left); + freeroute(p); + } + } + } + if(dolock) + wunlock(&routelock); + } + v4routegeneration++; + + ipifcremroute(f, Rv4, a, mask); +} + +void +v6delroute(Fs *f, uchar *a, uchar *mask, int dolock) +{ + Route **r, *p; + Route rt; + int h, eh; + ulong x, y; + + for(h = 0; h < IPllen; h++){ + x = nhgetl(a+4*h); + y = nhgetl(mask+4*h); + rt.v6.address[h] = x & y; + rt.v6.endaddress[h] = x | ~y; + } + rt.type = 0; + + eh = V6H(rt.v6.endaddress); + for(h=V6H(rt.v6.address); h<=eh; h++) { + if(dolock) + wlock(&routelock); + r = looknode(&f->v6root[h], &rt); + if(r) { + p = *r; + if(--(p->ref) == 0){ + *r = 0; + addqueue(&f->queue, p->left); + addqueue(&f->queue, p->mid); + addqueue(&f->queue, p->right); + freeroute(p); + while((p = f->queue) != nil) { + f->queue = p->mid; + walkadd(f, &f->v6root[h], p->left); + 
freeroute(p); + } + } + } + if(dolock) + wunlock(&routelock); + } + v6routegeneration++; + + ipifcremroute(f, 0, a, mask); +} + +Route* +v4lookup(Fs *f, uchar *a, Conv *c) +{ + Route *p, *q; + ulong la; + uchar gate[IPaddrlen]; + Ipifc *ifc; + + if(c != nil && c->r != nil && c->r->ifc != nil && c->rgen == v4routegeneration) + return c->r; + + la = nhgetl(a); + q = nil; + for(p=f->v4root[V4H(la)]; p;) + if(la >= p->v4.address) { + if(la <= p->v4.endaddress) { + q = p; + p = p->mid; + } else + p = p->right; + } else + p = p->left; + + if(q && (q->ifc == nil || q->ifcid != q->ifc->ifcid)){ + if(q->type & Rifc) { + hnputl(gate+IPv4off, q->v4.address); + memmove(gate, v4prefix, IPv4off); + } else + v4tov6(gate, q->v4.gate); + ifc = findipifc(f, gate, q->type); + if(ifc == nil) + return nil; + q->ifc = ifc; + q->ifcid = ifc->ifcid; + } + + if(c != nil){ + c->r = q; + c->rgen = v4routegeneration; + } + + return q; +} + +Route* +v6lookup(Fs *f, uchar *a, Conv *c) +{ + Route *p, *q; + ulong la[IPllen]; + int h; + ulong x, y; + uchar gate[IPaddrlen]; + Ipifc *ifc; + + if(memcmp(a, v4prefix, IPv4off) == 0){ + q = v4lookup(f, a+IPv4off, c); + if(q != nil) + return q; + } + + if(c != nil && c->r != nil && c->r->ifc != nil && c->rgen == v6routegeneration) + return c->r; + + for(h = 0; h < IPllen; h++) + la[h] = nhgetl(a+4*h); + + q = 0; + for(p=f->v6root[V6H(la)]; p;){ + for(h = 0; h < IPllen; h++){ + x = la[h]; + y = p->v6.address[h]; + if(x == y) + continue; + if(x < y){ + p = p->left; + goto next; + } + break; + } + for(h = 0; h < IPllen; h++){ + x = la[h]; + y = p->v6.endaddress[h]; + if(x == y) + continue; + if(x > y){ + p = p->right; + goto next; + } + break; + } + q = p; + p = p->mid; +next: ; + } + + if(q && (q->ifc == nil || q->ifcid != q->ifc->ifcid)){ + if(q->type & Rifc) { + for(h = 0; h < IPllen; h++) + hnputl(gate+4*h, q->v6.address[h]); + ifc = findipifc(f, gate, q->type); + } else + ifc = findipifc(f, q->v6.gate, q->type); + if(ifc == nil) + return nil; + q->ifc 
= ifc; + q->ifcid = ifc->ifcid; + } + if(c != nil){ + c->r = q; + c->rgen = v6routegeneration; + } + + return q; +} + +void +routetype(int type, char *p) +{ + memset(p, ' ', 4); + p[4] = 0; + if(type & Rv4) + *p++ = '4'; + else + *p++ = '6'; + if(type & Rifc) + *p++ = 'i'; + if(type & Runi) + *p++ = 'u'; + else if(type & Rbcast) + *p++ = 'b'; + else if(type & Rmulti) + *p++ = 'm'; + if(type & Rptpt) + *p = 'p'; +} + +static char *rformat = "%-15I %-4M %-15I %4.4s %4.4s %3s\n"; + +void +convroute(Route *r, uchar *addr, uchar *mask, uchar *gate, char *t, int *nifc) +{ + int i; + + if(r->type & Rv4){ + memmove(addr, v4prefix, IPv4off); + hnputl(addr+IPv4off, r->v4.address); + memset(mask, 0xff, IPv4off); + hnputl(mask+IPv4off, ~(r->v4.endaddress ^ r->v4.address)); + memmove(gate, v4prefix, IPv4off); + memmove(gate+IPv4off, r->v4.gate, IPv4addrlen); + } else { + for(i = 0; i < IPllen; i++){ + hnputl(addr + 4*i, r->v6.address[i]); + hnputl(mask + 4*i, ~(r->v6.endaddress[i] ^ r->v6.address[i])); + } + memmove(gate, r->v6.gate, IPaddrlen); + } + + routetype(r->type, t); + + if(r->ifc) + *nifc = r->ifc->conv->x; + else + *nifc = -1; +} + +/* + * this code is not in rr to reduce stack size + */ +static void +sprintroute(Route *r, Routewalk *rw) +{ + int nifc, n; + char t[5], *iname, ifbuf[5]; + uchar addr[IPaddrlen], mask[IPaddrlen], gate[IPaddrlen]; + char *p; + + convroute(r, addr, mask, gate, t, &nifc); + iname = "-"; + if(nifc != -1) { + iname = ifbuf; + snprint(ifbuf, sizeof ifbuf, "%d", nifc); + } + p = seprint(rw->p, rw->e, rformat, addr, mask, gate, t, r->tag, iname); + if(rw->o < 0){ + n = p - rw->p; + if(n > -rw->o){ + memmove(rw->p, rw->p-rw->o, n+rw->o); + rw->p = p + rw->o; + } + rw->o += n; + } else + rw->p = p; +} + +/* + * recurse descending tree, applying the function in Routewalk + */ +static int +rr(Route *r, Routewalk *rw) +{ + int h; + + if(rw->e <= rw->p) + return 0; + if(r == nil) + return 1; + + if(rr(r->left, rw) == 0) + return 0; + + if(r->type & 
Rv4) + h = V4H(r->v4.address); + else + h = V6H(r->v6.address); + + if(h == rw->h) + rw->walk(r, rw); + + if(rr(r->mid, rw) == 0) + return 0; + + return rr(r->right, rw); +} + +void +ipwalkroutes(Fs *f, Routewalk *rw) +{ + rlock(&routelock); + if(rw->e > rw->p) { + for(rw->h = 0; rw->h < nelem(f->v4root); rw->h++) + if(rr(f->v4root[rw->h], rw) == 0) + break; + } + if(rw->e > rw->p) { + for(rw->h = 0; rw->h < nelem(f->v6root); rw->h++) + if(rr(f->v6root[rw->h], rw) == 0) + break; + } + runlock(&routelock); +} + +long +routeread(Fs *f, char *p, ulong offset, int n) +{ + Routewalk rw; + + rw.p = p; + rw.e = p+n; + rw.o = -offset; + rw.walk = sprintroute; + + ipwalkroutes(f, &rw); + + return rw.p - p; +} + +/* + * this code is not in routeflush to reduce stack size + */ +void +delroute(Fs *f, Route *r, int dolock) +{ + uchar addr[IPaddrlen]; + uchar mask[IPaddrlen]; + uchar gate[IPaddrlen]; + char t[5]; + int nifc; + + convroute(r, addr, mask, gate, t, &nifc); + if(r->type & Rv4) + v4delroute(f, addr+IPv4off, mask+IPv4off, dolock); + else + v6delroute(f, addr, mask, dolock); +} + +/* + * recurse until one route is deleted + * returns 0 if nothing is deleted, 1 otherwise + */ +int +routeflush(Fs *f, Route *r, char *tag) +{ + if(r == nil) + return 0; + if(routeflush(f, r->mid, tag)) + return 1; + if(routeflush(f, r->left, tag)) + return 1; + if(routeflush(f, r->right, tag)) + return 1; + if((r->type & Rifc) == 0){ + if(tag == nil || strncmp(tag, r->tag, sizeof(r->tag)) == 0){ + delroute(f, r, 0); + return 1; + } + } + return 0; +} + +long +routewrite(Fs *f, Chan *c, char *p, int n) +{ + int h, changed; + char *tag; + Cmdbuf *cb; + uchar addr[IPaddrlen]; + uchar mask[IPaddrlen]; + uchar gate[IPaddrlen]; + IPaux *a, *na; + + cb = parsecmd(p, n); + if(waserror()){ + free(cb); + nexterror(); + } + + if(strcmp(cb->f[0], "flush") == 0){ + tag = cb->f[1]; + for(h = 0; h < nelem(f->v4root); h++) + for(changed = 1; changed;){ + wlock(&routelock); + changed = routeflush(f, 
f->v4root[h], tag); + wunlock(&routelock); + } + for(h = 0; h < nelem(f->v6root); h++) + for(changed = 1; changed;){ + wlock(&routelock); + changed = routeflush(f, f->v6root[h], tag); + wunlock(&routelock); + } + } else if(strcmp(cb->f[0], "remove") == 0){ + if(cb->nf < 3) + error(Ebadarg); + if (parseip(addr, cb->f[1]) == -1) + error(Ebadip); + parseipmask(mask, cb->f[2]); + if(memcmp(addr, v4prefix, IPv4off) == 0) + v4delroute(f, addr+IPv4off, mask+IPv4off, 1); + else + v6delroute(f, addr, mask, 1); + } else if(strcmp(cb->f[0], "add") == 0){ + if(cb->nf < 4) + error(Ebadarg); + if(parseip(addr, cb->f[1]) == -1 || + parseip(gate, cb->f[3]) == -1) + error(Ebadip); + parseipmask(mask, cb->f[2]); + tag = "none"; + if(c != nil){ + a = c->aux; + tag = a->tag; + } + if(memcmp(addr, v4prefix, IPv4off) == 0) + v4addroute(f, tag, addr+IPv4off, mask+IPv4off, gate+IPv4off, 0); + else + v6addroute(f, tag, addr, mask, gate, 0); + } else if(strcmp(cb->f[0], "tag") == 0) { + if(cb->nf < 2) + error(Ebadarg); + + a = c->aux; + na = newipaux(a->owner, cb->f[1]); + c->aux = na; + free(a); + } + + poperror(); + free(cb); + return n; +} diff --git a/src/9vx/a/ip/ipv6.c b/src/9vx/a/ip/ipv6.c @@ -0,0 +1,718 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" +#include "ipv6.h" + +enum +{ + IP6FHDR = 8, /* sizeof(Fraghdr6) */ +}; + +#define IPV6CLASS(hdr) (((hdr)->vcf[0]&0x0F)<<2 | ((hdr)->vcf[1]&0xF0)>>2) +#define BLKIPVER(xp) (((Ip6hdr*)((xp)->rp))->vcf[0] & 0xF0) +/* + * This sleazy macro is stolen shamelessly from ip.c, see comment there. 
+ */ +#define BKFG(xp) ((Ipfrag*)((xp)->base)) + +typedef struct Fragment4 Fragment4; +typedef struct Fragment6 Fragment6; +typedef struct Ipfrag Ipfrag; + +Block* ip6reassemble(IP*, int, Block*, Ip6hdr*); +Fragment6* ipfragallo6(IP*); +void ipfragfree6(IP*, Fragment6*); +Block* procopts(Block *bp); +static Block* procxtns(IP *ip, Block *bp, int doreasm); +int unfraglen(Block *bp, uchar *nexthdr, int setfh); + +/* MIB II counters */ +enum +{ + Forwarding, + DefaultTTL, + InReceives, + InHdrErrors, + InAddrErrors, + ForwDatagrams, + InUnknownProtos, + InDiscards, + InDelivers, + OutRequests, + OutDiscards, + OutNoRoutes, + ReasmTimeout, + ReasmReqds, + ReasmOKs, + ReasmFails, + FragOKs, + FragFails, + FragCreates, + + Nstats, +}; + +static char *statnames[] = +{ +[Forwarding] "Forwarding", +[DefaultTTL] "DefaultTTL", +[InReceives] "InReceives", +[InHdrErrors] "InHdrErrors", +[InAddrErrors] "InAddrErrors", +[ForwDatagrams] "ForwDatagrams", +[InUnknownProtos] "InUnknownProtos", +[InDiscards] "InDiscards", +[InDelivers] "InDelivers", +[OutRequests] "OutRequests", +[OutDiscards] "OutDiscards", +[OutNoRoutes] "OutNoRoutes", +[ReasmTimeout] "ReasmTimeout", +[ReasmReqds] "ReasmReqds", +[ReasmOKs] "ReasmOKs", +[ReasmFails] "ReasmFails", +[FragOKs] "FragOKs", +[FragFails] "FragFails", +[FragCreates] "FragCreates", +}; + +struct Fragment4 +{ + Block* blist; + Fragment4* next; + ulong src; + ulong dst; + ushort id; + ulong age; +}; + +struct Fragment6 +{ + Block* blist; + Fragment6* next; + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; + uint id; + ulong age; +}; + +struct Ipfrag +{ + ushort foff; + ushort flen; +}; + +/* an instance of IP */ +struct IP +{ + ulong stats[Nstats]; + + QLock fraglock4; + Fragment4* flisthead4; + Fragment4* fragfree4; + Ref id4; + + QLock fraglock6; + Fragment6* flisthead6; + Fragment6* fragfree6; + Ref id6; + + int iprouting; /* true if we route like a gateway */ +}; + +int +ipoput6(Fs *f, Block *bp, int gating, int ttl, int tos, Conv *c) +{ + 
int medialen, len, chunk, uflen, flen, seglen, lid, offset, fragoff; + int morefrags, blklen, rv = 0, tentative; + uchar *gate, nexthdr; + Block *xp, *nb; + Fraghdr6 fraghdr; + IP *ip; + Ip6hdr *eh; + Ipifc *ifc; + Route *r, *sr; + + ip = f->ip; + + /* Fill out the ip header */ + eh = (Ip6hdr*)(bp->rp); + + ip->stats[OutRequests]++; + + /* Number of uchars in data and ip header to write */ + len = blocklen(bp); + + tentative = iptentative(f, eh->src); + if(tentative){ + netlog(f, Logip, "reject tx of packet with tentative src address %I\n", + eh->src); + goto free; + } + + if(gating){ + chunk = nhgets(eh->ploadlen); + if(chunk > len){ + ip->stats[OutDiscards]++; + netlog(f, Logip, "short gated packet\n"); + goto free; + } + if(chunk + IP6HDR < len) + len = chunk + IP6HDR; + } + + if(len >= IP_MAX){ + ip->stats[OutDiscards]++; + netlog(f, Logip, "exceeded ip max size %I\n", eh->dst); + goto free; + } + + r = v6lookup(f, eh->dst, c); + if(r == nil){ +// print("no route for %I, src %I free\n", eh->dst, eh->src); + ip->stats[OutNoRoutes]++; + netlog(f, Logip, "no interface %I\n", eh->dst); + rv = -1; + goto free; + } + + ifc = r->ifc; + if(r->type & (Rifc|Runi)) + gate = eh->dst; + else if(r->type & (Rbcast|Rmulti)) { + gate = eh->dst; + sr = v6lookup(f, eh->src, nil); + if(sr && (sr->type & Runi)) + ifc = sr->ifc; + } + else + gate = r->v6.gate; + + if(!gating) + eh->vcf[0] = IP_VER6; + eh->ttl = ttl; + if(!gating) { + eh->vcf[0] |= tos >> 4; + eh->vcf[1] = tos << 4; + } + + if(!CANRLOCK(ifc)) + goto free; + + if(waserror()){ + RUNLOCK(ifc); + nexterror(); + } + + if(ifc->m == nil) + goto raise; + + /* If we dont need to fragment just send it */ + medialen = ifc->maxtu - ifc->m->hsize; + if(len <= medialen) { + hnputs(eh->ploadlen, len - IP6HDR); + ifc->m->bwrite(ifc, bp, V6, gate); + RUNLOCK(ifc); + poperror(); + return 0; + } + + if(gating && ifc->reassemble <= 0) { + /* + * v6 intermediate nodes are not supposed to fragment pkts; + * we fragment if ifc->reassemble 
is turned on; an exception + * needed for nat. + */ + ip->stats[OutDiscards]++; + icmppkttoobig6(f, ifc, bp); + netlog(f, Logip, "%I: gated pkts not fragmented\n", eh->dst); + goto raise; + } + + /* start v6 fragmentation */ + uflen = unfraglen(bp, &nexthdr, 1); + if(uflen > medialen) { + ip->stats[FragFails]++; + ip->stats[OutDiscards]++; + netlog(f, Logip, "%I: unfragmentable part too big\n", eh->dst); + goto raise; + } + + flen = len - uflen; + seglen = (medialen - (uflen + IP6FHDR)) & ~7; + if(seglen < 8) { + ip->stats[FragFails]++; + ip->stats[OutDiscards]++; + netlog(f, Logip, "%I: seglen < 8\n", eh->dst); + goto raise; + } + + lid = incref(&ip->id6); + fraghdr.nexthdr = nexthdr; + fraghdr.res = 0; + hnputl(fraghdr.id, lid); + + xp = bp; + offset = uflen; + while (xp && offset && offset >= BLEN(xp)) { + offset -= BLEN(xp); + xp = xp->next; + } + xp->rp += offset; + + fragoff = 0; + morefrags = 1; + + for(; fragoff < flen; fragoff += seglen) { + nb = allocb(uflen + IP6FHDR + seglen); + + if(fragoff + seglen >= flen) { + seglen = flen - fragoff; + morefrags = 0; + } + + hnputs(eh->ploadlen, seglen+IP6FHDR); + memmove(nb->wp, eh, uflen); + nb->wp += uflen; + + hnputs(fraghdr.offsetRM, fragoff); /* last 3 bits must be 0 */ + fraghdr.offsetRM[1] |= morefrags; + memmove(nb->wp, &fraghdr, IP6FHDR); + nb->wp += IP6FHDR; + + /* Copy data */ + chunk = seglen; + while (chunk) { + if(!xp) { + ip->stats[OutDiscards]++; + ip->stats[FragFails]++; + freeblist(nb); + netlog(f, Logip, "!xp: chunk in v6%d\n", chunk); + goto raise; + } + blklen = chunk; + if(BLEN(xp) < chunk) + blklen = BLEN(xp); + memmove(nb->wp, xp->rp, blklen); + + nb->wp += blklen; + xp->rp += blklen; + chunk -= blklen; + if(xp->rp == xp->wp) + xp = xp->next; + } + + ifc->m->bwrite(ifc, nb, V6, gate); + ip->stats[FragCreates]++; + } + ip->stats[FragOKs]++; + +raise: + RUNLOCK(ifc); + poperror(); +free: + freeblist(bp); + return rv; +} + +void +ipiput6(Fs *f, Ipifc *ifc, Block *bp) +{ + int hl, hop, tos, 
notforme, tentative;
+	uchar proto;
+	uchar v6dst[IPaddrlen];
+	IP *ip;
+	Ip6hdr *h;
+	Proto *p;
+	Route *r, *sr;
+
+	ip = f->ip;
+	ip->stats[InReceives]++;
+
+	/*
+	 * Ensure we have all the header info in the first
+	 * block. Make life easier for other protocols by
+	 * collecting up to the first 64 bytes in the first block.
+	 */
+	if(BLEN(bp) < 64) {
+		hl = blocklen(bp);
+		if(hl < IP6HDR)
+			hl = IP6HDR;
+		if(hl > 64)
+			hl = 64;
+		bp = pullupblock(bp, hl);
+		if(bp == nil)
+			return;
+	}
+
+	h = (Ip6hdr *)bp->rp;
+
+	memmove(&v6dst[0], &h->dst[0], IPaddrlen);
+	notforme = ipforme(f, v6dst) == 0;
+	tentative = iptentative(f, v6dst);
+
+	if(tentative && h->proto != ICMPv6) {
+		print("tentative addr, drop\n");
+		freeblist(bp);
+		return;
+	}
+
+	/* Check header version */
+	if(BLKIPVER(bp) != IP_VER6) {
+		ip->stats[InHdrErrors]++;
+		/* the version is the high nibble of vcf[0]; shift by 4 to log its value */
+		netlog(f, Logip, "ip: bad version %ux\n", (h->vcf[0]&0xF0)>>4);
+		freeblist(bp);
+		return;
+	}
+
+	/* route */
+	if(notforme) {
+		if(!ip->iprouting){
+			/*
+			 * only the first 64 bytes were pulled up above, so bp can
+			 * still be a chain: free all of it, like every other drop path
+			 */
+			freeblist(bp);
+			return;
+		}
+
+		/* don't forward to link-local destinations */
+		if(islinklocal(h->dst) ||
+		   (isv6mcast(h->dst) && (h->dst[1]&0xF) <= Link_local_scop)){
+			ip->stats[OutDiscards]++;
+			freeblist(bp);
+			return;
+		}
+
+		/* don't forward to source's network */
+		sr = v6lookup(f, h->src, nil);
+		r = v6lookup(f, h->dst, nil);
+
+		if(r == nil || sr == r){
+			ip->stats[OutDiscards]++;
+			freeblist(bp);
+			return;
+		}
+
+		/* don't forward if packet has timed out */
+		hop = h->ttl;
+		if(hop < 1) {
+			ip->stats[InHdrErrors]++;
+			icmpttlexceeded6(f, ifc, bp);
+			freeblist(bp);
+			return;
+		}
+
+		/* process headers & reassemble if the interface expects it */
+		bp = procxtns(ip, bp, r->ifc->reassemble);
+		if(bp == nil)
+			return;
+
+		ip->stats[ForwDatagrams]++;
+		h = (Ip6hdr *)bp->rp;
+		tos = IPV6CLASS(h);
+		hop = h->ttl;
+		ipoput6(f, bp, 1, hop-1, tos, nil);
+		return;
+	}
+
+	/* reassemble & process headers if needed */
+	bp = procxtns(ip, bp, 1);
+	if(bp == nil)
+		return;
+
+	h = (Ip6hdr *) (bp->rp);
+ proto = h->proto; + p = Fsrcvpcol(f, proto); + if(p && p->rcv) { + ip->stats[InDelivers]++; + (*p->rcv)(p, ifc, bp); + return; + } + + ip->stats[InDiscards]++; + ip->stats[InUnknownProtos]++; + freeblist(bp); +} + +/* + * ipfragfree6 - copied from ipfragfree4 - assume hold fraglock6 + */ +void +ipfragfree6(IP *ip, Fragment6 *frag) +{ + Fragment6 *fl, **l; + + if(frag->blist) + freeblist(frag->blist); + + memset(frag->src, 0, IPaddrlen); + frag->id = 0; + frag->blist = nil; + + l = &ip->flisthead6; + for(fl = *l; fl; fl = fl->next) { + if(fl == frag) { + *l = frag->next; + break; + } + l = &fl->next; + } + + frag->next = ip->fragfree6; + ip->fragfree6 = frag; +} + +/* + * ipfragallo6 - copied from ipfragalloc4 + */ +Fragment6* +ipfragallo6(IP *ip) +{ + Fragment6 *f; + + while(ip->fragfree6 == nil) { + /* free last entry on fraglist */ + for(f = ip->flisthead6; f->next; f = f->next) + ; + ipfragfree6(ip, f); + } + f = ip->fragfree6; + ip->fragfree6 = f->next; + f->next = ip->flisthead6; + ip->flisthead6 = f; + f->age = NOW + 30000; + + return f; +} + +static Block* +procxtns(IP *ip, Block *bp, int doreasm) +{ + int offset; + uchar proto; + Ip6hdr *h; + + h = (Ip6hdr *)bp->rp; + offset = unfraglen(bp, &proto, 0); + + if(proto == FH && doreasm != 0) { + bp = ip6reassemble(ip, offset, bp, h); + if(bp == nil) + return nil; + offset = unfraglen(bp, &proto, 0); + } + + if(proto == DOH || offset > IP6HDR) + bp = procopts(bp); + return bp; +} + +/* + * returns length of "Unfragmentable part", i.e., sum of lengths of ipv6 hdr, + * hop-by-hop & routing headers if present; *nexthdr is set to nexthdr value + * of the last header in the "Unfragmentable part"; if setfh != 0, nexthdr + * field of the last header in the "Unfragmentable part" is set to FH. 
+ */ +int +unfraglen(Block *bp, uchar *nexthdr, int setfh) +{ + uchar *p, *q; + int ufl, hs; + + p = bp->rp; + q = p+6; /* proto, = p+sizeof(Ip6hdr.vcf)+sizeof(Ip6hdr.ploadlen) */ + *nexthdr = *q; + ufl = IP6HDR; + p += ufl; + + while (*nexthdr == HBH || *nexthdr == RH) { + *nexthdr = *p; + hs = ((int)*(p+1) + 1) * 8; + ufl += hs; + q = p; + p += hs; + } + + if(*nexthdr == FH) + *q = *p; + if(setfh) + *q = FH; + return ufl; +} + +Block* +procopts(Block *bp) +{ + return bp; +} + +Block* +ip6reassemble(IP* ip, int uflen, Block* bp, Ip6hdr* ih) +{ + int fend, offset, ovlap, len, fragsize, pktposn; + uint id; + uchar src[IPaddrlen], dst[IPaddrlen]; + Block *bl, **l, *last, *prev; + Fraghdr6 *fraghdr; + Fragment6 *f, *fnext; + + fraghdr = (Fraghdr6 *)(bp->rp + uflen); + memmove(src, ih->src, IPaddrlen); + memmove(dst, ih->dst, IPaddrlen); + id = nhgetl(fraghdr->id); + offset = nhgets(fraghdr->offsetRM) & ~7; + + /* + * block lists are too hard, pullupblock into a single block + */ + if(bp->next){ + bp = pullupblock(bp, blocklen(bp)); + ih = (Ip6hdr *)bp->rp; + } + + qlock(&ip->fraglock6); + + /* + * find a reassembly queue for this fragment + */ + for(f = ip->flisthead6; f; f = fnext){ + fnext = f->next; + if(ipcmp(f->src, src)==0 && ipcmp(f->dst, dst)==0 && f->id == id) + break; + if(f->age < NOW){ + ip->stats[ReasmTimeout]++; + ipfragfree6(ip, f); + } + } + + /* + * if this isn't a fragmented packet, accept it + * and get rid of any fragments that might go + * with it. 
+ */ + if(nhgets(fraghdr->offsetRM) == 0) { /* 1st frag is also last */ + if(f) { + ipfragfree6(ip, f); + ip->stats[ReasmFails]++; + } + qunlock(&ip->fraglock6); + return bp; + } + + if(bp->base+sizeof(Ipfrag) >= bp->rp){ + bp = padblock(bp, sizeof(Ipfrag)); + bp->rp += sizeof(Ipfrag); + } + + BKFG(bp)->foff = offset; + BKFG(bp)->flen = nhgets(ih->ploadlen) + IP6HDR - uflen - IP6FHDR; + + /* First fragment allocates a reassembly queue */ + if(f == nil) { + f = ipfragallo6(ip); + f->id = id; + memmove(f->src, src, IPaddrlen); + memmove(f->dst, dst, IPaddrlen); + + f->blist = bp; + + qunlock(&ip->fraglock6); + ip->stats[ReasmReqds]++; + return nil; + } + + /* + * find the new fragment's position in the queue + */ + prev = nil; + l = &f->blist; + bl = f->blist; + while(bl != nil && BKFG(bp)->foff > BKFG(bl)->foff) { + prev = bl; + l = &bl->next; + bl = bl->next; + } + + /* Check overlap of a previous fragment - trim away as necessary */ + if(prev) { + ovlap = BKFG(prev)->foff + BKFG(prev)->flen - BKFG(bp)->foff; + if(ovlap > 0) { + if(ovlap >= BKFG(bp)->flen) { + freeblist(bp); + qunlock(&ip->fraglock6); + return nil; + } + BKFG(prev)->flen -= ovlap; + } + } + + /* Link onto assembly queue */ + bp->next = *l; + *l = bp; + + /* Check to see if succeeding segments overlap */ + if(bp->next) { + l = &bp->next; + fend = BKFG(bp)->foff + BKFG(bp)->flen; + + /* Take completely covered segments out */ + while(*l) { + ovlap = fend - BKFG(*l)->foff; + if(ovlap <= 0) + break; + if(ovlap < BKFG(*l)->flen) { + BKFG(*l)->flen -= ovlap; + BKFG(*l)->foff += ovlap; + /* move up ih hdrs */ + memmove((*l)->rp + ovlap, (*l)->rp, uflen); + (*l)->rp += ovlap; + break; + } + last = (*l)->next; + (*l)->next = nil; + freeblist(*l); + *l = last; + } + } + + /* + * look for a complete packet. if we get to a fragment + * with the trailing bit of fraghdr->offsetRM[1] set, we're done. 
+ */ + pktposn = 0; + for(bl = f->blist; bl && BKFG(bl)->foff == pktposn; bl = bl->next) { + fraghdr = (Fraghdr6 *)(bl->rp + uflen); + if((fraghdr->offsetRM[1] & 1) == 0) { + bl = f->blist; + + /* get rid of frag header in first fragment */ + memmove(bl->rp + IP6FHDR, bl->rp, uflen); + bl->rp += IP6FHDR; + len = nhgets(((Ip6hdr*)bl->rp)->ploadlen) - IP6FHDR; + bl->wp = bl->rp + len + IP6HDR; + /* + * Pullup all the fragment headers and + * return a complete packet + */ + for(bl = bl->next; bl; bl = bl->next) { + fragsize = BKFG(bl)->flen; + len += fragsize; + bl->rp += uflen + IP6FHDR; + bl->wp = bl->rp + fragsize; + } + + bl = f->blist; + f->blist = nil; + ipfragfree6(ip, f); + ih = (Ip6hdr*)bl->rp; + hnputs(ih->ploadlen, len); + qunlock(&ip->fraglock6); + ip->stats[ReasmOKs]++; + return bl; + } + pktposn += BKFG(bl)->flen; + } + qunlock(&ip->fraglock6); + return nil; +} diff --git a/src/9vx/a/ip/ipv6.h b/src/9vx/a/ip/ipv6.h @@ -0,0 +1,185 @@ +/* + * Internet Protocol Version 6 + * + * rfc2460 defines the protocol, rfc2461 neighbour discovery, and + * rfc2462 address autoconfiguration. rfc4443 defines ICMP; was rfc2463. + * rfc4291 defines the address architecture (including prefices), was rfc3513. + * rfc4007 defines the scoped address architecture. + * + * global unicast is anything but unspecified (::), loopback (::1), + * multicast (ff00::/8), and link-local unicast (fe80::/10). + * + * site-local (fec0::/10) is now deprecated, originally by rfc3879. + * + * Unique Local IPv6 Unicast Addresses are defined by rfc4193. + * prefix is fc00::/7, scope is global, routing is limited to roughly a site. + */ +#define isv6mcast(addr) ((addr)[0] == 0xff) +#define islinklocal(addr) ((addr)[0] == 0xfe && ((addr)[1] & 0xc0) == 0x80) + +#define optexsts(np) (nhgets((np)->ploadlen) > 24) +#define issmcast(addr) (memcmp((addr), v6solicitednode, 13) == 0) + +#ifndef MIN +#define MIN(a, b) ((a) <= (b)? 
(a): (b))
+#endif
+
+#undef ESP
+
+enum {				/* Header Types */
+	HBH		= 0,	/* hop-by-hop multicast routing protocol */
+	ICMP		= 1,
+	IGMP		= 2,
+	GGP		= 3,
+	IPINIP		= 4,
+	ST		= 5,
+	TCP		= 6,
+	UDP		= 17,
+	ISO_TP4		= 29,
+	RH		= 43,
+	FH		= 44,
+	IDRP		= 45,
+	RSVP		= 46,
+	AH		= 51,
+	ESP		= 52,
+	ICMPv6		= 58,
+	NNH		= 59,
+	DOH		= 60,
+	ISO_IP		= 80,
+	IGRP		= 88,
+	OSPF		= 89,
+
+	Maxhdrtype	= 256,
+};
+
+enum {
+	/* multicast flags and scopes */
+
+//	Well_known_flg	= 0,
+//	Transient_flg	= 1,
+
+//	Interface_local_scop = 1,
+	Link_local_scop	= 2,
+//	Site_local_scop	= 5,
+//	Org_local_scop	= 8,
+	Global_scop	= 14,
+
+	/* various prefix lengths */
+	SOLN_PREF_LEN	= 13,
+
+	/* icmpv6 unreachability codes */
+	Icmp6_no_route		= 0,
+	Icmp6_ad_prohib		= 1,
+	Icmp6_out_src_scope	= 2,
+	Icmp6_adr_unreach	= 3,
+	Icmp6_port_unreach	= 4,
+	Icmp6_gress_src_fail	= 5,
+	Icmp6_rej_route		= 6,
+	Icmp6_unknown		= 7,	/* our own invention for internal use */
+
+	/* various flags & constants */
+	v6MINTU		= 1280,
+	HOP_LIMIT	= 255,
+	/*
+	 * length of the fixed IPv6 header: vcf[4] + ploadlen[2] + proto +
+	 * ttl + src[IPaddrlen] + dst[IPaddrlen].  Must match struct Ip6hdr;
+	 * it is used as the offset of the first extension header.
+	 */
+	IP6HDR		= 40,		/* sizeof(Ip6hdr) = 8 + 2*16 */
+
+	/* option types */
+
+	/* neighbour discovery */
+	SRC_LLADDR	= 1,
+	TARGET_LLADDR	= 2,
+	PREFIX_INFO	= 3,
+	REDIR_HEADER	= 4,
+	MTU_OPTION	= 5,
+	/* new since rfc2461; see iana.org/assignments/icmpv6-parameters */
+	V6nd_home	= 8,
+	V6nd_srcaddrs	= 9,		/* rfc3122 */
+	V6nd_ip		= 17,
+	/* /lib/rfc/drafts/draft-jeong-dnsop-ipv6-dns-discovery-12.txt */
+	V6nd_rdns	= 25,
+	/* plan 9 extensions */
+	V6nd_9fs	= 250,
+	V6nd_9auth	= 251,
+
+	SRC_UNSPEC	= 0,
+	SRC_UNI		= 1,
+	TARG_UNI	= 2,
+	TARG_MULTI	= 3,
+
+	Tunitent	= 1,
+	Tuniproxy	= 2,
+	Tunirany	= 3,
+
+	/* Node constants */
+	MAX_MULTICAST_SOLICIT	= 3,
+	RETRANS_TIMER		= 1000,
+};
+
+typedef struct Ip6hdr	Ip6hdr;
+typedef struct Opthdr	Opthdr;
+typedef struct Routinghdr Routinghdr;
+typedef struct Fraghdr6	Fraghdr6;
+
+struct Ip6hdr {
+	uchar	vcf[4];		/* version:4, traffic class:8, flow label:20 */
+	uchar	ploadlen[2];	/* payload length: packet length - 40 */
+	uchar	proto;		/* next header
type */ + uchar ttl; /* hop limit */ + uchar src[IPaddrlen]; + uchar dst[IPaddrlen]; +}; + +struct Opthdr { + uchar nexthdr; + uchar len; +}; + +/* + * Beware routing header type 0 (loose source routing); see + * http://www.secdev.org/conf/IPv6_RH_security-csw07.pdf. + * Type 1 is unused. Type 2 is for MIPv6 (mobile IPv6) filtering + * against type 0 header. + */ +struct Routinghdr { + uchar nexthdr; + uchar len; + uchar rtetype; + uchar segrem; +}; + +struct Fraghdr6 { + uchar nexthdr; + uchar res; + uchar offsetRM[2]; /* Offset, Res, M flag */ + uchar id[4]; +}; + +extern uchar v6allnodesN[IPaddrlen]; +extern uchar v6allnodesL[IPaddrlen]; +extern uchar v6allroutersN[IPaddrlen]; +extern uchar v6allroutersL[IPaddrlen]; +extern uchar v6allnodesNmask[IPaddrlen]; +extern uchar v6allnodesLmask[IPaddrlen]; +extern uchar v6solicitednode[IPaddrlen]; +extern uchar v6solicitednodemask[IPaddrlen]; +extern uchar v6Unspecified[IPaddrlen]; +extern uchar v6loopback[IPaddrlen]; +extern uchar v6loopbackmask[IPaddrlen]; +extern uchar v6linklocal[IPaddrlen]; +extern uchar v6linklocalmask[IPaddrlen]; +extern uchar v6multicast[IPaddrlen]; +extern uchar v6multicastmask[IPaddrlen]; + +extern int v6llpreflen; +extern int v6mcpreflen; +extern int v6snpreflen; +extern int v6aNpreflen; +extern int v6aLpreflen; + +extern int ReTransTimer; + +void ipv62smcast(uchar *, uchar *); +void icmpns(Fs *f, uchar* src, int suni, uchar* targ, int tuni, uchar* mac); +void icmpna(Fs *f, uchar* src, uchar* dst, uchar* targ, uchar* mac, uchar flags); +void icmpttlexceeded6(Fs *f, Ipifc *ifc, Block *bp); +void icmppkttoobig6(Fs *f, Ipifc *ifc, Block *bp); +void icmphostunr(Fs *f, Ipifc *ifc, Block *bp, int code, int free); diff --git a/src/9vx/a/ip/loopbackmedium.c b/src/9vx/a/ip/loopbackmedium.c @@ -0,0 +1,120 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +enum +{ + Maxtu= 16*1024, +}; + +typedef struct LB LB; +struct LB 
+{ + Proc *readp; + Queue *q; + Fs *f; +}; + +static void loopbackread(void *a); + +static void +loopbackbind(Ipifc *ifc, int _, char** __) +{ + LB *lb; + + lb = smalloc(sizeof(*lb)); + lb->f = ifc->conv->p->f; + lb->q = qopen(1024*1024, Qmsg, nil, nil); + ifc->arg = lb; + ifc->mbps = 1000; + + kproc("loopbackread", loopbackread, ifc); + +} + +static void +loopbackunbind(Ipifc *ifc) +{ + LB *lb = ifc->arg; + + if(lb->readp) + postnote(lb->readp, 1, "unbind", 0); + + /* wait for reader to die */ + while(lb->readp != 0) + tsleep(&up->sleep, return0, 0, 300); + + /* clean up */ + qfree(lb->q); + free(lb); +} + +static void +loopbackbwrite(Ipifc *ifc, Block *bp, int _, uchar* __) +{ + LB *lb; + + lb = ifc->arg; + if(qpass(lb->q, bp) < 0) + ifc->outerr++; + ifc->out++; +} + +static void +loopbackread(void *a) +{ + Ipifc *ifc; + Block *bp; + LB *lb; + + ifc = a; + lb = ifc->arg; + lb->readp = up; /* hide identity under a rock for unbind */ + if(waserror()){ + lb->readp = 0; + pexit("hangup", 1); + } + for(;;){ + bp = qbread(lb->q, Maxtu); + if(bp == nil) + continue; + ifc->in++; + if(!CANRLOCK(ifc)){ + freeb(bp); + continue; + } + if(waserror()){ + RUNLOCK(ifc); + nexterror(); + } + if(ifc->lifc == nil) + freeb(bp); + else + ipiput4(lb->f, ifc, bp); + RUNLOCK(ifc); + poperror(); + } +} + +Medium loopbackmedium = +{ +.hsize= 0, +.mintu= 0, +.maxtu= Maxtu, +.maclen= 0, +.name= "loopback", +.bind= loopbackbind, +.unbind= loopbackunbind, +.bwrite= loopbackbwrite, +}; + +void +loopbackmediumlink(void) +{ + addipmedium(&loopbackmedium); +} diff --git a/src/9vx/a/ip/netdevmedium.c b/src/9vx/a/ip/netdevmedium.c @@ -0,0 +1,153 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +static void netdevbind(Ipifc *ifc, int argc, char **argv); +static void netdevunbind(Ipifc *ifc); +static void netdevbwrite(Ipifc *ifc, Block *bp, int version, uchar *ip); +static void netdevread(void *a); + +typedef struct 
Netdevrock Netdevrock;
+struct Netdevrock
+{
+	Fs	*f;		/* file system we belong to */
+	Proc	*readp;		/* reading process */
+	Chan	*mchan;		/* Data channel */
+};
+
+Medium netdevmedium =
+{
+.name=		"netdev",
+.hsize=		0,
+.mintu=		0,
+.maxtu=		64000,
+.maclen=	0,
+.bind=		netdevbind,
+.unbind=	netdevunbind,
+.bwrite=	netdevbwrite,
+.unbindonclose=	0,
+};
+
+/*
+ * called to bind an IP ifc to a generic network device
+ * called with ifc qlock'd
+ *
+ * argv[2] is the name of the device file to open, so at least three
+ * arguments are required; fewer raise Ebadarg.  The opened channel and
+ * the owning Fs are stored in a Netdevrock hung off ifc->arg, and a
+ * "netdevread" kproc is started to read from the device.
+ */
+static void
+netdevbind(Ipifc *ifc, int argc, char **argv)
+{
+	Chan *mchan;
+	Netdevrock *er;
+
+	/* argv[2] is used below: it only exists when argc >= 3 */
+	if(argc < 3)
+		error(Ebadarg);
+
+	mchan = namec(argv[2], Aopen, ORDWR, 0);
+
+	er = smalloc(sizeof(*er));
+	er->mchan = mchan;
+	er->f = ifc->conv->p->f;
+
+	ifc->arg = er;
+
+	kproc("netdevread", netdevread, ifc);
+}
+
+/*
+ * called with ifc wlock'd
+ */
+static void
+netdevunbind(Ipifc *ifc)
+{
+	Netdevrock *er = ifc->arg;
+
+	if(er->readp != nil)
+		postnote(er->readp, 1, "unbind", 0);
+
+	/* wait for readers to die */
+	while(er->readp != nil)
+		tsleep(&up->sleep, return0, 0, 300);
+
+	if(er->mchan != nil)
+		cclose(er->mchan);
+
+	free(er);
+}
+
+/*
+ * called by ipoput with a single block to write
+ */
+static void
+netdevbwrite(Ipifc *ifc, Block *bp, int _, uchar* __)
+{
+	Netdevrock *er = ifc->arg;
+
+	if(bp->next)
+		bp = concatblock(bp);
+	if(BLEN(bp) < ifc->mintu)
+		bp = adjustblock(bp, ifc->mintu);
+
+	devtab[er->mchan->type]->bwrite(er->mchan, bp, 0);
+	ifc->out++;
+}
+
+/*
+ * process to read from the device
+ */
+static void
+netdevread(void *a)
+{
+	Ipifc *ifc;
+	Block *bp;
+	Netdevrock *er;
+	char *argv[1];
+
+	ifc = a;
+	er = ifc->arg;
+	er->readp = up;	/* hide identity under a rock for unbind */
+	if(waserror()){
+		er->readp = nil;
+		pexit("hangup", 1);
+	}
+	for(;;){
+		bp = devtab[er->mchan->type]->bread(er->mchan, ifc->maxtu, 0);
+		if(bp == nil){
+			/*
+			 * get here if mchan is a pipe and other side hangs up
+			 * clean up this interface & get out
+ZZZ is this a good idea?
+ */ + poperror(); + er->readp = nil; + argv[0] = "unbind"; + if(!waserror()) + ifc->conv->p->ctl(ifc->conv, argv, 1); + pexit("hangup", 1); + } + if(!CANRLOCK(ifc)){ + freeb(bp); + continue; + } + if(waserror()){ + RUNLOCK(ifc); + nexterror(); + } + ifc->in++; + if(ifc->lifc == nil) + freeb(bp); + else + ipiput4(er->f, ifc, bp); + RUNLOCK(ifc); + poperror(); + } +} + +void +netdevmediumlink(void) +{ + addipmedium(&netdevmedium); +} diff --git a/src/9vx/a/ip/netlog.c b/src/9vx/a/ip/netlog.c @@ -0,0 +1,261 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include "ip/ip.h" + +enum { + Nlog = 16*1024, +}; + +/* + * action log + */ +struct Netlog { + Lock lk; + int opens; + char* buf; + char *end; + char *rptr; + int len; + + int logmask; /* mask of things to debug */ + uchar iponly[IPaddrlen]; /* ip address to print debugging for */ + int iponlyset; + + QLock qlock; + Rendez rendez; +}; + +typedef struct Netlogflag { + char* name; + int mask; +} Netlogflag; + +static Netlogflag flags[] = +{ + { "ppp", Logppp, }, + { "ip", Logip, }, + { "fs", Logfs, }, + { "tcp", Logtcp, }, + { "icmp", Logicmp, }, + { "udp", Logudp, }, + { "compress", Logcompress, }, + { "gre", Loggre, }, + { "tcpwin", Logtcp|Logtcpwin, }, + { "tcprxmt", Logtcp|Logtcprxmt, }, + { "udpmsg", Logudp|Logudpmsg, }, + { "ipmsg", Logip|Logipmsg, }, + { "esp", Logesp, }, + { nil, 0, }, +}; + +char Ebadnetctl[] = "too few arguments for netlog control message"; + +enum +{ + CMset, + CMclear, + CMonly, +}; + +static +Cmdtab routecmd[] = { + CMset, "set", 0, + CMclear, "clear", 0, + CMonly, "only", 0, +}; + +void +netloginit(Fs *f) +{ + f->alog = smalloc(sizeof(Netlog)); +} + +void +netlogopen(Fs *f) +{ + LOCK(f->alog); + if(waserror()){ + UNLOCK(f->alog); + nexterror(); + } + if(f->alog->opens == 0){ + if(f->alog->buf == nil) + f->alog->buf = malloc(Nlog); + f->alog->rptr = f->alog->buf; + f->alog->end = f->alog->buf + Nlog; + } + f->alog->opens++; + 
UNLOCK(f->alog);
+	poperror();
+}
+
+void
+netlogclose(Fs *f)
+{
+	LOCK(f->alog);
+	if(waserror()){
+		UNLOCK(f->alog);
+		nexterror();
+	}
+	f->alog->opens--;
+	if(f->alog->opens == 0){
+		free(f->alog->buf);
+		f->alog->buf = nil;
+	}
+	UNLOCK(f->alog);
+	poperror();
+}
+
+static int
+netlogready(void *a)
+{
+	Fs *f = a;
+
+	return f->alog->len;
+}
+
+long
+netlogread(Fs *f, void *a, ulong _, long n)
+{
+	int i, d;
+	char *p, *rptr;
+
+	QLOCK(f->alog);
+	if(waserror()){
+		QUNLOCK(f->alog);
+		nexterror();
+	}
+
+	for(;;){
+		LOCK(f->alog);
+		if(f->alog->len){
+			if(n > f->alog->len)
+				n = f->alog->len;
+			d = 0;
+			rptr = f->alog->rptr;
+			f->alog->rptr += n;
+			if(f->alog->rptr >= f->alog->end){
+				d = f->alog->rptr - f->alog->end;
+				f->alog->rptr = f->alog->buf + d;
+			}
+			f->alog->len -= n;
+			UNLOCK(f->alog);
+
+			i = n-d;
+			p = a;
+			memmove(p, rptr, i);
+			memmove(p+i, f->alog->buf, d);
+			break;
+		}
+		else
+			UNLOCK(f->alog);
+
+		sleep(&f->alog->rendez, netlogready, f);
+	}
+
+	QUNLOCK(f->alog);
+	poperror();
+
+	return n;
+}
+
+/*
+ * Handle a control message of n bytes at s for the log of f.
+ * "set" and "clear" turn on or off, in logmask, each flag named by
+ * the remaining fields (unknown names are skipped); "only" parses
+ * field 1 into iponly and sets iponlyset unless it is IPnoaddr.
+ * Fewer than two fields raises Ebadnetctl.  The Cmdbuf is freed and
+ * the waserror() frame popped on every non-error return.
+ */
+void
+netlogctl(Fs *f, char* s, int n)
+{
+	int i, set;
+	Netlogflag *fp;
+	Cmdbuf *cb;
+	Cmdtab *ct;
+
+	cb = parsecmd(s, n);
+	if(waserror()){
+		free(cb);
+		nexterror();
+	}
+
+	if(cb->nf < 2)
+		error(Ebadnetctl);
+
+	ct = lookupcmd(cb, routecmd, nelem(routecmd));
+
+	set = 1;
+
+	switch(ct->index){
+	case CMset:
+		set = 1;
+		break;
+
+	case CMclear:
+		set = 0;
+		break;
+
+	case CMonly:
+		parseip(f->alog->iponly, cb->f[1]);
+		if(ipcmp(f->alog->iponly, IPnoaddr) == 0)
+			f->alog->iponlyset = 0;
+		else
+			f->alog->iponlyset = 1;
+		/* early exit must also pop the waserror() frame, as the normal exit does */
+		free(cb);
+		poperror();
+		return;
+
+	default:
+		cmderror(cb, "unknown ip control message");
+	}
+
+	for(i = 1; i < cb->nf; i++){
+		for(fp = flags; fp->name; fp++)
+			if(strcmp(fp->name, cb->f[i]) == 0)
+				break;
+		if(fp->name == nil)
+			continue;
+		if(set)
+			f->alog->logmask |= fp->mask;
+		else
+			f->alog->logmask &= ~fp->mask;
+	}
+
+	free(cb);
+	poperror();
+}
+
+void
+netlog(Fs *f, int mask, char *fmt, ...)
+{ + char buf[128], *t, *fp; + int i, n; + va_list arg; + + if(!(f->alog->logmask & mask)) + return; + + if(f->alog->opens == 0) + return; + + va_start(arg, fmt); + n = vseprint(buf, buf+sizeof(buf), fmt, arg) - buf; + va_end(arg); + + LOCK(f->alog); + i = f->alog->len + n - Nlog; + if(i > 0){ + f->alog->len -= i; + f->alog->rptr += i; + if(f->alog->rptr >= f->alog->end) + f->alog->rptr = f->alog->buf + (f->alog->rptr - f->alog->end); + } + t = f->alog->rptr + f->alog->len; + fp = buf; + f->alog->len += n; + while(n-- > 0){ + if(t >= f->alog->end) + t = f->alog->buf + (t - f->alog->end); + *t++ = *fp++; + } + UNLOCK(f->alog); + + wakeup(&f->alog->rendez); +} diff --git a/src/9vx/a/ip/nullmedium.c b/src/9vx/a/ip/nullmedium.c @@ -0,0 +1,39 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +static void +nullbind(Ipifc* _, int __, char** ___) +{ + error("cannot bind null device"); +} + +static void +nullunbind(Ipifc* _) +{ +} + +static void +nullbwrite(Ipifc* _, Block* __, int ___, uchar* ____) +{ + error("nullbwrite"); +} + +Medium nullmedium = +{ +.name= "null", +.bind= nullbind, +.unbind= nullunbind, +.bwrite= nullbwrite, +}; + +void +nullmediumlink(void) +{ + addipmedium(&nullmedium); +} diff --git a/src/9vx/a/ip/pktmedium.c b/src/9vx/a/ip/pktmedium.c @@ -0,0 +1,78 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + + +static void pktbind(Ipifc*, int, char**); +static void pktunbind(Ipifc*); +static void pktbwrite(Ipifc*, Block*, int, uchar*); +static void pktin(Fs*, Ipifc*, Block*); + +Medium pktmedium = +{ +.name= "pkt", +.hsize= 14, +.mintu= 40, +.maxtu= 4*1024, +.maclen= 6, +.bind= pktbind, +.unbind= pktunbind, +.bwrite= pktbwrite, +.pktin= pktin, +}; + +/* + * called to bind an IP ifc to an ethernet device + * called with ifc wlock'd + */ +static void +pktbind(Ipifc* _, int argc, char **argv) +{ +} + 
+/* + * called with ifc wlock'd + */ +static void +pktunbind(Ipifc* _) +{ +} + +/* + * called by ipoput with a single packet to write + */ +static void +pktbwrite(Ipifc *ifc, Block *bp, int _, uchar* __) +{ + /* enqueue onto the conversation's rq */ + bp = concatblock(bp); + if(ifc->conv->snoopers.ref > 0) + qpass(ifc->conv->sq, copyblock(bp, BLEN(bp))); + qpass(ifc->conv->rq, bp); +} + +/* + * called with ifc rlocked when someone write's to 'data' + */ +static void +pktin(Fs *f, Ipifc *ifc, Block *bp) +{ + if(ifc->lifc == nil) + freeb(bp); + else { + if(ifc->conv->snoopers.ref > 0) + qpass(ifc->conv->sq, copyblock(bp, BLEN(bp))); + ipiput4(f, ifc, bp); + } +} + +void +pktmediumlink(void) +{ + addipmedium(&pktmedium); +} diff --git a/src/9vx/a/ip/ptclbsum.c b/src/9vx/a/ip/ptclbsum.c @@ -0,0 +1,72 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include "ip.h" + +static short endian = 1; +static uchar* aendian = (uchar*)&endian; +#define LITTLE *aendian + +ushort +ptclbsum(uchar *addr, int len) +{ + ulong losum, hisum, mdsum, x; + ulong t1, t2; + + losum = 0; + hisum = 0; + mdsum = 0; + + x = 0; + if((ulong)addr & 1) { + if(len) { + hisum += addr[0]; + len--; + addr++; + } + x = 1; + } + while(len >= 16) { + t1 = *(ushort*)(addr+0); + t2 = *(ushort*)(addr+2); mdsum += t1; + t1 = *(ushort*)(addr+4); mdsum += t2; + t2 = *(ushort*)(addr+6); mdsum += t1; + t1 = *(ushort*)(addr+8); mdsum += t2; + t2 = *(ushort*)(addr+10); mdsum += t1; + t1 = *(ushort*)(addr+12); mdsum += t2; + t2 = *(ushort*)(addr+14); mdsum += t1; + mdsum += t2; + len -= 16; + addr += 16; + } + while(len >= 2) { + mdsum += *(ushort*)addr; + len -= 2; + addr += 2; + } + if(x) { + if(len) + losum += addr[0]; + if(LITTLE) + losum += mdsum; + else + hisum += mdsum; + } else { + if(len) + hisum += addr[0]; + if(LITTLE) + hisum += mdsum; + else + losum += mdsum; + } + + losum += hisum >> 8; + losum += (hisum & 0xff) << 8; + while((hisum = 
losum>>16)) + losum = hisum + (losum & 0xffff); + + return losum & 0xffff; +} diff --git a/src/9vx/a/ip/rudp.c b/src/9vx/a/ip/rudp.c @@ -0,0 +1,1055 @@ +/* + * Reliable User Datagram Protocol, currently only for IPv4. + * This protocol is compatible with UDP's packet format. + * It could be done over UDP if need be. + */ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +#define DEBUG 0 +#define DPRINT if(DEBUG)print + +#define SEQDIFF(a,b) ( (a)>=(b)?\ + (a)-(b):\ + 0xffffffffUL-((b)-(a)) ) +#define INSEQ(a,start,end) ( (start)<=(end)?\ + ((a)>(start)&&(a)<=(end)):\ + ((a)>(start)||(a)<=(end)) ) +#define UNACKED(r) SEQDIFF(r->sndseq, r->ackrcvd) +#define NEXTSEQ(a) ( (a)+1 == 0 ? 1 : (a)+1 ) + +enum +{ + UDP_PHDRSIZE = 12, /* pseudo header */ +// UDP_HDRSIZE = 20, /* pseudo header + udp header */ + UDP_RHDRSIZE = 36, /* pseudo header + udp header + rudp header */ + UDP_IPHDR = 8, /* ip header */ + IP_UDPPROTO = 254, + UDP_USEAD7 = 52, /* size of new ipv6 headers struct */ + + Rudprxms = 200, + Rudptickms = 50, + Rudpmaxxmit = 10, + Maxunacked = 100, +}; + +#define Hangupgen 0xffffffff /* used only in hangup messages */ + +typedef struct Udphdr Udphdr; +struct Udphdr +{ + /* ip header */ + uchar vihl; /* Version and header length */ + uchar tos; /* Type of service */ + uchar length[2]; /* packet length */ + uchar id[2]; /* Identification */ + uchar frag[2]; /* Fragment information */ + + /* pseudo header starts here */ + uchar Unused; + uchar udpproto; /* Protocol */ + uchar udpplen[2]; /* Header plus data length */ + uchar udpsrc[4]; /* Ip source */ + uchar udpdst[4]; /* Ip destination */ + + /* udp header */ + uchar udpsport[2]; /* Source port */ + uchar udpdport[2]; /* Destination port */ + uchar udplen[2]; /* data length */ + uchar udpcksum[2]; /* Checksum */ +}; + +typedef struct Rudphdr Rudphdr; +struct Rudphdr +{ + /* ip header */ + uchar vihl; /* Version and header length */ + 
uchar tos; /* Type of service */ + uchar length[2]; /* packet length */ + uchar id[2]; /* Identification */ + uchar frag[2]; /* Fragment information */ + + /* pseudo header starts here */ + uchar Unused; + uchar udpproto; /* Protocol */ + uchar udpplen[2]; /* Header plus data length */ + uchar udpsrc[4]; /* Ip source */ + uchar udpdst[4]; /* Ip destination */ + + /* udp header */ + uchar udpsport[2]; /* Source port */ + uchar udpdport[2]; /* Destination port */ + uchar udplen[2]; /* data length (includes rudp header) */ + uchar udpcksum[2]; /* Checksum */ + + /* rudp header */ + uchar relseq[4]; /* id of this packet (or 0) */ + uchar relsgen[4]; /* generation/time stamp */ + uchar relack[4]; /* packet being acked (or 0) */ + uchar relagen[4]; /* generation/time stamp */ +}; + + +/* + * one state structure per destination + */ +typedef struct Reliable Reliable; +struct Reliable +{ + Ref; + + Reliable *next; + + uchar addr[IPaddrlen]; /* always V6 when put here */ + ushort port; + + Block *unacked; /* unacked msg list */ + Block *unackedtail; /* and its tail */ + + int timeout; /* time since first unacked msg sent */ + int xmits; /* number of times first unacked msg sent */ + + ulong sndseq; /* next packet to be sent */ + ulong sndgen; /* and its generation */ + + ulong rcvseq; /* last packet received */ + ulong rcvgen; /* and its generation */ + + ulong acksent; /* last ack sent */ + ulong ackrcvd; /* last msg for which ack was rcvd */ + + /* flow control */ + QLock lock; + Rendez vous; + int blocked; +}; + + + +/* MIB II counters */ +typedef struct Rudpstats Rudpstats; +struct Rudpstats +{ + ulong rudpInDatagrams; + ulong rudpNoPorts; + ulong rudpInErrors; + ulong rudpOutDatagrams; +}; + +typedef struct Rudppriv Rudppriv; +struct Rudppriv +{ + Ipht ht; + + /* MIB counters */ + Rudpstats ustats; + + /* non-MIB stats */ + ulong csumerr; /* checksum errors */ + ulong lenerr; /* short packet */ + ulong rxmits; /* # of retransmissions */ + ulong orders; /* # of out of 
order pkts */ + + /* keeping track of the ack kproc */ + int ackprocstarted; + QLock apl; +}; + + +static ulong generation = 0; +static Rendez rend; + +/* + * protocol specific part of Conv + */ +typedef struct Rudpcb Rudpcb; +struct Rudpcb +{ + QLock; + uchar headers; + uchar randdrop; + Reliable *r; +}; + +/* + * local functions + */ +void relsendack(Conv*, Reliable*, int); +int reliput(Conv*, Block*, uchar*, ushort); +Reliable *relstate(Rudpcb*, uchar*, ushort, char*); +void relput(Reliable*); +void relforget(Conv *, uchar*, int, int); +void relackproc(void *); +void relackq(Reliable *, Block*); +void relhangup(Conv *, Reliable*); +void relrexmit(Conv *, Reliable*); +void relput(Reliable*); +void rudpkick(void *x); + +static void +rudpstartackproc(Proto *rudp) +{ + Rudppriv *rpriv; + char kpname[KNAMELEN]; + + rpriv = rudp->priv; + if(rpriv->ackprocstarted == 0){ + qlock(&rpriv->apl); + if(rpriv->ackprocstarted == 0){ + sprint(kpname, "#I%drudpack", rudp->f->dev); + kproc(kpname, relackproc, rudp); + rpriv->ackprocstarted = 1; + } + qunlock(&rpriv->apl); + } +} + +static char* +rudpconnect(Conv *c, char **argv, int argc) +{ + char *e; + Rudppriv *upriv; + + upriv = c->p->priv; + rudpstartackproc(c->p); + e = Fsstdconnect(c, argv, argc); + Fsconnected(c, e); + iphtadd(&upriv->ht, c); + + return e; +} + + +static int +rudpstate(Conv *c, char *state, int n) +{ + Rudpcb *ucb; + Reliable *r; + int m; + + m = snprint(state, n, "%s", c->inuse?"Open":"Closed"); + ucb = (Rudpcb*)c->ptcl; + qlock(ucb); + for(r = ucb->r; r; r = r->next) + m += snprint(state+m, n-m, " %I/%ld", r->addr, UNACKED(r)); + m += snprint(state+m, n-m, "\n"); + qunlock(ucb); + return m; +} + +static char* +rudpannounce(Conv *c, char** argv, int argc) +{ + char *e; + Rudppriv *upriv; + + upriv = c->p->priv; + rudpstartackproc(c->p); + e = Fsstdannounce(c, argv, argc); + if(e != nil) + return e; + Fsconnected(c, nil); + iphtadd(&upriv->ht, c); + + return nil; +} + +static void +rudpcreate(Conv *c) +{ 
+ c->rq = qopen(64*1024, Qmsg, 0, 0); + c->wq = qopen(64*1024, Qkick, rudpkick, c); +} + +static void +rudpclose(Conv *c) +{ + Rudpcb *ucb; + Reliable *r, *nr; + Rudppriv *upriv; + + upriv = c->p->priv; + iphtrem(&upriv->ht, c); + + /* force out any delayed acks */ + ucb = (Rudpcb*)c->ptcl; + qlock(ucb); + for(r = ucb->r; r; r = r->next){ + if(r->acksent != r->rcvseq) + relsendack(c, r, 0); + } + qunlock(ucb); + + qclose(c->rq); + qclose(c->wq); + qclose(c->eq); + ipmove(c->laddr, IPnoaddr); + ipmove(c->raddr, IPnoaddr); + c->lport = 0; + c->rport = 0; + + ucb->headers = 0; + ucb->randdrop = 0; + qlock(ucb); + for(r = ucb->r; r; r = nr){ + if(r->acksent != r->rcvseq) + relsendack(c, r, 0); + nr = r->next; + relhangup(c, r); + relput(r); + } + ucb->r = 0; + + qunlock(ucb); +} + +/* + * randomly don't send packets + */ +static void +doipoput(Conv *c, Fs *f, Block *bp, int x, int ttl, int tos) +{ + Rudpcb *ucb; + + ucb = (Rudpcb*)c->ptcl; + if(ucb->randdrop && nrand(100) < ucb->randdrop) + freeblist(bp); + else + ipoput4(f, bp, x, ttl, tos, nil); +} + +int +flow(void *v) +{ + Reliable *r = v; + + return UNACKED(r) <= Maxunacked; +} + +void +rudpkick(void *x) +{ + Conv *c = x; + Udphdr *uh; + ushort rport; + uchar laddr[IPaddrlen], raddr[IPaddrlen]; + Block *bp; + Rudpcb *ucb; + Rudphdr *rh; + Reliable *r; + int dlen, ptcllen; + Rudppriv *upriv; + Fs *f; + + upriv = c->p->priv; + f = c->p->f; + + netlog(c->p->f, Logrudp, "rudp: kick\n"); + bp = qget(c->wq); + if(bp == nil) + return; + + ucb = (Rudpcb*)c->ptcl; + switch(ucb->headers) { + case 7: + /* get user specified addresses */ + bp = pullupblock(bp, UDP_USEAD7); + if(bp == nil) + return; + ipmove(raddr, bp->rp); + bp->rp += IPaddrlen; + ipmove(laddr, bp->rp); + bp->rp += IPaddrlen; + /* pick interface closest to dest */ + if(ipforme(f, laddr) != Runi) + findlocalip(f, laddr, raddr); + bp->rp += IPaddrlen; /* Ignore ifc address */ + rport = nhgets(bp->rp); + bp->rp += 2+2; /* Ignore local port */ + break; + default: 
+ ipmove(raddr, c->raddr); + ipmove(laddr, c->laddr); + rport = c->rport; + break; + } + + dlen = blocklen(bp); + + /* Make space to fit rudp & ip header */ + bp = padblock(bp, UDP_IPHDR+UDP_RHDRSIZE); + if(bp == nil) + return; + + uh = (Udphdr *)(bp->rp); + uh->vihl = IP_VER4; + + rh = (Rudphdr*)uh; + + ptcllen = dlen + (UDP_RHDRSIZE-UDP_PHDRSIZE); + uh->Unused = 0; + uh->udpproto = IP_UDPPROTO; + uh->frag[0] = 0; + uh->frag[1] = 0; + hnputs(uh->udpplen, ptcllen); + switch(ucb->headers){ + case 7: + v6tov4(uh->udpdst, raddr); + hnputs(uh->udpdport, rport); + v6tov4(uh->udpsrc, laddr); + break; + default: + v6tov4(uh->udpdst, c->raddr); + hnputs(uh->udpdport, c->rport); + if(ipcmp(c->laddr, IPnoaddr) == 0) + findlocalip(f, c->laddr, c->raddr); + v6tov4(uh->udpsrc, c->laddr); + break; + } + hnputs(uh->udpsport, c->lport); + hnputs(uh->udplen, ptcllen); + uh->udpcksum[0] = 0; + uh->udpcksum[1] = 0; + + qlock(ucb); + r = relstate(ucb, raddr, rport, "kick"); + r->sndseq = NEXTSEQ(r->sndseq); + hnputl(rh->relseq, r->sndseq); + hnputl(rh->relsgen, r->sndgen); + + hnputl(rh->relack, r->rcvseq); /* ACK last rcvd packet */ + hnputl(rh->relagen, r->rcvgen); + + if(r->rcvseq != r->acksent) + r->acksent = r->rcvseq; + + hnputs(uh->udpcksum, ptclcsum(bp, UDP_IPHDR, dlen+UDP_RHDRSIZE)); + + relackq(r, bp); + qunlock(ucb); + + upriv->ustats.rudpOutDatagrams++; + + DPRINT("sent: %lud/%lud, %lud/%lud\n", + r->sndseq, r->sndgen, r->rcvseq, r->rcvgen); + + doipoput(c, f, bp, 0, c->ttl, c->tos); + + if(waserror()) { + relput(r); + qunlock(&r->lock); + nexterror(); + } + + /* flow control of sorts */ + qlock(&r->lock); + if(UNACKED(r) > Maxunacked){ + r->blocked = 1; + sleep(&r->vous, flow, r); + r->blocked = 0; + } + + qunlock(&r->lock); + relput(r); + poperror(); +} + +void +rudpiput(Proto *rudp, Ipifc *ifc, Block *bp) +{ + int len, olen, ottl; + Udphdr *uh; + Conv *c; + Rudpcb *ucb; + uchar raddr[IPaddrlen], laddr[IPaddrlen]; + ushort rport, lport; + Rudppriv *upriv; + Fs *f; + 
uchar *p; + + upriv = rudp->priv; + f = rudp->f; + + upriv->ustats.rudpInDatagrams++; + + uh = (Udphdr*)(bp->rp); + + /* Put back pseudo header for checksum + * (remember old values for icmpnoconv()) + */ + ottl = uh->Unused; + uh->Unused = 0; + len = nhgets(uh->udplen); + olen = nhgets(uh->udpplen); + hnputs(uh->udpplen, len); + + v4tov6(raddr, uh->udpsrc); + v4tov6(laddr, uh->udpdst); + lport = nhgets(uh->udpdport); + rport = nhgets(uh->udpsport); + + if(nhgets(uh->udpcksum)) { + if(ptclcsum(bp, UDP_IPHDR, len+UDP_PHDRSIZE)) { + upriv->ustats.rudpInErrors++; + upriv->csumerr++; + netlog(f, Logrudp, "rudp: checksum error %I\n", raddr); + DPRINT("rudp: checksum error %I\n", raddr); + freeblist(bp); + return; + } + } + + qlock(rudp); + + c = iphtlook(&upriv->ht, raddr, rport, laddr, lport); + if(c == nil){ + /* no conversation found */ + upriv->ustats.rudpNoPorts++; + qunlock(rudp); + netlog(f, Logudp, "udp: no conv %I!%d -> %I!%d\n", raddr, rport, + laddr, lport); + uh->Unused = ottl; + hnputs(uh->udpplen, olen); + icmpnoconv(f, bp); + freeblist(bp); + return; + } + ucb = (Rudpcb*)c->ptcl; + qlock(ucb); + qunlock(rudp); + + if(reliput(c, bp, raddr, rport) < 0){ + qunlock(ucb); + freeb(bp); + return; + } + + /* + * Trim the packet down to data size + */ + + len -= (UDP_RHDRSIZE-UDP_PHDRSIZE); + bp = trimblock(bp, UDP_IPHDR+UDP_RHDRSIZE, len); + if(bp == nil) { + netlog(f, Logrudp, "rudp: len err %I.%d -> %I.%d\n", + raddr, rport, laddr, lport); + DPRINT("rudp: len err %I.%d -> %I.%d\n", + raddr, rport, laddr, lport); + upriv->lenerr++; + return; + } + + netlog(f, Logrudpmsg, "rudp: %I.%d -> %I.%d l %d\n", + raddr, rport, laddr, lport, len); + + switch(ucb->headers){ + case 7: + /* pass the src address */ + bp = padblock(bp, UDP_USEAD7); + p = bp->rp; + ipmove(p, raddr); p += IPaddrlen; + ipmove(p, laddr); p += IPaddrlen; + ipmove(p, ifc->lifc->local); p += IPaddrlen; + hnputs(p, rport); p += 2; + hnputs(p, lport); + break; + default: + /* connection oriented rudp */ 
+ if(ipcmp(c->raddr, IPnoaddr) == 0){ + /* save the src address in the conversation */ + ipmove(c->raddr, raddr); + c->rport = rport; + + /* reply with the same ip address (if not broadcast) */ + if(ipforme(f, laddr) == Runi) + ipmove(c->laddr, laddr); + else + v4tov6(c->laddr, ifc->lifc->local); + } + break; + } + if(bp->next) + bp = concatblock(bp); + + if(qfull(c->rq)) { + netlog(f, Logrudp, "rudp: qfull %I.%d -> %I.%d\n", raddr, rport, + laddr, lport); + freeblist(bp); + } + else + qpass(c->rq, bp); + + qunlock(ucb); +} + +static char *rudpunknown = "unknown rudp ctl request"; + +char* +rudpctl(Conv *c, char **f, int n) +{ + Rudpcb *ucb; + uchar ip[IPaddrlen]; + int x; + + ucb = (Rudpcb*)c->ptcl; + if(n < 1) + return rudpunknown; + + if(strcmp(f[0], "headers") == 0){ + ucb->headers = 7; /* new headers format */ + return nil; + } else if(strcmp(f[0], "hangup") == 0){ + if(n < 3) + return "bad syntax"; + if (parseip(ip, f[1]) == -1) + return Ebadip; + x = atoi(f[2]); + qlock(ucb); + relforget(c, ip, x, 1); + qunlock(ucb); + return nil; + } else if(strcmp(f[0], "randdrop") == 0){ + x = 10; /* default is 10% */ + if(n > 1) + x = atoi(f[1]); + if(x > 100 || x < 0) + return "illegal rudp drop rate"; + ucb->randdrop = x; + return nil; + } + return rudpunknown; +} + +void +rudpadvise(Proto *rudp, Block *bp, char *msg) +{ + Udphdr *h; + uchar source[IPaddrlen], dest[IPaddrlen]; + ushort psource, pdest; + Conv *s, **p; + + h = (Udphdr*)(bp->rp); + + v4tov6(dest, h->udpdst); + v4tov6(source, h->udpsrc); + psource = nhgets(h->udpsport); + pdest = nhgets(h->udpdport); + + /* Look for a connection */ + for(p = rudp->conv; *p; p++) { + s = *p; + if(s->rport == pdest) + if(s->lport == psource) + if(ipcmp(s->raddr, dest) == 0) + if(ipcmp(s->laddr, source) == 0){ + qhangup(s->rq, msg); + qhangup(s->wq, msg); + break; + } + } + freeblist(bp); +} + +int +rudpstats(Proto *rudp, char *buf, int len) +{ + Rudppriv *upriv; + + upriv = rudp->priv; + return snprint(buf, len, "%lud %lud 
%lud %lud %lud %lud\n", + upriv->ustats.rudpInDatagrams, + upriv->ustats.rudpNoPorts, + upriv->ustats.rudpInErrors, + upriv->ustats.rudpOutDatagrams, + upriv->rxmits, + upriv->orders); +} + +void +rudpinit(Fs *fs) +{ + + Proto *rudp; + + rudp = smalloc(sizeof(Proto)); + rudp->priv = smalloc(sizeof(Rudppriv)); + rudp->name = "rudp"; + rudp->connect = rudpconnect; + rudp->announce = rudpannounce; + rudp->ctl = rudpctl; + rudp->state = rudpstate; + rudp->create = rudpcreate; + rudp->close = rudpclose; + rudp->rcv = rudpiput; + rudp->advise = rudpadvise; + rudp->stats = rudpstats; + rudp->ipproto = IP_UDPPROTO; + rudp->nc = 16; + rudp->ptclsize = sizeof(Rudpcb); + + Fsproto(fs, rudp); +} + +/*********************************************/ +/* Here starts the reliable helper functions */ +/*********************************************/ +/* + * Enqueue a copy of an unacked block for possible retransmissions + */ +void +relackq(Reliable *r, Block *bp) +{ + Block *np; + + np = copyblock(bp, blocklen(bp)); + if(r->unacked) + r->unackedtail->list = np; + else { + /* restart timer */ + r->timeout = 0; + r->xmits = 1; + r->unacked = np; + } + r->unackedtail = np; + np->list = nil; +} + +/* + * retransmit unacked blocks + */ +void +relackproc(void *a) +{ + Rudpcb *ucb; + Proto *rudp; + Reliable *r; + Conv **s, *c; + + rudp = (Proto *)a; + +loop: + tsleep(&up->sleep, return0, 0, Rudptickms); + + for(s = rudp->conv; *s; s++) { + c = *s; + ucb = (Rudpcb*)c->ptcl; + qlock(ucb); + + for(r = ucb->r; r; r = r->next) { + if(r->unacked != nil){ + r->timeout += Rudptickms; + if(r->timeout > Rudprxms*r->xmits) + relrexmit(c, r); + } + if(r->acksent != r->rcvseq) + relsendack(c, r, 0); + } + qunlock(ucb); + } + goto loop; +} + +/* + * get the state record for a conversation + */ +Reliable* +relstate(Rudpcb *ucb, uchar *addr, ushort port, char *from) +{ + Reliable *r, **l; + + l = &ucb->r; + for(r = *l; r; r = *l){ + if(memcmp(addr, r->addr, IPaddrlen) == 0 && + port == r->port) + break; + l 
= &r->next; + } + + /* no state for this addr/port, create some */ + if(r == nil){ + while(generation == 0) + generation = rand(); + + DPRINT("from %s new state %lud for %I!%ud\n", + from, generation, addr, port); + + r = smalloc(sizeof(Reliable)); + memmove(r->addr, addr, IPaddrlen); + r->port = port; + r->unacked = 0; + if(generation == Hangupgen) + generation++; + r->sndgen = generation++; + r->sndseq = 0; + r->ackrcvd = 0; + r->rcvgen = 0; + r->rcvseq = 0; + r->acksent = 0; + r->xmits = 0; + r->timeout = 0; + r->ref = 0; + incref(r); /* one reference for being in the list */ + + *l = r; + } + + incref(r); + return r; +} + +void +relput(Reliable *r) +{ + if(decref(r) == 0) + free(r); +} + +/* + * forget a Reliable state + */ +void +relforget(Conv *c, uchar *ip, int port, int originator) +{ + Rudpcb *ucb; + Reliable *r, **l; + + ucb = (Rudpcb*)c->ptcl; + + l = &ucb->r; + for(r = *l; r; r = *l){ + if(ipcmp(ip, r->addr) == 0 && port == r->port){ + *l = r->next; + if(originator) + relsendack(c, r, 1); + relhangup(c, r); + relput(r); /* remove from the list */ + break; + } + l = &r->next; + } +} + +/* + * process a rcvd reliable packet. return -1 if not to be passed to user process, + * 0 therwise. + * + * called with ucb locked. 
+ */ +int +reliput(Conv *c, Block *bp, uchar *addr, ushort port) +{ + Block *nbp; + Rudpcb *ucb; + Rudppriv *upriv; + Udphdr *uh; + Reliable *r; + Rudphdr *rh; + ulong seq, ack, sgen, agen, ackreal; + int rv = -1; + + /* get fields */ + uh = (Udphdr*)(bp->rp); + rh = (Rudphdr*)uh; + seq = nhgetl(rh->relseq); + sgen = nhgetl(rh->relsgen); + ack = nhgetl(rh->relack); + agen = nhgetl(rh->relagen); + + upriv = c->p->priv; + ucb = (Rudpcb*)c->ptcl; + r = relstate(ucb, addr, port, "input"); + + DPRINT("rcvd %lud/%lud, %lud/%lud, r->sndgen = %lud\n", + seq, sgen, ack, agen, r->sndgen); + + /* if acking an incorrect generation, ignore */ + if(ack && agen != r->sndgen) + goto out; + + /* Look for a hangup */ + if(sgen == Hangupgen) { + if(agen == r->sndgen) + relforget(c, addr, port, 0); + goto out; + } + + /* make sure we're not talking to a new remote side */ + if(r->rcvgen != sgen){ + if(seq != 0 && seq != 1) + goto out; + + /* new connection */ + if(r->rcvgen != 0){ + DPRINT("new con r->rcvgen = %lud, sgen = %lud\n", r->rcvgen, sgen); + relhangup(c, r); + } + r->rcvgen = sgen; + } + + /* dequeue acked packets */ + if(ack && agen == r->sndgen){ + ackreal = 0; + while(r->unacked != nil && INSEQ(ack, r->ackrcvd, r->sndseq)){ + nbp = r->unacked; + r->unacked = nbp->list; + DPRINT("%lud/%lud acked, r->sndgen = %lud\n", + ack, agen, r->sndgen); + freeb(nbp); + r->ackrcvd = NEXTSEQ(r->ackrcvd); + ackreal = 1; + } + + /* flow control */ + if(UNACKED(r) < Maxunacked/8 && r->blocked) + wakeup(&r->vous); + + /* + * retransmit next packet if the acked packet + * was transmitted more than once + */ + if(ackreal && r->unacked != nil){ + r->timeout = 0; + if(r->xmits > 1){ + r->xmits = 1; + relrexmit(c, r); + } + } + + } + + /* no message or input queue full */ + if(seq == 0 || qfull(c->rq)) + goto out; + + /* refuse out of order delivery */ + if(seq != NEXTSEQ(r->rcvseq)){ + relsendack(c, r, 0); /* tell him we got it already */ + upriv->orders++; + DPRINT("out of sequence %lud not 
%lud\n", seq, NEXTSEQ(r->rcvseq)); + goto out; + } + r->rcvseq = seq; + + rv = 0; +out: + relput(r); + return rv; +} + +void +relsendack(Conv *c, Reliable *r, int hangup) +{ + Udphdr *uh; + Block *bp; + Rudphdr *rh; + int ptcllen; + Fs *f; + + bp = allocb(UDP_IPHDR + UDP_RHDRSIZE); + if(bp == nil) + return; + bp->wp += UDP_IPHDR + UDP_RHDRSIZE; + f = c->p->f; + uh = (Udphdr *)(bp->rp); + uh->vihl = IP_VER4; + rh = (Rudphdr*)uh; + + ptcllen = (UDP_RHDRSIZE-UDP_PHDRSIZE); + uh->Unused = 0; + uh->udpproto = IP_UDPPROTO; + uh->frag[0] = 0; + uh->frag[1] = 0; + hnputs(uh->udpplen, ptcllen); + + v6tov4(uh->udpdst, r->addr); + hnputs(uh->udpdport, r->port); + hnputs(uh->udpsport, c->lport); + if(ipcmp(c->laddr, IPnoaddr) == 0) + findlocalip(f, c->laddr, c->raddr); + v6tov4(uh->udpsrc, c->laddr); + hnputs(uh->udplen, ptcllen); + + if(hangup) + hnputl(rh->relsgen, Hangupgen); + else + hnputl(rh->relsgen, r->sndgen); + hnputl(rh->relseq, 0); + hnputl(rh->relagen, r->rcvgen); + hnputl(rh->relack, r->rcvseq); + + if(r->acksent < r->rcvseq) + r->acksent = r->rcvseq; + + uh->udpcksum[0] = 0; + uh->udpcksum[1] = 0; + hnputs(uh->udpcksum, ptclcsum(bp, UDP_IPHDR, UDP_RHDRSIZE)); + + DPRINT("sendack: %lud/%lud, %lud/%lud\n", 0L, r->sndgen, r->rcvseq, r->rcvgen); + doipoput(c, f, bp, 0, c->ttl, c->tos); +} + + +/* + * called with ucb locked (and c locked if user initiated close) + */ +void +relhangup(Conv *c, Reliable *r) +{ + int n; + Block *bp; + char hup[ERRMAX]; + + n = snprint(hup, sizeof(hup), "hangup %I!%d", r->addr, r->port); + qproduce(c->eq, hup, n); + + /* + * dump any unacked outgoing messages + */ + for(bp = r->unacked; bp != nil; bp = r->unacked){ + r->unacked = bp->list; + bp->list = nil; + freeb(bp); + } + + r->rcvgen = 0; + r->rcvseq = 0; + r->acksent = 0; + if(generation == Hangupgen) + generation++; + r->sndgen = generation++; + r->sndseq = 0; + r->ackrcvd = 0; + r->xmits = 0; + r->timeout = 0; + wakeup(&r->vous); +} + +/* + * called with ucb locked + */ +void 
+relrexmit(Conv *c, Reliable *r) +{ + Rudppriv *upriv; + Block *np; + Fs *f; + + upriv = c->p->priv; + f = c->p->f; + r->timeout = 0; + if(r->xmits++ > Rudpmaxxmit){ + relhangup(c, r); + return; + } + + upriv->rxmits++; + np = copyblock(r->unacked, blocklen(r->unacked)); + DPRINT("rxmit r->ackrvcd+1 = %lud\n", r->ackrcvd+1); + doipoput(c, f, np, 0, c->ttl, c->tos); +} diff --git a/src/9vx/a/ip/tcp.c b/src/9vx/a/ip/tcp.c @@ -0,0 +1,3209 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" + +enum +{ + QMAX = 64*1024-1, + IP_TCPPROTO = 6, + + TCP4_IPLEN = 8, + TCP4_PHDRSIZE = 12, + TCP4_HDRSIZE = 20, + TCP4_TCBPHDRSZ = 40, + TCP4_PKT = TCP4_IPLEN+TCP4_PHDRSIZE, + + TCP6_IPLEN = 0, + TCP6_PHDRSIZE = 40, + TCP6_HDRSIZE = 20, + TCP6_TCBPHDRSZ = 60, + TCP6_PKT = TCP6_IPLEN+TCP6_PHDRSIZE, + + TcptimerOFF = 0, + TcptimerON = 1, + TcptimerDONE = 2, + MAX_TIME = (1<<20), /* Forever */ + TCP_ACK = 50, /* Timed ack sequence in ms */ + MAXBACKMS = 9*60*1000, /* longest backoff time (ms) before hangup */ + + URG = 0x20, /* Data marked urgent */ + ACK = 0x10, /* Acknowledge is valid */ + PSH = 0x08, /* Whole data pipe is pushed */ + RST = 0x04, /* Reset connection */ + SYN = 0x02, /* Pkt. 
is synchronise */ + FIN = 0x01, /* Start close down */ + + EOLOPT = 0, + NOOPOPT = 1, + MSSOPT = 2, + MSS_LENGTH = 4, /* Mean segment size */ + WSOPT = 3, + WS_LENGTH = 3, /* Bits to scale window size by */ + MSL2 = 10, + MSPTICK = 50, /* Milliseconds per timer tick */ + DEF_MSS = 1460, /* Default mean segment */ + DEF_MSS6 = 1280, /* Default mean segment (min) for v6 */ + DEF_RTT = 500, /* Default round trip */ + DEF_KAT = 120000, /* Default time (ms) between keep alives */ + TCP_LISTEN = 0, /* Listen connection */ + TCP_CONNECT = 1, /* Outgoing connection */ + SYNACK_RXTIMER = 250, /* ms between SYNACK retransmits */ + + TCPREXMTTHRESH = 3, /* dupack threshhold for rxt */ + + FORCE = 1, + CLONE = 2, + RETRAN = 4, + ACTIVE = 8, + SYNACK = 16, + + LOGAGAIN = 3, + LOGDGAIN = 2, + + Closed = 0, /* Connection states */ + Listen, + Syn_sent, + Syn_received, + Established, + Finwait1, + Finwait2, + Close_wait, + Closing, + Last_ack, + Time_wait, + + Maxlimbo = 1000, /* maximum procs waiting for response to SYN ACK */ + NLHT = 256, /* hash table size, must be a power of 2 */ + LHTMASK = NLHT-1, + + HaveWS = 1<<8, +}; + +/* Must correspond to the enumeration above */ +char *tcpstates[] = +{ + "Closed", "Listen", "Syn_sent", "Syn_received", + "Established", "Finwait1", "Finwait2", "Close_wait", + "Closing", "Last_ack", "Time_wait" +}; + +typedef struct Tcptimer Tcptimer; +struct Tcptimer +{ + Tcptimer *next; + Tcptimer *prev; + Tcptimer *readynext; + int state; + int start; + int count; + void (*func)(void*); + void *arg; +}; + +/* + * v4 and v6 pseudo headers used for + * checksuming tcp + */ +typedef struct Tcp4hdr Tcp4hdr; +struct Tcp4hdr +{ + uchar vihl; /* Version and header length */ + uchar tos; /* Type of service */ + uchar length[2]; /* packet length */ + uchar id[2]; /* Identification */ + uchar frag[2]; /* Fragment information */ + uchar Unused; + uchar proto; + uchar tcplen[2]; + uchar tcpsrc[4]; + uchar tcpdst[4]; + uchar tcpsport[2]; + uchar tcpdport[2]; + 
uchar tcpseq[4]; + uchar tcpack[4]; + uchar tcpflag[2]; + uchar tcpwin[2]; + uchar tcpcksum[2]; + uchar tcpurg[2]; + /* Options segment */ + uchar tcpopt[1]; +}; + +typedef struct Tcp6hdr Tcp6hdr; +struct Tcp6hdr +{ + uchar vcf[4]; + uchar ploadlen[2]; + uchar proto; + uchar ttl; + uchar tcpsrc[IPaddrlen]; + uchar tcpdst[IPaddrlen]; + uchar tcpsport[2]; + uchar tcpdport[2]; + uchar tcpseq[4]; + uchar tcpack[4]; + uchar tcpflag[2]; + uchar tcpwin[2]; + uchar tcpcksum[2]; + uchar tcpurg[2]; + /* Options segment */ + uchar tcpopt[1]; +}; + +/* + * this represents the control info + * for a single packet. It is derived from + * a packet in ntohtcp{4,6}() and stuck into + * a packet in htontcp{4,6}(). + */ +typedef struct Tcp Tcp; +struct Tcp +{ + ushort source; + ushort dest; + ulong seq; + ulong ack; + uchar flags; + ushort ws; /* window scale option (if not zero) */ + ulong wnd; + ushort urg; + ushort mss; /* max segment size option (if not zero) */ + ushort len; /* size of data */ +}; + +/* + * this header is malloc'd to thread together fragments + * waiting to be coalesced + */ +typedef struct Reseq Reseq; +struct Reseq +{ + Reseq *next; + Tcp seg; + Block *bp; + ushort length; +}; + +/* + * the QLOCK in the Conv locks this structure + */ +typedef struct Tcpctl Tcpctl; +struct Tcpctl +{ + uchar state; /* Connection state */ + uchar type; /* Listening or active connection */ + uchar code; /* Icmp code */ + struct { + ulong una; /* Unacked data pointer */ + ulong nxt; /* Next sequence expected */ + ulong ptr; /* Data pointer */ + ulong wnd; /* Tcp send window */ + ulong urg; /* Urgent data pointer */ + ulong wl2; + int scale; /* how much to right shift window in xmitted packets */ + /* to implement tahoe and reno TCP */ + ulong dupacks; /* number of duplicate acks rcvd */ + int recovery; /* loss recovery flag */ + ulong rxt; /* right window marker for recovery */ + } snd; + struct { + ulong nxt; /* Receive pointer to next uchar slot */ + ulong wnd; /* Receive window 
incoming */ + ulong urg; /* Urgent pointer */ + int blocked; + int una; /* unacked data segs */ + int scale; /* how much to left shift window in rcved packets */ + } rcv; + ulong iss; /* Initial sequence number */ + int sawwsopt; /* true if we saw a wsopt on the incoming SYN */ + ulong cwind; /* Congestion window */ + int scale; /* desired snd.scale */ + ushort ssthresh; /* Slow start threshold */ + int resent; /* Bytes just resent */ + int irs; /* Initial received squence */ + ushort mss; /* Mean segment size */ + int rerecv; /* Overlap of data rerecevived */ + ulong window; /* Recevive window */ + uchar backoff; /* Exponential backoff counter */ + int backedoff; /* ms we've backed off for rexmits */ + uchar flags; /* State flags */ + Reseq *reseq; /* Resequencing queue */ + Tcptimer timer; /* Activity timer */ + Tcptimer acktimer; /* Acknowledge timer */ + Tcptimer rtt_timer; /* Round trip timer */ + Tcptimer katimer; /* keep alive timer */ + ulong rttseq; /* Round trip sequence */ + int srtt; /* Shortened round trip */ + int mdev; /* Mean deviation of round trip */ + int kacounter; /* count down for keep alive */ + uint sndsyntime; /* time syn sent */ + ulong time; /* time Finwait2 or Syn_received was sent */ + int nochecksum; /* non-zero means don't send checksums */ + int flgcnt; /* number of flags in the sequence (FIN,SEQ) */ + + union { + Tcp4hdr tcp4hdr; + Tcp6hdr tcp6hdr; + } protohdr; /* prototype header */ +}; + +/* + * New calls are put in limbo rather than having a conversation structure + * allocated. Thus, a SYN attack results in lots of limbo'd calls but not + * any real Conv structures mucking things up. Calls in limbo rexmit their + * SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second. + * + * In particular they aren't on a listener's queue so that they don't figure + * in the input queue limit. + * + * If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue + * of 70000 limbo'd calls. 
Not great for a linear list but doable. Therefore + * there is no hashing of this list. + */ +typedef struct Limbo Limbo; +struct Limbo +{ + Limbo *next; + + uchar laddr[IPaddrlen]; + uchar raddr[IPaddrlen]; + ushort lport; + ushort rport; + ulong irs; /* initial received sequence */ + ulong iss; /* initial sent sequence */ + ushort mss; /* mss from the other end */ + ushort rcvscale; /* how much to scale rcvd windows */ + ushort sndscale; /* how much to scale sent windows */ + ulong lastsend; /* last time we sent a synack */ + uchar version; /* v4 or v6 */ + uchar rexmits; /* number of retransmissions */ +}; + +int tcp_irtt = DEF_RTT; /* Initial guess at round trip time */ +ushort tcp_mss = DEF_MSS; /* Maximum segment size to be sent */ + +enum { + /* MIB stats */ + MaxConn, + ActiveOpens, + PassiveOpens, + EstabResets, + CurrEstab, + InSegs, + OutSegs, + RetransSegs, + RetransTimeouts, + InErrs, + OutRsts, + + /* non-MIB stats */ + CsumErrs, + HlenErrs, + LenErrs, + OutOfOrder, + + Nstats +}; + +static char *statnames[] = +{ +[MaxConn] "MaxConn", +[ActiveOpens] "ActiveOpens", +[PassiveOpens] "PassiveOpens", +[EstabResets] "EstabResets", +[CurrEstab] "CurrEstab", +[InSegs] "InSegs", +[OutSegs] "OutSegs", +[RetransSegs] "RetransSegs", +[RetransTimeouts] "RetransTimeouts", +[InErrs] "InErrs", +[OutRsts] "OutRsts", +[CsumErrs] "CsumErrs", +[HlenErrs] "HlenErrs", +[LenErrs] "LenErrs", +[OutOfOrder] "OutOfOrder", +}; + +typedef struct Tcppriv Tcppriv; +struct Tcppriv +{ + /* List of active timers */ + QLock tl; + Tcptimer *timers; + + /* hash table for matching conversations */ + Ipht ht; + + /* calls in limbo waiting for an ACK to our SYN ACK */ + int nlimbo; + Limbo *lht[NLHT]; + + /* for keeping track of tcpackproc */ + QLock apl; + int ackprocstarted; + + ulong stats[Nstats]; +}; + +/* + * Setting tcpporthogdefense to non-zero enables Dong Lin's + * solution to hijacked systems staking out port's as a form + * of DoS attack. 
+ * + * To avoid stateless Conv hogs, we pick a sequence number at random. If + * that number gets acked by the other end, we shut down the connection. + * Look for tcpporthogdefense in the code. + */ +int tcpporthogdefense = 0; + +int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); +void getreseq(Tcpctl*, Tcp*, Block**, ushort*); +void localclose(Conv*, char*); +void procsyn(Conv*, Tcp*); +void tcpiput(Proto*, Ipifc*, Block*); +void tcpoutput(Conv*); +int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); +void tcpstart(Conv*, int); +void tcptimeout(void*); +void tcpsndsyn(Conv*, Tcpctl*); +void tcprcvwin(Conv*); +void tcpacktimer(void*); +void tcpkeepalive(void*); +void tcpsetkacounter(Tcpctl*); +void tcprxmit(Conv*); +void tcpsettimer(Tcpctl*); +void tcpsynackrtt(Conv*); +void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); + +static void limborexmit(Proto*); +static void limbo(Conv*, uchar*, uchar*, Tcp*, int); + +void +tcpsetstate(Conv *s, uchar newstate) +{ + Tcpctl *tcb; + uchar oldstate; + Tcppriv *tpriv; + + tpriv = s->p->priv; + + tcb = (Tcpctl*)s->ptcl; + + oldstate = tcb->state; + if(oldstate == newstate) + return; + + if(oldstate == Established) + tpriv->stats[CurrEstab]--; + if(newstate == Established) + tpriv->stats[CurrEstab]++; + + /** + print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, + tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); + **/ + + switch(newstate) { + case Closed: + qclose(s->rq); + qclose(s->wq); + qclose(s->eq); + break; + + case Close_wait: /* Remote closes */ + qhangup(s->rq, nil); + break; + } + + tcb->state = newstate; + + if(oldstate == Syn_sent && newstate != Closed) + Fsconnected(s, nil); +} + +static char* +tcpconnect(Conv *c, char **argv, int argc) +{ + char *e; + Tcpctl *tcb; + + tcb = (Tcpctl*)(c->ptcl); + if(tcb->state != Closed) + return Econinuse; + + e = Fsstdconnect(c, argv, argc); + if(e != nil) + return e; + tcpstart(c, TCP_CONNECT); + + return nil; +} + +static int +tcpstate(Conv *c, char 
*state, int n) +{ + Tcpctl *s; + + s = (Tcpctl*)(c->ptcl); + + return snprint(state, n, + "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", + tcpstates[s->state], + c->rq ? qlen(c->rq) : 0, + c->wq ? qlen(c->wq) : 0, + s->srtt, s->mdev, + s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, + s->timer.start, s->timer.count, s->rerecv, + s->katimer.start, s->katimer.count); +} + +static int +tcpinuse(Conv *c) +{ + Tcpctl *s; + + s = (Tcpctl*)(c->ptcl); + return s->state != Closed; +} + +static char* +tcpannounce(Conv *c, char **argv, int argc) +{ + char *e; + Tcpctl *tcb; + + tcb = (Tcpctl*)(c->ptcl); + if(tcb->state != Closed) + return Econinuse; + + e = Fsstdannounce(c, argv, argc); + if(e != nil) + return e; + tcpstart(c, TCP_LISTEN); + Fsconnected(c, nil); + + return nil; +} + +/* + * tcpclose is always called with the q locked + */ +static void +tcpclose(Conv *c) +{ + Tcpctl *tcb; + + tcb = (Tcpctl*)c->ptcl; + + qhangup(c->rq, nil); + qhangup(c->wq, nil); + qhangup(c->eq, nil); + qflush(c->rq); + + switch(tcb->state) { + case Listen: + /* + * reset any incoming calls to this listener + */ + Fsconnected(c, "Hangup"); + + localclose(c, nil); + break; + case Closed: + case Syn_sent: + localclose(c, nil); + break; + case Syn_received: + case Established: + tcb->flgcnt++; + tcb->snd.nxt++; + tcpsetstate(c, Finwait1); + tcpoutput(c); + break; + case Close_wait: + tcb->flgcnt++; + tcb->snd.nxt++; + tcpsetstate(c, Last_ack); + tcpoutput(c); + break; + } +} + +void +tcpkick(void *x) +{ + Conv *s = x; + Tcpctl *tcb; + + tcb = (Tcpctl*)s->ptcl; + + if(waserror()){ + QUNLOCK(s); + nexterror(); + } + QLOCK(s); + + switch(tcb->state) { + case Syn_sent: + case Syn_received: + case Established: + case Close_wait: + /* + * Push data + */ + tcprcvwin(s); + tcpoutput(s); + break; + default: + localclose(s, "Hangup"); + break; + } + + QUNLOCK(s); + poperror(); +} + +void 
+tcprcvwin(Conv *s) /* Call with tcb locked */ +{ + int w; + Tcpctl *tcb; + + tcb = (Tcpctl*)s->ptcl; + w = tcb->window - qlen(s->rq); + if(w < 0) + w = 0; + tcb->rcv.wnd = w; + if(w == 0) + tcb->rcv.blocked = 1; +} + +void +tcpacktimer(void *v) +{ + Tcpctl *tcb; + Conv *s; + + s = v; + tcb = (Tcpctl*)s->ptcl; + + if(waserror()){ + QUNLOCK(s); + nexterror(); + } + QLOCK(s); + if(tcb->state != Closed){ + tcb->flags |= FORCE; + tcprcvwin(s); + tcpoutput(s); + } + QUNLOCK(s); + poperror(); +} + +static void +tcpcreate(Conv *c) +{ + c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); + c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c); +} + +static void +timerstate(Tcppriv *priv, Tcptimer *t, int newstate) +{ + if(newstate != TcptimerON){ + if(t->state == TcptimerON){ + /* unchain */ + if(priv->timers == t){ + priv->timers = t->next; + if(t->prev != nil) + panic("timerstate1"); + } + if(t->next) + t->next->prev = t->prev; + if(t->prev) + t->prev->next = t->next; + t->next = t->prev = nil; + } + } else { + if(t->state != TcptimerON){ + /* chain */ + if(t->prev != nil || t->next != nil) + panic("timerstate2"); + t->prev = nil; + t->next = priv->timers; + if(t->next) + t->next->prev = t; + priv->timers = t; + } + } + t->state = newstate; +} + +void +tcpackproc(void *a) +{ + Tcptimer *t, *tp, *timeo; + Proto *tcp; + Tcppriv *priv; + int loop; + + tcp = a; + priv = tcp->priv; + + for(;;) { + tsleep(&up->sleep, return0, 0, MSPTICK); + + qlock(&priv->tl); + timeo = nil; + loop = 0; + for(t = priv->timers; t != nil; t = tp) { + if(loop++ > 10000) + panic("tcpackproc1"); + tp = t->next; + if(t->state == TcptimerON) { + t->count--; + if(t->count == 0) { + timerstate(priv, t, TcptimerDONE); + t->readynext = timeo; + timeo = t; + } + } + } + qunlock(&priv->tl); + + loop = 0; + for(t = timeo; t != nil; t = t->readynext) { + if(loop++ > 10000) + panic("tcpackproc2"); + if(t->state == TcptimerDONE && t->func != nil && !waserror()){ + (*t->func)(t->arg); + poperror(); + } + } + + 
limborexmit(tcp); + } +} + +void +tcpgo(Tcppriv *priv, Tcptimer *t) +{ + if(t == nil || t->start == 0) + return; + + qlock(&priv->tl); + t->count = t->start; + timerstate(priv, t, TcptimerON); + qunlock(&priv->tl); +} + +void +tcphalt(Tcppriv *priv, Tcptimer *t) +{ + if(t == nil) + return; + + qlock(&priv->tl); + timerstate(priv, t, TcptimerOFF); + qunlock(&priv->tl); +} + +int +backoff(int n) +{ + return 1 << n; +} + +void +localclose(Conv *s, char *reason) /* called with tcb locked */ +{ + Tcpctl *tcb; + Reseq *rp,*rp1; + Tcppriv *tpriv; + + tpriv = s->p->priv; + tcb = (Tcpctl*)s->ptcl; + + iphtrem(&tpriv->ht, s); + + tcphalt(tpriv, &tcb->timer); + tcphalt(tpriv, &tcb->rtt_timer); + tcphalt(tpriv, &tcb->acktimer); + tcphalt(tpriv, &tcb->katimer); + + /* Flush reassembly queue; nothing more can arrive */ + for(rp = tcb->reseq; rp != nil; rp = rp1) { + rp1 = rp->next; + freeblist(rp->bp); + free(rp); + } + tcb->reseq = nil; + + if(tcb->state == Syn_sent) + Fsconnected(s, reason); + if(s->state == Announced) + wakeup(&s->listenr); + + qhangup(s->rq, reason); + qhangup(s->wq, reason); + + tcpsetstate(s, Closed); +} + +/* mtu (- TCP + IP hdr len) of 1st hop */ +int +tcpmtu(Proto *tcp, uchar *addr, int version, int *scale) +{ + Ipifc *ifc; + int mtu; + + ifc = findipifc(tcp->f, addr, 0); + switch(version){ + default: + case V4: + mtu = DEF_MSS; + if(ifc != nil) + mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); + break; + case V6: + mtu = DEF_MSS6; + if(ifc != nil) + mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); + break; + } + if(ifc != nil){ + if(ifc->mbps > 1000) + *scale = HaveWS | 4; + else if(ifc->mbps > 100) + *scale = HaveWS | 3; + else if(ifc->mbps > 10) + *scale = HaveWS | 1; + else + *scale = HaveWS | 0; + } else + *scale = HaveWS | 0; + + return mtu; +} + +void +inittcpctl(Conv *s, int mode) +{ + Tcpctl *tcb; + Tcp4hdr* h4; + Tcp6hdr* h6; + int mss; + + tcb = (Tcpctl*)s->ptcl; + + memset(tcb, 0, sizeof(Tcpctl)); + + tcb->ssthresh 
= 65535; + tcb->srtt = tcp_irtt<<LOGAGAIN; + tcb->mdev = 0; + + /* setup timers */ + tcb->timer.start = tcp_irtt / MSPTICK; + tcb->timer.func = tcptimeout; + tcb->timer.arg = s; + tcb->rtt_timer.start = MAX_TIME; + tcb->acktimer.start = TCP_ACK / MSPTICK; + tcb->acktimer.func = tcpacktimer; + tcb->acktimer.arg = s; + tcb->katimer.start = DEF_KAT / MSPTICK; + tcb->katimer.func = tcpkeepalive; + tcb->katimer.arg = s; + + mss = DEF_MSS; + + /* create a prototype(pseudo) header */ + if(mode != TCP_LISTEN){ + if(ipcmp(s->laddr, IPnoaddr) == 0) + findlocalip(s->p->f, s->laddr, s->raddr); + + switch(s->ipversion){ + case V4: + h4 = &tcb->protohdr.tcp4hdr; + memset(h4, 0, sizeof(*h4)); + h4->proto = IP_TCPPROTO; + hnputs(h4->tcpsport, s->lport); + hnputs(h4->tcpdport, s->rport); + v6tov4(h4->tcpsrc, s->laddr); + v6tov4(h4->tcpdst, s->raddr); + break; + case V6: + h6 = &tcb->protohdr.tcp6hdr; + memset(h6, 0, sizeof(*h6)); + h6->proto = IP_TCPPROTO; + hnputs(h6->tcpsport, s->lport); + hnputs(h6->tcpdport, s->rport); + ipmove(h6->tcpsrc, s->laddr); + ipmove(h6->tcpdst, s->raddr); + mss = DEF_MSS6; + break; + default: + panic("inittcpctl: version %d", s->ipversion); + } + } + + tcb->mss = tcb->cwind = mss; + + /* default is no window scaling */ + tcb->window = QMAX; + tcb->rcv.wnd = QMAX; + tcb->rcv.scale = 0; + tcb->snd.scale = 0; + qsetlimit(s->rq, QMAX); +} + +/* + * called with s QLOCKed + */ +void +tcpstart(Conv *s, int mode) +{ + Tcpctl *tcb; + Tcppriv *tpriv; + char kpname[KNAMELEN]; + + tpriv = s->p->priv; + + if(tpriv->ackprocstarted == 0){ + qlock(&tpriv->apl); + if(tpriv->ackprocstarted == 0){ + sprint(kpname, "#I%dtcpack", s->p->f->dev); + kproc(kpname, tcpackproc, s->p); + tpriv->ackprocstarted = 1; + } + qunlock(&tpriv->apl); + } + + tcb = (Tcpctl*)s->ptcl; + + inittcpctl(s, mode); + + iphtadd(&tpriv->ht, s); + switch(mode) { + case TCP_LISTEN: + tpriv->stats[PassiveOpens]++; + tcb->flags |= CLONE; + tcpsetstate(s, Listen); + break; + + case TCP_CONNECT: + 
tpriv->stats[ActiveOpens]++; + tcb->flags |= ACTIVE; + tcpsndsyn(s, tcb); + tcpsetstate(s, Syn_sent); + tcpoutput(s); + break; + } +} + +static char* +tcpflag(ushort flag) +{ + static char buf[128]; + + sprint(buf, "%d", flag>>10); /* Head len */ + if(flag & URG) + strcat(buf, " URG"); + if(flag & ACK) + strcat(buf, " ACK"); + if(flag & PSH) + strcat(buf, " PSH"); + if(flag & RST) + strcat(buf, " RST"); + if(flag & SYN) + strcat(buf, " SYN"); + if(flag & FIN) + strcat(buf, " FIN"); + + return buf; +} + +Block * +htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb) +{ + int dlen; + Tcp6hdr *h; + ushort csum; + ushort hdrlen, optpad = 0; + uchar *opt; + + hdrlen = TCP6_HDRSIZE; + if(tcph->flags & SYN){ + if(tcph->mss) + hdrlen += MSS_LENGTH; + if(tcph->ws) + hdrlen += WS_LENGTH; + optpad = hdrlen & 3; + if(optpad) + optpad = 4 - optpad; + hdrlen += optpad; + } + + if(data) { + dlen = blocklen(data); + data = padblock(data, hdrlen + TCP6_PKT); + if(data == nil) + return nil; + } + else { + dlen = 0; + data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */ + if(data == nil) + return nil; + data->wp += hdrlen + TCP6_PKT; + } + + /* copy in pseudo ip header plus port numbers */ + h = (Tcp6hdr *)(data->rp); + memmove(h, ph, TCP6_TCBPHDRSZ); + + /* compose pseudo tcp header, do cksum calculation */ + hnputl(h->vcf, hdrlen + dlen); + h->ploadlen[0] = h->ploadlen[1] = h->proto = 0; + h->ttl = ph->proto; + + /* copy in variable bits */ + hnputl(h->tcpseq, tcph->seq); + hnputl(h->tcpack, tcph->ack); + hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); + hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? 
tcb->snd.scale : 0)); + hnputs(h->tcpurg, tcph->urg); + + if(tcph->flags & SYN){ + opt = h->tcpopt; + if(tcph->mss != 0){ + *opt++ = MSSOPT; + *opt++ = MSS_LENGTH; + hnputs(opt, tcph->mss); + opt += 2; + } + if(tcph->ws != 0){ + *opt++ = WSOPT; + *opt++ = WS_LENGTH; + *opt++ = tcph->ws; + } + while(optpad-- > 0) + *opt++ = NOOPOPT; + } + + if(tcb != nil && tcb->nochecksum){ + h->tcpcksum[0] = h->tcpcksum[1] = 0; + } else { + csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE); + hnputs(h->tcpcksum, csum); + } + + /* move from pseudo header back to normal ip header */ + memset(h->vcf, 0, 4); + h->vcf[0] = IP_VER6; + hnputs(h->ploadlen, hdrlen+dlen); + h->proto = ph->proto; + + return data; +} + +Block * +htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) +{ + int dlen; + Tcp4hdr *h; + ushort csum; + ushort hdrlen, optpad = 0; + uchar *opt; + + hdrlen = TCP4_HDRSIZE; + if(tcph->flags & SYN){ + if(tcph->mss) + hdrlen += MSS_LENGTH; + if(tcph->ws) + hdrlen += WS_LENGTH; + optpad = hdrlen & 3; + if(optpad) + optpad = 4 - optpad; + hdrlen += optpad; + } + + if(data) { + dlen = blocklen(data); + data = padblock(data, hdrlen + TCP4_PKT); + if(data == nil) + return nil; + } + else { + dlen = 0; + data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */ + if(data == nil) + return nil; + data->wp += hdrlen + TCP4_PKT; + } + + /* copy in pseudo ip header plus port numbers */ + h = (Tcp4hdr *)(data->rp); + memmove(h, ph, TCP4_TCBPHDRSZ); + + /* copy in variable bits */ + hnputs(h->tcplen, hdrlen + dlen); + hnputl(h->tcpseq, tcph->seq); + hnputl(h->tcpack, tcph->ack); + hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); + hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? 
tcb->snd.scale : 0)); + hnputs(h->tcpurg, tcph->urg); + + if(tcph->flags & SYN){ + opt = h->tcpopt; + if(tcph->mss != 0){ + *opt++ = MSSOPT; + *opt++ = MSS_LENGTH; + hnputs(opt, tcph->mss); + opt += 2; + } + if(tcph->ws != 0){ + *opt++ = WSOPT; + *opt++ = WS_LENGTH; + *opt++ = tcph->ws; + } + while(optpad-- > 0) + *opt++ = NOOPOPT; + } + + if(tcb != nil && tcb->nochecksum){ + h->tcpcksum[0] = h->tcpcksum[1] = 0; + } else { + csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE); + hnputs(h->tcpcksum, csum); + } + + return data; +} + +int +ntohtcp6(Tcp *tcph, Block **bpp) +{ + Tcp6hdr *h; + uchar *optr; + ushort hdrlen; + ushort optlen; + int n; + + *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE); + if(*bpp == nil) + return -1; + + h = (Tcp6hdr *)((*bpp)->rp); + tcph->source = nhgets(h->tcpsport); + tcph->dest = nhgets(h->tcpdport); + tcph->seq = nhgetl(h->tcpseq); + tcph->ack = nhgetl(h->tcpack); + hdrlen = (h->tcpflag[0]>>2) & ~3; + if(hdrlen < TCP6_HDRSIZE) { + freeblist(*bpp); + return -1; + } + + tcph->flags = h->tcpflag[1]; + tcph->wnd = nhgets(h->tcpwin); + tcph->urg = nhgets(h->tcpurg); + tcph->mss = 0; + tcph->ws = 0; + tcph->len = nhgets(h->ploadlen) - hdrlen; + + *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT); + if(*bpp == nil) + return -1; + + optr = h->tcpopt; + n = hdrlen - TCP6_HDRSIZE; + while(n > 0 && *optr != EOLOPT) { + if(*optr == NOOPOPT) { + n--; + optr++; + continue; + } + optlen = optr[1]; + if(optlen < 2 || optlen > n) + break; + switch(*optr) { + case MSSOPT: + if(optlen == MSS_LENGTH) + tcph->mss = nhgets(optr+2); + break; + case WSOPT: + if(optlen == WS_LENGTH && *(optr+2) <= 14) + tcph->ws = HaveWS | *(optr+2); + break; + } + n -= optlen; + optr += optlen; + } + return hdrlen; +} + +int +ntohtcp4(Tcp *tcph, Block **bpp) +{ + Tcp4hdr *h; + uchar *optr; + ushort hdrlen; + ushort optlen; + int n; + + *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE); + if(*bpp == nil) + return -1; + + h = (Tcp4hdr *)((*bpp)->rp); + tcph->source = 
nhgets(h->tcpsport); + tcph->dest = nhgets(h->tcpdport); + tcph->seq = nhgetl(h->tcpseq); + tcph->ack = nhgetl(h->tcpack); + + hdrlen = (h->tcpflag[0]>>2) & ~3; + if(hdrlen < TCP4_HDRSIZE) { + freeblist(*bpp); + return -1; + } + + tcph->flags = h->tcpflag[1]; + tcph->wnd = nhgets(h->tcpwin); + tcph->urg = nhgets(h->tcpurg); + tcph->mss = 0; + tcph->ws = 0; + tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); + + *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT); + if(*bpp == nil) + return -1; + + optr = h->tcpopt; + n = hdrlen - TCP4_HDRSIZE; + while(n > 0 && *optr != EOLOPT) { + if(*optr == NOOPOPT) { + n--; + optr++; + continue; + } + optlen = optr[1]; + if(optlen < 2 || optlen > n) + break; + switch(*optr) { + case MSSOPT: + if(optlen == MSS_LENGTH) + tcph->mss = nhgets(optr+2); + break; + case WSOPT: + if(optlen == WS_LENGTH && *(optr+2) <= 14) + tcph->ws = HaveWS | *(optr+2); + break; + } + n -= optlen; + optr += optlen; + } + return hdrlen; +} + +/* + * For outgiing calls, generate an initial sequence + * number and put a SYN on the send queue + */ +void +tcpsndsyn(Conv *s, Tcpctl *tcb) +{ + tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16); + tcb->rttseq = tcb->iss; + tcb->snd.wl2 = tcb->iss; + tcb->snd.una = tcb->iss; + tcb->snd.ptr = tcb->rttseq; + tcb->snd.nxt = tcb->rttseq; + tcb->flgcnt++; + tcb->flags |= FORCE; + tcb->sndsyntime = NOW; + + /* set desired mss and scale */ + tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale); +} + +void +sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason) +{ + Block *hbp; + uchar rflags; + Tcppriv *tpriv; + Tcp4hdr ph4; + Tcp6hdr ph6; + + netlog(tcp->f, Logtcp, "sndrst: %s\n", reason); + + tpriv = tcp->priv; + + if(seg->flags & RST) + return; + + /* make pseudo header */ + switch(version) { + case V4: + memset(&ph4, 0, sizeof(ph4)); + ph4.vihl = IP_VER4; + v6tov4(ph4.tcpsrc, dest); + v6tov4(ph4.tcpdst, source); + ph4.proto = IP_TCPPROTO; + hnputs(ph4.tcplen, TCP4_HDRSIZE); + 
hnputs(ph4.tcpsport, seg->dest); + hnputs(ph4.tcpdport, seg->source); + break; + case V6: + memset(&ph6, 0, sizeof(ph6)); + ph6.vcf[0] = IP_VER6; + ipmove(ph6.tcpsrc, dest); + ipmove(ph6.tcpdst, source); + ph6.proto = IP_TCPPROTO; + hnputs(ph6.ploadlen, TCP6_HDRSIZE); + hnputs(ph6.tcpsport, seg->dest); + hnputs(ph6.tcpdport, seg->source); + break; + default: + panic("sndrst: version %d", version); + } + + tpriv->stats[OutRsts]++; + rflags = RST; + + /* convince the other end that this reset is in band */ + if(seg->flags & ACK) { + seg->seq = seg->ack; + seg->ack = 0; + } + else { + rflags |= ACK; + seg->ack = seg->seq; + seg->seq = 0; + if(seg->flags & SYN) + seg->ack++; + seg->ack += length; + if(seg->flags & FIN) + seg->ack++; + } + seg->flags = rflags; + seg->wnd = 0; + seg->urg = 0; + seg->mss = 0; + seg->ws = 0; + switch(version) { + case V4: + hbp = htontcp4(seg, nil, &ph4, nil); + if(hbp == nil) + return; + ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); + break; + case V6: + hbp = htontcp6(seg, nil, &ph6, nil); + if(hbp == nil) + return; + ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); + break; + default: + panic("sndrst2: version %d", version); + } +} + +/* + * send a reset to the remote side and close the conversation + * called with s QLOCKed + */ +char* +tcphangup(Conv *s) +{ + Tcp seg; + Tcpctl *tcb; + Block *hbp; + + tcb = (Tcpctl*)s->ptcl; + if(waserror()) + return commonerror(); + if(ipcmp(s->raddr, IPnoaddr) != 0) { + if(!waserror()){ + seg.flags = RST | ACK; + seg.ack = tcb->rcv.nxt; + tcb->rcv.una = 0; + seg.seq = tcb->snd.ptr; + seg.wnd = 0; + seg.urg = 0; + seg.mss = 0; + seg.ws = 0; + switch(s->ipversion) { + case V4: + tcb->protohdr.tcp4hdr.vihl = IP_VER4; + hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb); + ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); + break; + case V6: + tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; + hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb); + ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); + break; + default: 
+ panic("tcphangup: version %d", s->ipversion); + } + poperror(); + } + } + localclose(s, nil); + poperror(); + return nil; +} + +/* + * (re)send a SYN ACK + */ +int +sndsynack(Proto *tcp, Limbo *lp) +{ + Block *hbp; + Tcp4hdr ph4; + Tcp6hdr ph6; + Tcp seg; + int scale; + + /* make pseudo header */ + switch(lp->version) { + case V4: + memset(&ph4, 0, sizeof(ph4)); + ph4.vihl = IP_VER4; + v6tov4(ph4.tcpsrc, lp->laddr); + v6tov4(ph4.tcpdst, lp->raddr); + ph4.proto = IP_TCPPROTO; + hnputs(ph4.tcplen, TCP4_HDRSIZE); + hnputs(ph4.tcpsport, lp->lport); + hnputs(ph4.tcpdport, lp->rport); + break; + case V6: + memset(&ph6, 0, sizeof(ph6)); + ph6.vcf[0] = IP_VER6; + ipmove(ph6.tcpsrc, lp->laddr); + ipmove(ph6.tcpdst, lp->raddr); + ph6.proto = IP_TCPPROTO; + hnputs(ph6.ploadlen, TCP6_HDRSIZE); + hnputs(ph6.tcpsport, lp->lport); + hnputs(ph6.tcpdport, lp->rport); + break; + default: + panic("sndrst: version %d", lp->version); + } + + seg.seq = lp->iss; + seg.ack = lp->irs+1; + seg.flags = SYN|ACK; + seg.urg = 0; + seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale); + seg.wnd = QMAX; + + /* if the other side set scale, we should too */ + if(lp->rcvscale){ + seg.ws = scale; + lp->sndscale = scale; + } else { + seg.ws = 0; + lp->sndscale = 0; + } + + switch(lp->version) { + case V4: + hbp = htontcp4(&seg, nil, &ph4, nil); + if(hbp == nil) + return -1; + ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); + break; + case V6: + hbp = htontcp6(&seg, nil, &ph6, nil); + if(hbp == nil) + return -1; + ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); + break; + default: + panic("sndsnack: version %d", lp->version); + } + lp->lastsend = NOW; + return 0; +} + +#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK ) + +/* + * put a call into limbo and respond with a SYN ACK + * + * called with proto locked + */ +static void +limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version) +{ + Limbo *lp, **l; + Tcppriv *tpriv; + int h; + + tpriv = s->p->priv; + h = 
hashipa(source, seg->source); + + for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ + lp = *l; + if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version) + continue; + if(ipcmp(lp->raddr, source) != 0) + continue; + if(ipcmp(lp->laddr, dest) != 0) + continue; + + /* each new SYN restarts the retransmits */ + lp->irs = seg->seq; + break; + } + lp = *l; + if(lp == nil){ + if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){ + lp = tpriv->lht[h]; + tpriv->lht[h] = lp->next; + lp->next = nil; + } else { + lp = malloc(sizeof(*lp)); + if(lp == nil) + return; + tpriv->nlimbo++; + } + *l = lp; + lp->version = version; + ipmove(lp->laddr, dest); + ipmove(lp->raddr, source); + lp->lport = seg->dest; + lp->rport = seg->source; + lp->mss = seg->mss; + lp->rcvscale = seg->ws; + lp->irs = seg->seq; + lp->iss = (nrand(1<<16)<<16)|nrand(1<<16); + } + + if(sndsynack(s->p, lp) < 0){ + *l = lp->next; + tpriv->nlimbo--; + free(lp); + } +} + +/* + * resend SYN ACK's once every SYNACK_RXTIMER ms. + */ +static void +limborexmit(Proto *tcp) +{ + Tcppriv *tpriv; + Limbo **l, *lp; + int h; + int seen; + ulong now; + + tpriv = tcp->priv; + + if(!CANQLOCK(tcp)) + return; + seen = 0; + now = NOW; + for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){ + for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){ + lp = *l; + seen++; + if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER) + continue; + + /* time it out after 1 second */ + if(++(lp->rexmits) > 5){ + tpriv->nlimbo--; + *l = lp->next; + free(lp); + continue; + } + + /* if we're being attacked, don't bother resending SYN ACK's */ + if(tpriv->nlimbo > 100) + continue; + + if(sndsynack(tcp, lp) < 0){ + tpriv->nlimbo--; + *l = lp->next; + free(lp); + continue; + } + + l = &lp->next; + } + } + QUNLOCK(tcp); +} + +/* + * lookup call in limbo. if found, throw it out. 
+ * + * called with proto locked + */ +static void +limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) +{ + Limbo *lp, **l; + int h; + Tcppriv *tpriv; + + tpriv = s->p->priv; + + /* find a call in limbo */ + h = hashipa(src, segp->source); + for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ + lp = *l; + if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) + continue; + if(ipcmp(lp->laddr, dst) != 0) + continue; + if(ipcmp(lp->raddr, src) != 0) + continue; + + /* RST can only follow the SYN */ + if(segp->seq == lp->irs+1){ + tpriv->nlimbo--; + *l = lp->next; + free(lp); + } + break; + } +} + +/* + * come here when we finally get an ACK to our SYN-ACK. + * lookup call in limbo. if found, create a new conversation + * + * called with proto locked + */ +static Conv* +tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) +{ + Conv *new; + Tcpctl *tcb; + Tcppriv *tpriv; + Tcp4hdr *h4; + Tcp6hdr *h6; + Limbo *lp, **l; + int h; + + /* unless it's just an ack, it can't be someone coming out of limbo */ + if((segp->flags & SYN) || (segp->flags & ACK) == 0) + return nil; + + tpriv = s->p->priv; + + /* find a call in limbo */ + h = hashipa(src, segp->source); + for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){ + netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d\n", + src, segp->source, lp->raddr, lp->rport, + dst, segp->dest, lp->laddr, lp->lport, + version, lp->version + ); + + if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) + continue; + if(ipcmp(lp->laddr, dst) != 0) + continue; + if(ipcmp(lp->raddr, src) != 0) + continue; + + /* we're assuming no data with the initial SYN */ + if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){ + netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n", + segp->seq, lp->irs+1, segp->ack, lp->iss+1); + lp = nil; + } else { + tpriv->nlimbo--; + *l = lp->next; + } + break; + } + if(lp == nil) + return 
nil; + + new = Fsnewcall(s, src, segp->source, dst, segp->dest, version); + if(new == nil) + return nil; + + memmove(new->ptcl, s->ptcl, sizeof(Tcpctl)); + tcb = (Tcpctl*)new->ptcl; + tcb->flags &= ~CLONE; + tcb->timer.arg = new; + tcb->timer.state = TcptimerOFF; + tcb->acktimer.arg = new; + tcb->acktimer.state = TcptimerOFF; + tcb->katimer.arg = new; + tcb->katimer.state = TcptimerOFF; + tcb->rtt_timer.arg = new; + tcb->rtt_timer.state = TcptimerOFF; + + tcb->irs = lp->irs; + tcb->rcv.nxt = tcb->irs+1; + tcb->rcv.urg = tcb->rcv.nxt; + + tcb->iss = lp->iss; + tcb->rttseq = tcb->iss; + tcb->snd.wl2 = tcb->iss; + tcb->snd.una = tcb->iss+1; + tcb->snd.ptr = tcb->iss+1; + tcb->snd.nxt = tcb->iss+1; + tcb->flgcnt = 0; + tcb->flags |= SYNACK; + + /* our sending max segment size cannot be bigger than what he asked for */ + if(lp->mss != 0 && lp->mss < tcb->mss) + tcb->mss = lp->mss; + + /* window scaling */ + tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); + + /* the congestion window always starts out as a single segment */ + tcb->snd.wnd = segp->wnd; + tcb->cwind = tcb->mss; + + /* set initial round trip time */ + tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER; + tcpsynackrtt(new); + + free(lp); + + /* set up proto header */ + switch(version){ + case V4: + h4 = &tcb->protohdr.tcp4hdr; + memset(h4, 0, sizeof(*h4)); + h4->proto = IP_TCPPROTO; + hnputs(h4->tcpsport, new->lport); + hnputs(h4->tcpdport, new->rport); + v6tov4(h4->tcpsrc, dst); + v6tov4(h4->tcpdst, src); + break; + case V6: + h6 = &tcb->protohdr.tcp6hdr; + memset(h6, 0, sizeof(*h6)); + h6->proto = IP_TCPPROTO; + hnputs(h6->tcpsport, new->lport); + hnputs(h6->tcpdport, new->rport); + ipmove(h6->tcpsrc, dst); + ipmove(h6->tcpdst, src); + break; + default: + panic("tcpincoming: version %d", new->ipversion); + } + + tcpsetstate(new, Established); + + iphtadd(&tpriv->ht, new); + + return new; +} + +int +seq_within(ulong x, ulong low, ulong high) +{ + if(low <= high){ + if(low <= x && x <= high) + 
return 1; + } + else { + if(x >= low || x <= high) + return 1; + } + return 0; +} + +int +seq_lt(ulong x, ulong y) +{ + return (int)(x-y) < 0; +} + +int +seq_le(ulong x, ulong y) +{ + return (int)(x-y) <= 0; +} + +int +seq_gt(ulong x, ulong y) +{ + return (int)(x-y) > 0; +} + +int +seq_ge(ulong x, ulong y) +{ + return (int)(x-y) >= 0; +} + +/* + * use the time between the first SYN and it's ack as the + * initial round trip time + */ +void +tcpsynackrtt(Conv *s) +{ + Tcpctl *tcb; + int delta; + Tcppriv *tpriv; + + tcb = (Tcpctl*)s->ptcl; + tpriv = s->p->priv; + + delta = NOW - tcb->sndsyntime; + tcb->srtt = delta<<LOGAGAIN; + tcb->mdev = delta<<LOGDGAIN; + + /* halt round trip timer */ + tcphalt(tpriv, &tcb->rtt_timer); +} + +void +update(Conv *s, Tcp *seg) +{ + int rtt, delta; + Tcpctl *tcb; + ulong acked; + ulong expand; + Tcppriv *tpriv; + + tpriv = s->p->priv; + tcb = (Tcpctl*)s->ptcl; + + /* if everything has been acked, force output(?) */ + if(seq_gt(seg->ack, tcb->snd.nxt)) { + tcb->flags |= FORCE; + return; + } + + /* added by Dong Lin for fast retransmission */ + if(seg->ack == tcb->snd.una + && tcb->snd.una != tcb->snd.nxt + && seg->len == 0 + && seg->wnd == tcb->snd.wnd) { + + /* this is a pure ack w/o window update */ + netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n", + tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd); + + if(++tcb->snd.dupacks == TCPREXMTTHRESH) { + /* + * tahoe tcp rxt the packet, half sshthresh, + * and set cwnd to one packet + */ + tcb->snd.recovery = 1; + tcb->snd.rxt = tcb->snd.nxt; + netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt); + tcprxmit(s); + } else { + /* do reno tcp here. 
*/ + } + } + + /* + * update window + */ + if(seq_gt(seg->ack, tcb->snd.wl2) + || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){ + tcb->snd.wnd = seg->wnd; + tcb->snd.wl2 = seg->ack; + } + + if(!seq_gt(seg->ack, tcb->snd.una)){ + /* + * don't let us hangup if sending into a closed window and + * we're still getting acks + */ + if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){ + tcb->backedoff = MAXBACKMS/4; + } + return; + } + + /* + * any positive ack turns off fast rxt, + * (should we do new-reno on partial acks?) + */ + if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) { + tcb->snd.dupacks = 0; + tcb->snd.recovery = 0; + } else + netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind); + + /* Compute the new send window size */ + acked = seg->ack - tcb->snd.una; + + /* avoid slow start and timers for SYN acks */ + if((tcb->flags & SYNACK) == 0) { + tcb->flags |= SYNACK; + acked--; + tcb->flgcnt--; + goto done; + } + + /* slow start as long as we're not recovering from lost packets */ + if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { + if(tcb->cwind < tcb->ssthresh) { + expand = tcb->mss; + if(acked < expand) + expand = acked; + } + else + expand = ((int)tcb->mss * tcb->mss) / tcb->cwind; + + if(tcb->cwind + expand < tcb->cwind) + expand = tcb->snd.wnd - tcb->cwind; + if(tcb->cwind + expand > tcb->snd.wnd) + expand = tcb->snd.wnd - tcb->cwind; + tcb->cwind += expand; + } + + /* Adjust the timers according to the round trip time */ + if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { + tcphalt(tpriv, &tcb->rtt_timer); + if((tcb->flags&RETRAN) == 0) { + tcb->backoff = 0; + tcb->backedoff = 0; + rtt = tcb->rtt_timer.start - tcb->rtt_timer.count; + if(rtt == 0) + rtt = 1; /* otherwise all close systems will rexmit in 0 time */ + rtt *= MSPTICK; + if(tcb->srtt == 0) { + tcb->srtt = rtt << LOGAGAIN; + tcb->mdev = rtt << LOGDGAIN; + } else { + delta = rtt - (tcb->srtt>>LOGAGAIN); + tcb->srtt += delta; + 
if(tcb->srtt <= 0) + tcb->srtt = 1; + + delta = abs(delta) - (tcb->mdev>>LOGDGAIN); + tcb->mdev += delta; + if(tcb->mdev <= 0) + tcb->mdev = 1; + } + tcpsettimer(tcb); + } + } + +done: + if(qdiscard(s->wq, acked) < acked) + tcb->flgcnt--; + + tcb->snd.una = seg->ack; + if(seq_gt(seg->ack, tcb->snd.urg)) + tcb->snd.urg = seg->ack; + + if(tcb->snd.una != tcb->snd.nxt) + tcpgo(tpriv, &tcb->timer); + else + tcphalt(tpriv, &tcb->timer); + + if(seq_lt(tcb->snd.ptr, tcb->snd.una)) + tcb->snd.ptr = tcb->snd.una; + + tcb->flags &= ~RETRAN; + tcb->backoff = 0; + tcb->backedoff = 0; +} + +void +tcpiput(Proto *tcp, Ipifc* _, Block *bp) +{ + Tcp seg; + Tcp4hdr *h4; + Tcp6hdr *h6; + int hdrlen; + Tcpctl *tcb; + ushort length, csum; + uchar source[IPaddrlen], dest[IPaddrlen]; + Conv *s; + Fs *f; + Tcppriv *tpriv; + uchar version; + + f = tcp->f; + tpriv = tcp->priv; + + tpriv->stats[InSegs]++; + + h4 = (Tcp4hdr*)(bp->rp); + h6 = (Tcp6hdr*)(bp->rp); + + if((h4->vihl&0xF0)==IP_VER4) { + version = V4; + length = nhgets(h4->length); + v4tov6(dest, h4->tcpdst); + v4tov6(source, h4->tcpsrc); + + h4->Unused = 0; + hnputs(h4->tcplen, length-TCP4_PKT); + if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) && + ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) { + tpriv->stats[CsumErrs]++; + tpriv->stats[InErrs]++; + netlog(f, Logtcp, "bad tcp proto cksum\n"); + freeblist(bp); + return; + } + + hdrlen = ntohtcp4(&seg, &bp); + if(hdrlen < 0){ + tpriv->stats[HlenErrs]++; + tpriv->stats[InErrs]++; + netlog(f, Logtcp, "bad tcp hdr len\n"); + return; + } + + /* trim the packet to the size claimed by the datagram */ + length -= hdrlen+TCP4_PKT; + bp = trimblock(bp, hdrlen+TCP4_PKT, length); + if(bp == nil){ + tpriv->stats[LenErrs]++; + tpriv->stats[InErrs]++; + netlog(f, Logtcp, "tcp len < 0 after trim\n"); + return; + } + } + else { + int ttl = h6->ttl; + int proto = h6->proto; + + version = V6; + length = nhgets(h6->ploadlen); + ipmove(dest, h6->tcpdst); + ipmove(source, h6->tcpsrc); + 
+ h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0; + h6->ttl = proto; + hnputl(h6->vcf, length); + if((h6->tcpcksum[0] || h6->tcpcksum[1]) && + (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) { + tpriv->stats[CsumErrs]++; + tpriv->stats[InErrs]++; + netlog(f, Logtcp, + "bad tcpv6 proto cksum: got %#ux, computed %#ux\n", + h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum); + freeblist(bp); + return; + } + h6->ttl = ttl; + h6->proto = proto; + hnputs(h6->ploadlen, length); + + hdrlen = ntohtcp6(&seg, &bp); + if(hdrlen < 0){ + tpriv->stats[HlenErrs]++; + tpriv->stats[InErrs]++; + netlog(f, Logtcp, "bad tcpv6 hdr len\n"); + return; + } + + /* trim the packet to the size claimed by the datagram */ + length -= hdrlen; + bp = trimblock(bp, hdrlen+TCP6_PKT, length); + if(bp == nil){ + tpriv->stats[LenErrs]++; + tpriv->stats[InErrs]++; + netlog(f, Logtcp, "tcpv6 len < 0 after trim\n"); + return; + } + } + + /* lock protocol while searching for a conversation */ + QLOCK(tcp); + + /* Look for a matching conversation */ + s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); + if(s == nil){ + netlog(f, Logtcp, "iphtlook failed\n"); +reset: + QUNLOCK(tcp); + sndrst(tcp, source, dest, length, &seg, version, "no conversation"); + freeblist(bp); + return; + } + + /* if it's a listener, look for the right flags and get a new conv */ + tcb = (Tcpctl*)s->ptcl; + if(tcb->state == Listen){ + if(seg.flags & RST){ + limborst(s, &seg, source, dest, version); + QUNLOCK(tcp); + freeblist(bp); + return; + } + + /* if this is a new SYN, put the call into limbo */ + if((seg.flags & SYN) && (seg.flags & ACK) == 0){ + limbo(s, source, dest, &seg, version); + QUNLOCK(tcp); + freeblist(bp); + return; + } + + /* + * if there's a matching call in limbo, tcpincoming will + * return it in state Syn_received + */ + s = tcpincoming(s, &seg, source, dest, version); + if(s == nil) + goto reset; + } + + /* The rest of the input state machine is run with the control block + * locked and 
implements the state machine directly out of the RFC. + * Out-of-band data is ignored - it was always a bad idea. + */ + tcb = (Tcpctl*)s->ptcl; + if(waserror()){ + QUNLOCK(s); + nexterror(); + } + QLOCK(s); + QUNLOCK(tcp); + + /* fix up window */ + seg.wnd <<= tcb->rcv.scale; + + /* every input packet in puts off the keep alive time out */ + tcpsetkacounter(tcb); + + switch(tcb->state) { + case Closed: + sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); + goto raise; + case Syn_sent: + if(seg.flags & ACK) { + if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) { + sndrst(tcp, source, dest, length, &seg, version, + "bad seq in Syn_sent"); + goto raise; + } + } + if(seg.flags & RST) { + if(seg.flags & ACK) + localclose(s, Econrefused); + goto raise; + } + + if(seg.flags & SYN) { + procsyn(s, &seg); + if(seg.flags & ACK){ + update(s, &seg); + tcpsynackrtt(s); + tcpsetstate(s, Established); + tcpsetscale(s, tcb, seg.ws, tcb->scale); + } + else { + tcb->time = NOW; + tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */ + } + + if(length != 0 || (seg.flags & FIN)) + break; + + freeblist(bp); + goto output; + } + else + freeblist(bp); + + QUNLOCK(s); + poperror(); + return; + case Syn_received: + /* doesn't matter if it's the correct ack, we're just trying to set timing */ + if(seg.flags & ACK) + tcpsynackrtt(s); + break; + } + + /* + * One DOS attack is to open connections to us and then forget about them, + * thereby tying up a conv at no long term cost to the attacker. + * This is an attempt to defeat these stateless DOS attacks. See + * corresponding code in tcpsendka(). 
+ */ + if(tcb->state != Syn_received && (seg.flags & RST) == 0){ + if(tcpporthogdefense + && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){ + print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n", + source, seg.source, dest, seg.dest, seg.flags, + tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29)); + localclose(s, "stateless hog"); + } + } + + /* Cut the data to fit the receive window */ + if(tcptrim(tcb, &seg, &bp, &length) == -1) { + netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length); + update(s, &seg); + if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) { + tcphalt(tpriv, &tcb->rtt_timer); + tcphalt(tpriv, &tcb->acktimer); + tcphalt(tpriv, &tcb->katimer); + tcpsetstate(s, Time_wait); + tcb->timer.start = MSL2*(1000 / MSPTICK); + tcpgo(tpriv, &tcb->timer); + } + if(!(seg.flags & RST)) { + tcb->flags |= FORCE; + goto output; + } + QUNLOCK(s); + poperror(); + return; + } + + /* Cannot accept so answer with a rst */ + if(length && tcb->state == Closed) { + sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); + goto raise; + } + + /* The segment is beyond the current receive pointer so + * queue the data in the resequence queue + */ + if(seg.seq != tcb->rcv.nxt) + if(length != 0 || (seg.flags & (SYN|FIN))) { + update(s, &seg); + if(addreseq(tcb, tpriv, &seg, bp, length) < 0) + print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); + tcb->flags |= FORCE; + goto output; + } + + /* + * keep looping till we've processed this packet plus any + * adjacent packets in the resequence queue + */ + for(;;) { + if(seg.flags & RST) { + if(tcb->state == Established) { + tpriv->stats[EstabResets]++; + if(tcb->rcv.nxt != seg.seq) + print("out of order RST rcvd: %I.%d -> %I.%d, rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq); + } + localclose(s, Econrefused); + goto raise; + } + + if((seg.flags&ACK) == 0) + goto raise; + + switch(tcb->state) { + case 
Syn_received: + if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){ + sndrst(tcp, source, dest, length, &seg, version, + "bad seq in Syn_received"); + goto raise; + } + update(s, &seg); + tcpsetstate(s, Established); + case Established: + case Close_wait: + update(s, &seg); + break; + case Finwait1: + update(s, &seg); + if(qlen(s->wq)+tcb->flgcnt == 0){ + tcphalt(tpriv, &tcb->rtt_timer); + tcphalt(tpriv, &tcb->acktimer); + tcpsetkacounter(tcb); + tcb->time = NOW; + tcpsetstate(s, Finwait2); + tcb->katimer.start = MSL2 * (1000 / MSPTICK); + tcpgo(tpriv, &tcb->katimer); + } + break; + case Finwait2: + update(s, &seg); + break; + case Closing: + update(s, &seg); + if(qlen(s->wq)+tcb->flgcnt == 0) { + tcphalt(tpriv, &tcb->rtt_timer); + tcphalt(tpriv, &tcb->acktimer); + tcphalt(tpriv, &tcb->katimer); + tcpsetstate(s, Time_wait); + tcb->timer.start = MSL2*(1000 / MSPTICK); + tcpgo(tpriv, &tcb->timer); + } + break; + case Last_ack: + update(s, &seg); + if(qlen(s->wq)+tcb->flgcnt == 0) { + localclose(s, nil); + goto raise; + } + case Time_wait: + tcb->flags |= FORCE; + if(tcb->timer.state != TcptimerON) + tcpgo(tpriv, &tcb->timer); + } + + if((seg.flags&URG) && seg.urg) { + if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) { + tcb->rcv.urg = seg.urg + seg.seq; + pullblock(&bp, seg.urg); + } + } + else + if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg)) + tcb->rcv.urg = tcb->rcv.nxt; + + if(length == 0) { + if(bp != nil) + freeblist(bp); + } + else { + switch(tcb->state){ + default: + /* Ignore segment text */ + if(bp != nil) + freeblist(bp); + break; + + case Syn_received: + case Established: + case Finwait1: + /* If we still have some data place on + * receive queue + */ + if(bp) { + bp = packblock(bp); + if(bp == nil) + panic("tcp packblock"); + qpassnolim(s->rq, bp); + bp = nil; + + /* + * Force an ack every 2 data messages. This is + * a hack for rob to make his home system run + * faster. 
+ * + * this also keeps the standard TCP congestion + * control working since it needs an ack every + * 2 max segs worth. This is not quite that, + * but under a real stream is equivalent since + * every packet has a max seg in it. + */ + if(++(tcb->rcv.una) >= 2) + tcb->flags |= FORCE; + } + tcb->rcv.nxt += length; + + /* + * update our rcv window + */ + tcprcvwin(s); + + /* + * turn on the acktimer if there's something + * to ack + */ + if(tcb->acktimer.state != TcptimerON) + tcpgo(tpriv, &tcb->acktimer); + + break; + case Finwait2: + /* no process to read the data, send a reset */ + if(bp != nil) + freeblist(bp); + sndrst(tcp, source, dest, length, &seg, version, + "send to Finwait2"); + QUNLOCK(s); + poperror(); + return; + } + } + + if(seg.flags & FIN) { + tcb->flags |= FORCE; + + switch(tcb->state) { + case Syn_received: + case Established: + tcb->rcv.nxt++; + tcpsetstate(s, Close_wait); + break; + case Finwait1: + tcb->rcv.nxt++; + if(qlen(s->wq)+tcb->flgcnt == 0) { + tcphalt(tpriv, &tcb->rtt_timer); + tcphalt(tpriv, &tcb->acktimer); + tcphalt(tpriv, &tcb->katimer); + tcpsetstate(s, Time_wait); + tcb->timer.start = MSL2*(1000/MSPTICK); + tcpgo(tpriv, &tcb->timer); + } + else + tcpsetstate(s, Closing); + break; + case Finwait2: + tcb->rcv.nxt++; + tcphalt(tpriv, &tcb->rtt_timer); + tcphalt(tpriv, &tcb->acktimer); + tcphalt(tpriv, &tcb->katimer); + tcpsetstate(s, Time_wait); + tcb->timer.start = MSL2 * (1000/MSPTICK); + tcpgo(tpriv, &tcb->timer); + break; + case Close_wait: + case Closing: + case Last_ack: + break; + case Time_wait: + tcpgo(tpriv, &tcb->timer); + break; + } + } + + /* + * get next adjacent segment from the resequence queue. 
+ * dump/trim any overlapping segments + */ + for(;;) { + if(tcb->reseq == nil) + goto output; + + if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0) + goto output; + + getreseq(tcb, &seg, &bp, &length); + + if(tcptrim(tcb, &seg, &bp, &length) == 0) + break; + } + } +output: + tcpoutput(s); + QUNLOCK(s); + poperror(); + return; +raise: + QUNLOCK(s); + poperror(); + freeblist(bp); + tcpkick(s); +} + +/* + * always enters and exits with the s locked. We drop + * the lock to ipoput the packet so some care has to be + * taken by callers. + */ +void +tcpoutput(Conv *s) +{ + Tcp seg; + int msgs; + Tcpctl *tcb; + Block *hbp, *bp; + int sndcnt, n; + ulong ssize, dsize, usable, sent; + Fs *f; + Tcppriv *tpriv; + uchar version; + + f = s->p->f; + tpriv = s->p->priv; + version = s->ipversion; + + for(msgs = 0; msgs < 100; msgs++) { + tcb = (Tcpctl*)s->ptcl; + + switch(tcb->state) { + case Listen: + case Closed: + case Finwait2: + return; + } + + /* force an ack when a window has opened up */ + if(tcb->rcv.blocked && tcb->rcv.wnd > 0){ + tcb->rcv.blocked = 0; + tcb->flags |= FORCE; + } + + sndcnt = qlen(s->wq)+tcb->flgcnt; + sent = tcb->snd.ptr - tcb->snd.una; + + /* Don't send anything else until our SYN has been acked */ + if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) + break; + + /* Compute usable segment based on offered window and limit + * window probes to one + */ + if(tcb->snd.wnd == 0){ + if(sent != 0) { + if((tcb->flags&FORCE) == 0) + break; +// tcb->snd.ptr = tcb->snd.una; + } + usable = 1; + } + else { + usable = tcb->cwind; + if(tcb->snd.wnd < usable) + usable = tcb->snd.wnd; + usable -= sent; + } + ssize = sndcnt-sent; + if(ssize && usable < 2) + netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n", + tcb->snd.wnd, tcb->cwind); + if(usable < ssize) + ssize = usable; + if(tcb->mss < ssize) + ssize = tcb->mss; + dsize = ssize; + seg.urg = 0; + + if(ssize == 0) + if((tcb->flags&FORCE) == 0) + break; + + tcb->flags &= ~FORCE; + tcprcvwin(s); + + 
/* By default we will generate an ack */ + tcphalt(tpriv, &tcb->acktimer); + tcb->rcv.una = 0; + seg.source = s->lport; + seg.dest = s->rport; + seg.flags = ACK; + seg.mss = 0; + seg.ws = 0; + switch(tcb->state){ + case Syn_sent: + seg.flags = 0; + if(tcb->snd.ptr == tcb->iss){ + seg.flags |= SYN; + dsize--; + seg.mss = tcb->mss; + seg.ws = tcb->scale; + } + break; + case Syn_received: + /* + * don't send any data with a SYN/ACK packet + * because Linux rejects the packet in its + * attempt to solve the SYN attack problem + */ + if(tcb->snd.ptr == tcb->iss){ + seg.flags |= SYN; + dsize = 0; + ssize = 1; + seg.mss = tcb->mss; + seg.ws = tcb->scale; + } + break; + } + seg.seq = tcb->snd.ptr; + seg.ack = tcb->rcv.nxt; + seg.wnd = tcb->rcv.wnd; + + /* Pull out data to send */ + bp = nil; + if(dsize != 0) { + bp = qcopy(s->wq, dsize, sent); + if(BLEN(bp) != dsize) { + seg.flags |= FIN; + dsize--; + } + } + + if(sent+dsize == sndcnt) + seg.flags |= PSH; + + /* keep track of balance of resent data */ + if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) { + n = tcb->snd.nxt - tcb->snd.ptr; + if(ssize < n) + n = ssize; + tcb->resent += n; + netlog(f, Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n", + s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt); + tpriv->stats[RetransSegs]++; + } + + tcb->snd.ptr += ssize; + + /* Pull up the send pointer so we can accept acks + * for this window + */ + if(seq_gt(tcb->snd.ptr,tcb->snd.nxt)) + tcb->snd.nxt = tcb->snd.ptr; + + /* Build header, link data and compute cksum */ + switch(version){ + case V4: + tcb->protohdr.tcp4hdr.vihl = IP_VER4; + hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb); + if(hbp == nil) { + freeblist(bp); + return; + } + break; + case V6: + tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; + hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb); + if(hbp == nil) { + freeblist(bp); + return; + } + break; + default: + hbp = nil; /* to suppress a warning */ + panic("tcpoutput: version %d", version); + } + + /* 
Start the transmission timers if there is new data and we + * expect acknowledges + */ + if(ssize != 0){ + if(tcb->timer.state != TcptimerON) + tcpgo(tpriv, &tcb->timer); + + /* If round trip timer isn't running, start it. + * measure the longest packet only in case the + * transmission time dominates RTT + */ + if(tcb->rtt_timer.state != TcptimerON) + if(ssize == tcb->mss) { + tcpgo(tpriv, &tcb->rtt_timer); + tcb->rttseq = tcb->snd.ptr; + } + } + + tpriv->stats[OutSegs]++; + + /* put off the next keep alive */ + tcpgo(tpriv, &tcb->katimer); + + switch(version){ + case V4: + if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){ + /* a negative return means no route */ + localclose(s, "no route"); + } + break; + case V6: + if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){ + /* a negative return means no route */ + localclose(s, "no route"); + } + break; + default: + panic("tcpoutput2: version %d", version); + } + if((uint)(msgs%4) == 1){ + QUNLOCK(s); + sched(); + QLOCK(s); + } + } +} + +/* + * the BSD convention (hack?) for keep alives. resend last uchar acked. 
+ */ +void +tcpsendka(Conv *s) +{ + Tcp seg; + Tcpctl *tcb; + Block *hbp,*dbp; + + tcb = (Tcpctl*)s->ptcl; + + dbp = nil; + seg.urg = 0; + seg.source = s->lport; + seg.dest = s->rport; + seg.flags = ACK|PSH; + seg.mss = 0; + seg.ws = 0; + if(tcpporthogdefense) + seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20); + else + seg.seq = tcb->snd.una-1; + seg.ack = tcb->rcv.nxt; + tcb->rcv.una = 0; + seg.wnd = tcb->rcv.wnd; + if(tcb->state == Finwait2){ + seg.flags |= FIN; + } else { + dbp = allocb(1); + dbp->wp++; + } + + if(isv4(s->raddr)) { + /* Build header, link data and compute cksum */ + tcb->protohdr.tcp4hdr.vihl = IP_VER4; + hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb); + if(hbp == nil) { + freeblist(dbp); + return; + } + ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); + } + else { + /* Build header, link data and compute cksum */ + tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; + hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb); + if(hbp == nil) { + freeblist(dbp); + return; + } + ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); + } +} + +/* + * set connection to time out after 12 minutes + */ +void +tcpsetkacounter(Tcpctl *tcb) +{ + tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK); + if(tcb->kacounter < 3) + tcb->kacounter = 3; +} + +/* + * if we've timed out, close the connection + * otherwise, send a keepalive and restart the timer + */ +void +tcpkeepalive(void *v) +{ + Tcpctl *tcb; + Conv *s; + + s = v; + tcb = (Tcpctl*)s->ptcl; + if(waserror()){ + QUNLOCK(s); + nexterror(); + } + QLOCK(s); + if(tcb->state != Closed){ + if(--(tcb->kacounter) <= 0) { + localclose(s, Etimedout); + } else { + tcpsendka(s); + tcpgo(s->p->priv, &tcb->katimer); + } + } + QUNLOCK(s); + poperror(); +} + +/* + * start keepalive timer + */ +char* +tcpstartka(Conv *s, char **f, int n) +{ + Tcpctl *tcb; + int x; + + tcb = (Tcpctl*)s->ptcl; + if(tcb->state != Established) + return "connection must be in Establised state"; + if(n > 1){ + x = atoi(f[1]); + if(x >= MSPTICK) + 
tcb->katimer.start = x/MSPTICK; + } + tcpsetkacounter(tcb); + tcpgo(s->p->priv, &tcb->katimer); + + return nil; +} + +/* + * turn checksums on/off + */ +char* +tcpsetchecksum(Conv *s, char **f, int _) +{ + Tcpctl *tcb; + + tcb = (Tcpctl*)s->ptcl; + tcb->nochecksum = !atoi(f[1]); + + return nil; +} + +void +tcprxmit(Conv *s) +{ + Tcpctl *tcb; + + tcb = (Tcpctl*)s->ptcl; + + tcb->flags |= RETRAN|FORCE; + tcb->snd.ptr = tcb->snd.una; + + /* + * We should be halving the slow start threshhold (down to one + * mss) but leaving it at mss seems to work well enough + */ + tcb->ssthresh = tcb->mss; + + /* + * pull window down to a single packet + */ + tcb->cwind = tcb->mss; + tcpoutput(s); +} + +void +tcptimeout(void *arg) +{ + Conv *s; + Tcpctl *tcb; + int maxback; + Tcppriv *tpriv; + + s = (Conv*)arg; + tpriv = s->p->priv; + tcb = (Tcpctl*)s->ptcl; + + if(waserror()){ + QUNLOCK(s); + nexterror(); + } + QLOCK(s); + switch(tcb->state){ + default: + tcb->backoff++; + if(tcb->state == Syn_sent) + maxback = MAXBACKMS/2; + else + maxback = MAXBACKMS; + tcb->backedoff += tcb->timer.start * MSPTICK; + if(tcb->backedoff >= maxback) { + localclose(s, Etimedout); + break; + } + netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW); + tcpsettimer(tcb); + tcprxmit(s); + tpriv->stats[RetransTimeouts]++; + tcb->snd.dupacks = 0; + break; + case Time_wait: + localclose(s, nil); + break; + case Closed: + break; + } + QUNLOCK(s); + poperror(); +} + +int +inwindow(Tcpctl *tcb, int seq) +{ + return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1); +} + +/* + * set up state for a received SYN (or SYN ACK) packet + */ +void +procsyn(Conv *s, Tcp *seg) +{ + Tcpctl *tcb; + + tcb = (Tcpctl*)s->ptcl; + tcb->flags |= FORCE; + + tcb->rcv.nxt = seg->seq + 1; + tcb->rcv.urg = tcb->rcv.nxt; + tcb->irs = seg->seq; + + /* our sending max segment size cannot be bigger than what he asked for */ + if(seg->mss != 0 && seg->mss < tcb->mss) + tcb->mss = 
seg->mss; + + /* the congestion window always starts out as a single segment */ + tcb->snd.wnd = seg->wnd; + tcb->cwind = tcb->mss; +} + +int +addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) +{ + Reseq *rp, *rp1; + int i, rqlen, qmax; + + rp = malloc(sizeof(Reseq)); + if(rp == nil){ + freeblist(bp); /* bp always consumed by add_reseq */ + return 0; + } + + rp->seg = *seg; + rp->bp = bp; + rp->length = length; + + /* Place on reassembly list sorting by starting seq number */ + rp1 = tcb->reseq; + if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) { + rp->next = rp1; + tcb->reseq = rp; + if(rp->next != nil) + tpriv->stats[OutOfOrder]++; + return 0; + } + + rqlen = 0; + for(i = 0;; i++) { + rqlen += rp1->length; + if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) { + rp->next = rp1->next; + rp1->next = rp; + if(rp->next != nil) + tpriv->stats[OutOfOrder]++; + break; + } + rp1 = rp1->next; + } + qmax = QMAX<<tcb->rcv.scale; + if(rqlen > qmax){ + print("resequence queue > window: %d > %d\n", rqlen, qmax); + i = 0; + for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){ + print("%#lux %#lux %#ux\n", rp1->seg.seq, + rp1->seg.ack, rp1->seg.flags); + if(i++ > 10){ + print("...\n"); + break; + } + } + + /* + * delete entire reassembly queue; wait for retransmit. + * - should we be smarter and only delete the tail? 
+ */ + for(rp = tcb->reseq; rp != nil; rp = rp1){ + rp1 = rp->next; + freeblist(rp->bp); + free(rp); + } + tcb->reseq = nil; + + return -1; + } + return 0; +} + +void +getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) +{ + Reseq *rp; + + rp = tcb->reseq; + if(rp == nil) + return; + + tcb->reseq = rp->next; + + *seg = rp->seg; + *bp = rp->bp; + *length = rp->length; + + free(rp); +} + +int +tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) +{ + ushort len; + uchar accept; + int dupcnt, excess; + + accept = 0; + len = *length; + if(seg->flags & SYN) + len++; + if(seg->flags & FIN) + len++; + + if(tcb->rcv.wnd == 0) { + if(len == 0 && seg->seq == tcb->rcv.nxt) + return 0; + } + else { + /* Some part of the segment should be in the window */ + if(inwindow(tcb,seg->seq)) + accept++; + else + if(len != 0) { + if(inwindow(tcb, seg->seq+len-1) || + seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1)) + accept++; + } + } + if(!accept) { + freeblist(*bp); + return -1; + } + dupcnt = tcb->rcv.nxt - seg->seq; + if(dupcnt > 0){ + tcb->rerecv += dupcnt; + if(seg->flags & SYN){ + seg->flags &= ~SYN; + seg->seq++; + + if(seg->urg > 1) + seg->urg--; + else + seg->flags &= ~URG; + dupcnt--; + } + if(dupcnt > 0){ + pullblock(bp, (ushort)dupcnt); + seg->seq += dupcnt; + *length -= dupcnt; + + if(seg->urg > dupcnt) + seg->urg -= dupcnt; + else { + seg->flags &= ~URG; + seg->urg = 0; + } + } + } + excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd); + if(excess > 0) { + tcb->rerecv += excess; + *length -= excess; + *bp = trimblock(*bp, 0, *length); + if(*bp == nil) + panic("presotto is a boofhead"); + seg->flags &= ~FIN; + } + return 0; +} + +void +tcpadvise(Proto *tcp, Block *bp, char *msg) +{ + Tcp4hdr *h4; + Tcp6hdr *h6; + Tcpctl *tcb; + uchar source[IPaddrlen]; + uchar dest[IPaddrlen]; + ushort psource, pdest; + Conv *s, **p; + + h4 = (Tcp4hdr*)(bp->rp); + h6 = (Tcp6hdr*)(bp->rp); + + if((h4->vihl&0xF0)==IP_VER4) { + v4tov6(dest, h4->tcpdst); + 
v4tov6(source, h4->tcpsrc); + psource = nhgets(h4->tcpsport); + pdest = nhgets(h4->tcpdport); + } + else { + ipmove(dest, h6->tcpdst); + ipmove(source, h6->tcpsrc); + psource = nhgets(h6->tcpsport); + pdest = nhgets(h6->tcpdport); + } + + /* Look for a connection */ + QLOCK(tcp); + for(p = tcp->conv; *p; p++) { + s = *p; + tcb = (Tcpctl*)s->ptcl; + if(s->rport == pdest) + if(s->lport == psource) + if(tcb->state != Closed) + if(ipcmp(s->raddr, dest) == 0) + if(ipcmp(s->laddr, source) == 0){ + QLOCK(s); + QUNLOCK(tcp); + switch(tcb->state){ + case Syn_sent: + localclose(s, msg); + break; + } + QUNLOCK(s); + freeblist(bp); + return; + } + } + QUNLOCK(tcp); + freeblist(bp); +} + +static char* +tcpporthogdefensectl(char *val) +{ + if(strcmp(val, "on") == 0) + tcpporthogdefense = 1; + else if(strcmp(val, "off") == 0) + tcpporthogdefense = 0; + else + return "unknown value for tcpporthogdefense"; + return nil; +} + +/* called with c QLOCKed */ +char* +tcpctl(Conv* c, char** f, int n) +{ + if(n == 1 && strcmp(f[0], "hangup") == 0) + return tcphangup(c); + if(n >= 1 && strcmp(f[0], "keepalive") == 0) + return tcpstartka(c, f, n); + if(n >= 1 && strcmp(f[0], "checksum") == 0) + return tcpsetchecksum(c, f, n); + if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) + return tcpporthogdefensectl(f[1]); + return "unknown control request"; +} + +int +tcpstats(Proto *tcp, char *buf, int len) +{ + Tcppriv *priv; + char *p, *e; + int i; + + priv = tcp->priv; + p = buf; + e = p+len; + for(i = 0; i < Nstats; i++) + p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]); + return p - buf; +} + +/* + * garbage collect any stale conversations: + * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack) + * - Finwait2 after 5 minutes + * + * this is called whenever we run out of channels. Both checks are + * of questionable validity so we try to use them only when we're + * up against the wall. 
+ */ +int +tcpgc(Proto *tcp) +{ + Conv *c, **pp, **ep; + int n; + Tcpctl *tcb; + + + n = 0; + ep = &tcp->conv[tcp->nc]; + for(pp = tcp->conv; pp < ep; pp++) { + c = *pp; + if(c == nil) + break; + if(!CANQLOCK(c)) + continue; + tcb = (Tcpctl*)c->ptcl; + switch(tcb->state){ + case Syn_received: + if(NOW - tcb->time > 5000){ + localclose(c, "timed out"); + n++; + } + break; + case Finwait2: + if(NOW - tcb->time > 5*60*1000){ + localclose(c, "timed out"); + n++; + } + break; + } + QUNLOCK(c); + } + return n; +} + +void +tcpsettimer(Tcpctl *tcb) +{ + int x; + + /* round trip dependency */ + x = backoff(tcb->backoff) * + (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK; + + /* bounded twixt 1/2 and 64 seconds */ + if(x < 500/MSPTICK) + x = 500/MSPTICK; + else if(x > (64000/MSPTICK)) + x = 64000/MSPTICK; + tcb->timer.start = x; +} + +void +tcpinit(Fs *fs) +{ + Proto *tcp; + Tcppriv *tpriv; + + tcp = smalloc(sizeof(Proto)); + tpriv = tcp->priv = smalloc(sizeof(Tcppriv)); + tcp->name = "tcp"; + tcp->connect = tcpconnect; + tcp->announce = tcpannounce; + tcp->ctl = tcpctl; + tcp->state = tcpstate; + tcp->create = tcpcreate; + tcp->close = tcpclose; + tcp->rcv = tcpiput; + tcp->advise = tcpadvise; + tcp->stats = tcpstats; + tcp->inuse = tcpinuse; + tcp->gc = tcpgc; + tcp->ipproto = IP_TCPPROTO; + tcp->nc = scalednconv(); + tcp->ptclsize = sizeof(Tcpctl); + tpriv->stats[MaxConn] = tcp->nc; + + Fsproto(fs, tcp); +} + +void +tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) +{ + if(rcvscale){ + tcb->rcv.scale = rcvscale & 0xff; + tcb->snd.scale = sndscale & 0xff; + tcb->window = QMAX<<tcb->snd.scale; + qsetlimit(s->rq, tcb->window); + } else { + tcb->rcv.scale = 0; + tcb->snd.scale = 0; + tcb->window = QMAX; + qsetlimit(s->rq, tcb->window); + } +} diff --git a/src/9vx/a/ip/tripmedium.c b/src/9vx/a/ip/tripmedium.c @@ -0,0 +1,398 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include 
"ip.h" +#include "trip.h" + +static void tripread(void *a); +static void tripbind(Ipifc *ifc, int argc, char **argv); +static void tripunbind(Ipifc *ifc); +static void tripbwrite(Ipifc *ifc, Block *bp, int version, uchar *ip); +static void tripaddmulti(Ipifc *ifc, uchar*, uchar*); +static void tripremmulti(Ipifc *ifc, uchar*, uchar*); +static void tripaddroute(Ipifc *ifc, int, uchar*, uchar*, uchar*, int); +static void tripremroute(Ipifc *ifc, int, uchar*, uchar*); +static void tripares(Fs*, int, uchar*, uchar*, int, int); + +Medium tripmedium = +{ +.name= "trip", +.mintu= 20, +.maxtu= 64*1024, +.maclen= LCIMACSIZE, +.bind= tripbind, +.unbind= tripunbind, +.bwrite= tripbwrite, +.addmulti= tripaddmulti, +.remmulti= tripremmulti, +.addroute= tripaddroute, +.remroute= tripremroute, +.ares= tripares, +}; + +typedef struct Tripinfo Tripinfo; +struct Tripinfo +{ + Fs* fs; /* my instance of the IP stack */ + Ipifc* ifc; /* IP interface */ + Card* dev; + Proc* readp; /* reading process */ + Chan* mchan; /* Data channel */ +}; + +/* + * called to bind an IP ifc to an ethernet device + * called with ifc qlock'd + */ +static void +tripbind(Ipifc *ifc, int argc, char **argv) +{ + int fd; + Chan *mchan; + Tripinfo *er; + + if(argc < 2) + error(Ebadarg); + + fd = kopen(argv[2], ORDWR); + if(fd < 0) + error("trip open failed"); + + mchan = fdtochan(up->env->fgrp, fd, ORDWR, 0, 1); + kclose(fd); + + if(devtab[mchan->type]->dc != 'T') { + cclose(mchan); + error(Enoport); + } + + er = smalloc(sizeof(*er)); + er->mchan = mchan; + er->ifc = ifc; + er->dev = tripsetifc(mchan, ifc); + er->fs = ifc->conv->p->f; + + ifc->arg = er; + + kproc("tripread", tripread, ifc); +} + +/* + * called with ifc qlock'd + */ +static void +tripunbind(Ipifc *ifc) +{ + Tripinfo *er = ifc->arg; +/* + if(er->readp) + postnote(er->readp, 1, "unbind", 0); +*/ + tsleep(&up->sleep, return0, 0, 300); + + if(er->mchan != nil) + cclose(er->mchan); + + free(er); +} + +/* + * called by ipoput with a single block to 
write + */ +static void +tripbwrite(Ipifc *ifc, Block *bp, int version, uchar *ip) +{ + Tripinfo *er = ifc->arg; + + /* + * Packet is rerouted at linecard + * so the gateway is ignored + */ + USED(ip); + USED(version); + + if(waserror()) { + print("tripwrite failed\n"); + return; + } + + devtab[er->mchan->type]->bwrite(er->mchan, bp, 0); + poperror(); + ifc->out++; +} + +/* + * process to read from the trip interface + */ +static void +tripread(void *a) +{ + Ipifc *ifc; + Block *bp; + Tripinfo *er; + + ifc = a; + er = ifc->arg; + er->readp = up; /* hide identity under a rock for unbind */ + + for(;;) { + bp = devtab[er->mchan->type]->bread(er->mchan, ifc->maxtu, 0); + ifc->in++; + ipiput4(er->fs, ifc, bp); + } + + pexit("hangup", 1); +} + +static void +tripaddroute(Ipifc *ifc, int v, uchar *addr, uchar *mask, uchar *gate, int t) +{ + int alen; + MTroute mtr; + Tripinfo *tinfo; + + tinfo = ifc->arg; + if(!tinfo->dev->routing) + return; + + /* + * Multicast addresses are handled on the linecard by + * the multicast port driver, so the route load is dumped. 
+ * loaded by addmulti/remmulti for SBC routes + * joinmulti/leavemulti for inter LC + */ + if(ipismulticast(addr)) + return; + + mtr.type = T_ROUTEADMIN; + if(v & Rv4) { + mtr.op = RTADD4; + alen = IPv4addrlen; + } + else { + mtr.op = RTADD6; + alen = IPaddrlen; + } + mtr.rtype = t; + memmove(mtr.addr, addr, alen); + memmove(mtr.mask, mask, alen); + memmove(mtr.gate, gate, alen); + + i2osend(tinfo->dev, &mtr, sizeof(mtr)); +} + +static void +tripremroute(Ipifc *ifc, int v, uchar *addr, uchar *mask) +{ + int alen; + MTroute mtr; + Tripinfo *tinfo; + + tinfo = ifc->arg; + if(!tinfo->dev->routing) + return; + + if(ipismulticast(addr)) + return; + + mtr.type = T_ROUTEADMIN; + if(v & Rv4) { + mtr.op = RTDEL4; + alen = IPv4addrlen; + } + else { + mtr.op = RTDEL6; + alen = IPaddrlen; + } + memmove(mtr.addr, addr, alen); + memmove(mtr.mask, mask, alen); + + i2osend(tinfo->dev, &mtr, sizeof(mtr)); +} + +static void +tripxmitroute(Route *r, Routewalk *rw) +{ + int nifc; + char t[5]; + uchar a[IPaddrlen], m[IPaddrlen], g[IPaddrlen]; + + convroute(r, a, m, g, t, &nifc); + if(!(r->type & Rv4)) { + tripaddroute(rw->state, 0, a, m, g, r->type); + return; + } + + tripaddroute(rw->state, Rv4, a+IPv4off, m+IPv4off, g+IPv4off, r->type); +} + +static void +sendifcinfo(Ipifc *dest) +{ + Conv **cp, **e; + Iplifc *l; + Ipifc *ifc; + MTifctl mtc; + Tripinfo *tinfo, *oinfo; + Proto *p; + + tinfo = dest->arg; + + /* Install interfaces */ + p = tinfo->fs->ipifc; + e = &p->conv[p->nc]; + for(cp = p->conv; cp < e; cp++) { + + if(*cp == nil) + continue; + + ifc = (Ipifc*)(*cp)->ptcl; + if(dest == ifc) + continue; + + mtc.type = T_CTLIFADMIN; + mtc.maxtu = ifc->maxtu; + mtc.mintu = ifc->mintu; + + mtc.port = 0; + if(ifc->m == &tripmedium) { + oinfo = ifc->arg; + mtc.port = oinfo->dev->bar[0].bar; + } + + for(l = ifc->lifc; l != nil; l = l->next) { + if(isv4(l->local)) { + mtc.op = IFADD4; + memmove(mtc.addr, l->local+IPv4off, IPv4addrlen); + memmove(mtc.mask, l->mask+IPv4off, IPv4addrlen); + } 
+ else { + mtc.op = IFADD6; + memmove(mtc.addr, l->local, sizeof(mtc.addr)); + memmove(mtc.mask, l->mask, sizeof(mtc.mask)); + } + + i2osend(tinfo->dev, &mtc, sizeof(mtc)); + } + } +} + +void +tripsync(Ipifc *ifc) +{ + Routewalk rw; + + if(ifc == nil) { + print("tripsync: interface not bound\n"); + return; + } + + /* Mirror the route table into the lincard */ + rw.o = 0; + rw.n = (1<<22); + rw.state = ifc; + rw.walk = tripxmitroute; + + ipwalkroutes(ifc->conv->p->f, &rw); + + /* + * Tell the linecard about interfaces that already + * exist elsewhere + */ + sendifcinfo(ifc); +} + +/* Tell a line card the SBC is interested in listening + * to a multicast address + */ +static void +tripaddmulti(Ipifc *ifc, uchar *addr, uchar *ifca) +{ + MTmultiears mt; + Tripinfo *tinfo; + + /* print("tripaddmulti %I %I\n", addr, ifca); /**/ + + tinfo = ifc->arg; + if(!tinfo->dev->routing) + return; + + mt.type = T_MULTIEAR; + mt.op = ADDMULTI; + memmove(mt.addr, addr, sizeof(mt.addr)); + memmove(mt.ifca, ifca, sizeof(mt.ifca)); + + i2osend(tinfo->dev, &mt, sizeof(mt)); +} + +/* Tell a line card the SBC is no longer interested in listening + * to a multicast address + */ +static void +tripremmulti(Ipifc *ifc, uchar *addr, uchar *ifca) +{ + MTmultiears mt; + Tripinfo *tinfo; + + tinfo = ifc->arg; + if(!tinfo->dev->routing) + return; + + mt.type = T_MULTIEAR; + mt.op = REMMULTI; + memmove(mt.addr, addr, sizeof(mt.addr)); + memmove(mt.ifca, ifca, sizeof(mt.ifca)); + + i2osend(tinfo->dev, &mt, sizeof(mt)); +} + +static void +tripares(Fs *fs, int vers, uchar *ip, uchar *mac, int l, int) +{ + Route *r; + Ipifc *ifc; + MTaresenter ta; + Tripinfo *tinfo; + uchar v6ip[IPaddrlen]; + + if(vers == V4) { + r = v4lookup(fs, ip); + v4tov6(v6ip, ip); + ip = v6ip; + } + else + r = v6lookup(fs, ip); + + if(r == nil) { + print("tripares: no route for entry\n"); + return; + } + + ifc = r->ifc; + + tinfo = ifc->arg; + if(!tinfo->dev->routing) + return; + + if(vers == V4) { + v4tov6(v6ip, ip); + ip = v6ip; 
+ } + + ta.type = T_ARESENTER; + ta.maclen = l; + memmove(ta.addr, ip, IPaddrlen); + memmove(ta.amac, mac, l); + + i2osend(tinfo->dev, &ta, sizeof(ta)); +} + +void +tripmediumlink(void) +{ + addipmedium(&tripmedium); +} diff --git a/src/9vx/a/ip/udp.c b/src/9vx/a/ip/udp.c @@ -0,0 +1,619 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#include "ip.h" +#include "ipv6.h" + + +#define DPRINT if(0)print + +enum +{ + UDP_UDPHDR_SZ = 8, + + UDP4_PHDR_OFF = 8, + UDP4_PHDR_SZ = 12, + UDP4_IPHDR_SZ = 20, + UDP6_IPHDR_SZ = 40, + UDP6_PHDR_SZ = 40, + UDP6_PHDR_OFF = 0, + + IP_UDPPROTO = 17, + UDP_USEAD7 = 52, + + Udprxms = 200, + Udptickms = 100, + Udpmaxxmit = 10, +}; + +typedef struct Udp4hdr Udp4hdr; +struct Udp4hdr +{ + /* ip header */ + uchar vihl; /* Version and header length */ + uchar tos; /* Type of service */ + uchar length[2]; /* packet length */ + uchar id[2]; /* Identification */ + uchar frag[2]; /* Fragment information */ + uchar Unused; + uchar udpproto; /* Protocol */ + uchar udpplen[2]; /* Header plus data length */ + uchar udpsrc[IPv4addrlen]; /* Ip source */ + uchar udpdst[IPv4addrlen]; /* Ip destination */ + + /* udp header */ + uchar udpsport[2]; /* Source port */ + uchar udpdport[2]; /* Destination port */ + uchar udplen[2]; /* data length */ + uchar udpcksum[2]; /* Checksum */ +}; + +typedef struct Udp6hdr Udp6hdr; +struct Udp6hdr { + uchar viclfl[4]; + uchar len[2]; + uchar nextheader; + uchar hoplimit; + uchar udpsrc[IPaddrlen]; + uchar udpdst[IPaddrlen]; + + /* udp header */ + uchar udpsport[2]; /* Source port */ + uchar udpdport[2]; /* Destination port */ + uchar udplen[2]; /* data length */ + uchar udpcksum[2]; /* Checksum */ +}; + +/* MIB II counters */ +typedef struct Udpstats Udpstats; +struct Udpstats +{ + ulong udpInDatagrams; + ulong udpNoPorts; + ulong udpInErrors; + ulong udpOutDatagrams; +}; + +typedef struct Udppriv Udppriv; +struct Udppriv +{ + Ipht ht; + + /* MIB counters 
*/ + Udpstats ustats; + + /* non-MIB stats */ + ulong csumerr; /* checksum errors */ + ulong lenerr; /* short packet */ +}; + +void (*etherprofiler)(char *name, int qlen); +void udpkick(void *x, Block *bp); + +/* + * protocol specific part of Conv + */ +typedef struct Udpcb Udpcb; +struct Udpcb +{ + QLock qlock; + uchar headers; +}; + +static char* +udpconnect(Conv *c, char **argv, int argc) +{ + char *e; + Udppriv *upriv; + + upriv = c->p->priv; + e = Fsstdconnect(c, argv, argc); + Fsconnected(c, e); + if(e != nil) + return e; + + iphtadd(&upriv->ht, c); + return nil; +} + + +static int +udpstate(Conv *c, char *state, int n) +{ + return snprint(state, n, "%s qin %d qout %d\n", + c->inuse ? "Open" : "Closed", + c->rq ? qlen(c->rq) : 0, + c->wq ? qlen(c->wq) : 0 + ); +} + +static char* +udpannounce(Conv *c, char** argv, int argc) +{ + char *e; + Udppriv *upriv; + + upriv = c->p->priv; + e = Fsstdannounce(c, argv, argc); + if(e != nil) + return e; + Fsconnected(c, nil); + iphtadd(&upriv->ht, c); + + return nil; +} + +static void +udpcreate(Conv *c) +{ + c->rq = qopen(128*1024, Qmsg, 0, 0); + c->wq = qbypass(udpkick, c); +} + +static void +udpclose(Conv *c) +{ + Udpcb *ucb; + Udppriv *upriv; + + upriv = c->p->priv; + iphtrem(&upriv->ht, c); + + c->state = 0; + qclose(c->rq); + qclose(c->wq); + qclose(c->eq); + ipmove(c->laddr, IPnoaddr); + ipmove(c->raddr, IPnoaddr); + c->lport = 0; + c->rport = 0; + + ucb = (Udpcb*)c->ptcl; + ucb->headers = 0; +} + +void +udpkick(void *x, Block *bp) +{ + Conv *c = x; + Udp4hdr *uh4; + Udp6hdr *uh6; + ushort rport; + uchar laddr[IPaddrlen], raddr[IPaddrlen]; + Udpcb *ucb; + int dlen, ptcllen; + Udppriv *upriv; + Fs *f; + int version; + Conv *rc; + + upriv = c->p->priv; + f = c->p->f; + + netlog(c->p->f, Logudp, "udp: kick\n"); + if(bp == nil) + return; + + ucb = (Udpcb*)c->ptcl; + switch(ucb->headers) { + case 7: + /* get user specified addresses */ + bp = pullupblock(bp, UDP_USEAD7); + if(bp == nil) + return; + ipmove(raddr, bp->rp); 
+ bp->rp += IPaddrlen; + ipmove(laddr, bp->rp); + bp->rp += IPaddrlen; + /* pick interface closest to dest */ + if(ipforme(f, laddr) != Runi) + findlocalip(f, laddr, raddr); + bp->rp += IPaddrlen; /* Ignore ifc address */ + rport = nhgets(bp->rp); + bp->rp += 2+2; /* Ignore local port */ + break; + default: + rport = 0; + break; + } + + if(ucb->headers) { + if(memcmp(laddr, v4prefix, IPv4off) == 0 + || ipcmp(laddr, IPnoaddr) == 0) + version = 4; + else + version = 6; + } else { + if( (memcmp(c->raddr, v4prefix, IPv4off) == 0 && + memcmp(c->laddr, v4prefix, IPv4off) == 0) + || ipcmp(c->raddr, IPnoaddr) == 0) + version = 4; + else + version = 6; + } + + dlen = blocklen(bp); + + /* fill in pseudo header and compute checksum */ + switch(version){ + case V4: + bp = padblock(bp, UDP4_IPHDR_SZ+UDP_UDPHDR_SZ); + if(bp == nil) + return; + + uh4 = (Udp4hdr *)(bp->rp); + ptcllen = dlen + UDP_UDPHDR_SZ; + uh4->Unused = 0; + uh4->udpproto = IP_UDPPROTO; + uh4->frag[0] = 0; + uh4->frag[1] = 0; + hnputs(uh4->udpplen, ptcllen); + if(ucb->headers) { + v6tov4(uh4->udpdst, raddr); + hnputs(uh4->udpdport, rport); + v6tov4(uh4->udpsrc, laddr); + rc = nil; + } else { + v6tov4(uh4->udpdst, c->raddr); + hnputs(uh4->udpdport, c->rport); + if(ipcmp(c->laddr, IPnoaddr) == 0) + findlocalip(f, c->laddr, c->raddr); + v6tov4(uh4->udpsrc, c->laddr); + rc = c; + } + hnputs(uh4->udpsport, c->lport); + hnputs(uh4->udplen, ptcllen); + uh4->udpcksum[0] = 0; + uh4->udpcksum[1] = 0; + hnputs(uh4->udpcksum, + ptclcsum(bp, UDP4_PHDR_OFF, dlen+UDP_UDPHDR_SZ+UDP4_PHDR_SZ)); + uh4->vihl = IP_VER4; + ipoput4(f, bp, 0, c->ttl, c->tos, rc); + break; + + case V6: + bp = padblock(bp, UDP6_IPHDR_SZ+UDP_UDPHDR_SZ); + if(bp == nil) + return; + + /* + * using the v6 ip header to create pseudo header + * first then reset it to the normal ip header + */ + uh6 = (Udp6hdr *)(bp->rp); + memset(uh6, 0, 8); + ptcllen = dlen + UDP_UDPHDR_SZ; + hnputl(uh6->viclfl, ptcllen); + uh6->hoplimit = IP_UDPPROTO; + if(ucb->headers) { 
+ ipmove(uh6->udpdst, raddr); + hnputs(uh6->udpdport, rport); + ipmove(uh6->udpsrc, laddr); + rc = nil; + } else { + ipmove(uh6->udpdst, c->raddr); + hnputs(uh6->udpdport, c->rport); + if(ipcmp(c->laddr, IPnoaddr) == 0) + findlocalip(f, c->laddr, c->raddr); + ipmove(uh6->udpsrc, c->laddr); + rc = c; + } + hnputs(uh6->udpsport, c->lport); + hnputs(uh6->udplen, ptcllen); + uh6->udpcksum[0] = 0; + uh6->udpcksum[1] = 0; + hnputs(uh6->udpcksum, + ptclcsum(bp, UDP6_PHDR_OFF, dlen+UDP_UDPHDR_SZ+UDP6_PHDR_SZ)); + memset(uh6, 0, 8); + uh6->viclfl[0] = IP_VER6; + hnputs(uh6->len, ptcllen); + uh6->nextheader = IP_UDPPROTO; + ipoput6(f, bp, 0, c->ttl, c->tos, rc); + break; + + default: + panic("udpkick: version %d", version); + } + upriv->ustats.udpOutDatagrams++; +} + +void +udpiput(Proto *udp, Ipifc *ifc, Block *bp) +{ + int len; + Udp4hdr *uh4; + Udp6hdr *uh6; + Conv *c; + Udpcb *ucb; + uchar raddr[IPaddrlen], laddr[IPaddrlen]; + ushort rport, lport; + Udppriv *upriv; + Fs *f; + int version; + int ottl, oviclfl, olen; + uchar *p; + + upriv = udp->priv; + f = udp->f; + upriv->ustats.udpInDatagrams++; + + uh4 = (Udp4hdr*)(bp->rp); + version = ((uh4->vihl&0xF0)==IP_VER6) ? 
6 : 4; + + /* Put back pseudo header for checksum + * (remember old values for icmpnoconv()) */ + switch(version) { + case V4: + ottl = uh4->Unused; + uh4->Unused = 0; + len = nhgets(uh4->udplen); + olen = nhgets(uh4->udpplen); + hnputs(uh4->udpplen, len); + + v4tov6(raddr, uh4->udpsrc); + v4tov6(laddr, uh4->udpdst); + lport = nhgets(uh4->udpdport); + rport = nhgets(uh4->udpsport); + + if(nhgets(uh4->udpcksum)) { + if(ptclcsum(bp, UDP4_PHDR_OFF, len+UDP4_PHDR_SZ)) { + upriv->ustats.udpInErrors++; + netlog(f, Logudp, "udp: checksum error %I\n", raddr); + DPRINT("udp: checksum error %I\n", raddr); + freeblist(bp); + return; + } + } + uh4->Unused = ottl; + hnputs(uh4->udpplen, olen); + break; + case V6: + uh6 = (Udp6hdr*)(bp->rp); + len = nhgets(uh6->udplen); + oviclfl = nhgetl(uh6->viclfl); + olen = nhgets(uh6->len); + ottl = uh6->hoplimit; + ipmove(raddr, uh6->udpsrc); + ipmove(laddr, uh6->udpdst); + lport = nhgets(uh6->udpdport); + rport = nhgets(uh6->udpsport); + memset(uh6, 0, 8); + hnputl(uh6->viclfl, len); + uh6->hoplimit = IP_UDPPROTO; + if(ptclcsum(bp, UDP6_PHDR_OFF, len+UDP6_PHDR_SZ)) { + upriv->ustats.udpInErrors++; + netlog(f, Logudp, "udp: checksum error %I\n", raddr); + DPRINT("udp: checksum error %I\n", raddr); + freeblist(bp); + return; + } + hnputl(uh6->viclfl, oviclfl); + hnputs(uh6->len, olen); + uh6->nextheader = IP_UDPPROTO; + uh6->hoplimit = ottl; + break; + default: + panic("udpiput: version %d", version); + return; /* to avoid a warning */ + } + + QLOCK(udp); + + c = iphtlook(&upriv->ht, raddr, rport, laddr, lport); + if(c == nil){ + /* no conversation found */ + upriv->ustats.udpNoPorts++; + QUNLOCK(udp); + netlog(f, Logudp, "udp: no conv %I!%d -> %I!%d\n", raddr, rport, + laddr, lport); + + switch(version){ + case V4: + icmpnoconv(f, bp); + break; + case V6: + icmphostunr(f, ifc, bp, Icmp6_port_unreach, 0); + break; + default: + panic("udpiput2: version %d", version); + } + + freeblist(bp); + return; + } + ucb = (Udpcb*)c->ptcl; + + 
if(c->state == Announced){ + if(ucb->headers == 0){ + /* create a new conversation */ + if(ipforme(f, laddr) != Runi) { + switch(version){ + case V4: + v4tov6(laddr, ifc->lifc->local); + break; + case V6: + ipmove(laddr, ifc->lifc->local); + break; + default: + panic("udpiput3: version %d", version); + } + } + c = Fsnewcall(c, raddr, rport, laddr, lport, version); + if(c == nil){ + QUNLOCK(udp); + freeblist(bp); + return; + } + iphtadd(&upriv->ht, c); + ucb = (Udpcb*)c->ptcl; + } + } + + QLOCK(c); + QUNLOCK(udp); + + /* + * Trim the packet down to data size + */ + len -= UDP_UDPHDR_SZ; + switch(version){ + case V4: + bp = trimblock(bp, UDP4_IPHDR_SZ+UDP_UDPHDR_SZ, len); + break; + case V6: + bp = trimblock(bp, UDP6_IPHDR_SZ+UDP_UDPHDR_SZ, len); + break; + default: + bp = nil; + panic("udpiput4: version %d", version); + } + if(bp == nil){ + QUNLOCK(c); + netlog(f, Logudp, "udp: len err %I.%d -> %I.%d\n", raddr, rport, + laddr, lport); + upriv->lenerr++; + return; + } + + netlog(f, Logudpmsg, "udp: %I.%d -> %I.%d l %d\n", raddr, rport, + laddr, lport, len); + + switch(ucb->headers){ + case 7: + /* pass the src address */ + bp = padblock(bp, UDP_USEAD7); + p = bp->rp; + ipmove(p, raddr); p += IPaddrlen; + ipmove(p, laddr); p += IPaddrlen; + ipmove(p, ifc->lifc->local); p += IPaddrlen; + hnputs(p, rport); p += 2; + hnputs(p, lport); + break; + } + + if(bp->next) + bp = concatblock(bp); + + if(qfull(c->rq)){ + QUNLOCK(c); + netlog(f, Logudp, "udp: qfull %I.%d -> %I.%d\n", raddr, rport, + laddr, lport); + freeblist(bp); + return; + } + + qpass(c->rq, bp); + QUNLOCK(c); + +} + +char* +udpctl(Conv *c, char **f, int n) +{ + Udpcb *ucb; + + ucb = (Udpcb*)c->ptcl; + if(n == 1){ + if(strcmp(f[0], "headers") == 0){ + ucb->headers = 7; /* new headers format */ + return nil; + } + } + return "unknown control request"; +} + +void +udpadvise(Proto *udp, Block *bp, char *msg) +{ + Udp4hdr *h4; + Udp6hdr *h6; + uchar source[IPaddrlen], dest[IPaddrlen]; + ushort psource, pdest; + Conv 
*s, **p; + int version; + + h4 = (Udp4hdr*)(bp->rp); + version = ((h4->vihl&0xF0)==IP_VER6) ? 6 : 4; + + switch(version) { + case V4: + v4tov6(dest, h4->udpdst); + v4tov6(source, h4->udpsrc); + psource = nhgets(h4->udpsport); + pdest = nhgets(h4->udpdport); + break; + case V6: + h6 = (Udp6hdr*)(bp->rp); + ipmove(dest, h6->udpdst); + ipmove(source, h6->udpsrc); + psource = nhgets(h6->udpsport); + pdest = nhgets(h6->udpdport); + break; + default: + panic("udpadvise: version %d", version); + return; /* to avoid a warning */ + } + + /* Look for a connection */ + QLOCK(udp); + for(p = udp->conv; *p; p++) { + s = *p; + if(s->rport == pdest) + if(s->lport == psource) + if(ipcmp(s->raddr, dest) == 0) + if(ipcmp(s->laddr, source) == 0){ + if(s->ignoreadvice) + break; + QLOCK(s); + QUNLOCK(udp); + qhangup(s->rq, msg); + qhangup(s->wq, msg); + QUNLOCK(s); + freeblist(bp); + return; + } + } + QUNLOCK(udp); + freeblist(bp); +} + +int +udpstats(Proto *udp, char *buf, int len) +{ + Udppriv *upriv; + + upriv = udp->priv; + return snprint(buf, len, "InDatagrams: %lud\nNoPorts: %lud\nInErrors: %lud\nOutDatagrams: %lud\n", + upriv->ustats.udpInDatagrams, + upriv->ustats.udpNoPorts, + upriv->ustats.udpInErrors, + upriv->ustats.udpOutDatagrams); +} + +void +udpinit(Fs *fs) +{ + Proto *udp; + + udp = smalloc(sizeof(Proto)); + udp->priv = smalloc(sizeof(Udppriv)); + udp->name = "udp"; + udp->connect = udpconnect; + udp->announce = udpannounce; + udp->ctl = udpctl; + udp->state = udpstate; + udp->create = udpcreate; + udp->close = udpclose; + udp->rcv = udpiput; + udp->advise = udpadvise; + udp->stats = udpstats; + udp->ipproto = IP_UDPPROTO; + udp->nc = Nchans; + udp->ptclsize = sizeof(Udpcb); + + Fsproto(fs, udp); +} diff --git a/src/9vx/a/kfs.h b/src/9vx/a/kfs.h @@ -0,0 +1,57 @@ +typedef struct Qid9p1 Qid9p1; +typedef struct Dentry Dentry; +typedef struct Kfsfile Kfsfile; +typedef struct Kfs Kfs; + +/* DONT TOUCH, this is the disk structure */ +struct Qid9p1 +{ + long path; + long 
version; +}; + +#define NAMELEN 28 /* size of names */ +#define NDBLOCK 6 /* number of direct blocks in Dentry */ + +/* DONT TOUCH, this is the disk structure */ +struct Dentry +{ + char name[NAMELEN]; + short uid; + short gid; + ushort mode; +/* + #define DALLOC 0x8000 + #define DDIR 0x4000 + #define DAPND 0x2000 + #define DLOCK 0x1000 + #define DREAD 0x4 + #define DWRITE 0x2 + #define DEXEC 0x1 +*/ + Qid9p1 qid; + long size; + long dblock[NDBLOCK]; + long iblock; + long diblock; + long atime; + long mtime; +}; + +struct Kfsfile +{ + Dentry _; + long off; +}; + +struct Kfs +{ + int RBUFSIZE; + int BUFSIZE; + int DIRPERBUF; + int INDPERBUF; + int INDPERBUF2; +}; + +extern int kfsinit(Fs*); + diff --git a/src/9vx/a/netif.c b/src/9vx/a/netif.c @@ -0,0 +1,761 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include "netif.h" + +static int netown(Netfile*, char*, int); +static int openfile(Netif*, int); +static char* matchtoken(char*, char*); +static char* netmulti(Netif*, Netfile*, uchar*, int); +static int parseaddr(uchar*, char*, int); + +int netifdebug; +#define dprint(...) 
if(netifdebug)print(__VA_ARGS__); else USED(netifdebug) + +/* + * set up a new network interface + */ +void +netifinit(Netif *nif, char *name, int nfile, ulong limit) +{ + strncpy(nif->name, name, KNAMELEN-1); + nif->name[KNAMELEN-1] = 0; + nif->nfile = nfile; + nif->f = xalloc(nfile*sizeof(Netfile*)); + if (nif->f == nil) + panic("netifinit: no memory"); + nif->limit = limit; +} + +#define DD(c,q,nam,n,owner,perm,dp) dprint("%lux.%llux %s\n", q.type, q.path, nam); devdir(c,q,nam,n,owner,perm,dp) + +/* + * generate a 3 level directory + */ +static int +netifgen(Chan *c, char *dummy, Dirtab *vp, int dummy1, int i, Dir *dp) +{ + Qid q; + Netif *nif = (Netif*)vp; + Netfile *f; + int t, perm; + char *o; + + memset(&q, 0, sizeof q); + q.type = QTFILE; + q.vers = 0; + + dprint("gen %d %llud %.2d ", c->dri, c->qid.path, i); + /* top level directory contains the name of the network */ + if(c->qid.path == 0){ + switch(i){ + case DEVDOTDOT: + q.path = 0; + q.type = QTDIR; + DD(c, q, ".", 0, eve, 0555, dp); + break; + case 0: + q.path = N2ndqid; + q.type = QTDIR; + strcpy(up->genbuf, nif->name); + DD(c, q, up->genbuf, 0, eve, 0555, dp); + break; + default: + dprint("-> -1 (top)\n"); + return -1; + } + return 1; + } + + /* second level contains clone plus all the conversations */ + t = NETTYPE(c->qid.path); + if(t == N2ndqid || t == Ncloneqid || t == Naddrqid || t == Nstatqid || t == Nifstatqid){ + switch(i){ + case DEVDOTDOT: + q.type = QTDIR; + q.path = 0; + DD(c, q, ".", 0, eve, DMDIR|0555, dp); + break; + case 0: + q.path = Ncloneqid; + DD(c, q, "clone", 0, eve, 0666, dp); + break; + case 1: + q.path = Naddrqid; + DD(c, q, "addr", 0, eve, 0666, dp); + break; + case 2: + q.path = Nstatqid; + DD(c, q, "stats", 0, eve, 0444, dp); + break; + case 3: + q.path = Nifstatqid; + DD(c, q, "ifstats", 0, eve, 0444, dp); + break; + default: + i -= 4; + if(i >= nif->nfile){ + dprint("-> -1 (2d): %d %d\n", i, nif->nfile); + return -1; + } + if(nif->f[i] == 0){ + dprint("nif->f[%d] -> 
0\n", i); + return 0; + } + q.type = QTDIR; + q.path = NETQID(i, N3rdqid); + sprint(up->genbuf, "%d", i); + DD(c, q, up->genbuf, 0, eve, DMDIR|0555, dp); + break; + } + return 1; + } + + /* third level */ + f = nif->f[NETID(c->qid.path)]; + if(f == 0){ + dprint("->f 0\n"); + return -1; + } + if(*f->owner){ + o = f->owner; + perm = f->mode; + } else { + o = eve; + perm = 0666; + } + switch(i){ + case DEVDOTDOT: + q.type = QTDIR; + q.path = N2ndqid; + strcpy(up->genbuf, nif->name); + DD(c, q, up->genbuf, 0, eve, DMDIR|0555, dp); + break; + case 0: + q.path = NETQID(NETID(c->qid.path), Ndataqid); + DD(c, q, "data", 0, o, perm, dp); + break; + case 1: + q.path = NETQID(NETID(c->qid.path), Nctlqid); + DD(c, q, "ctl", 0, o, perm, dp); + break; + case 2: + q.path = NETQID(NETID(c->qid.path), Nstatqid); + DD(c, q, "stats", 0, eve, 0444, dp); + break; + case 3: + q.path = NETQID(NETID(c->qid.path), Ntypeqid); + DD(c, q, "type", 0, eve, 0444, dp); + break; + case 4: + q.path = NETQID(NETID(c->qid.path), Nifstatqid); + DD(c, q, "ifstats", 0, eve, 0444, dp); + break; + default: + dprint("-> -1 (third)\n"); + return -1; + } + return 1; +} + +static void +prwalk(Netif *nif, Chan *c, Chan *nc, char **name, int nname) +{ + char buf[512], *e, *p; + + if(netifdebug == 0) + return; + p = buf; + e = p + sizeof buf; + for(int i = 0; i < nname; i++) + p = seprint(p, e, "%s ", name[i]); + if(p > buf) + p--; + *p = 0; + print("netifwalk %lld [%s]\n", c->qid.path, buf); +} + +Walkqid* +netifwalk(Netif *nif, Chan *c, Chan *nc, char **name, int nname) +{ + prwalk(nif, c, nc, name, nname); + return devwalk(c, nc, name, nname, (Dirtab *)nif, 0, netifgen); +} + +Chan* +netifopen(Netif *nif, Chan *c, int omode) +{ + int id; + Netfile *f; + + dprint("netifopen %p %d\n", nif, c? 
c->qid.path: -1); + id = 0; + if(c->qid.type & QTDIR){ + if(omode != OREAD) + error(Eperm); + } else { + switch(NETTYPE(c->qid.path)){ + case Ndataqid: + case Nctlqid: + id = NETID(c->qid.path); + openfile(nif, id); + break; + case Ncloneqid: + id = openfile(nif, -1); + c->qid.path = NETQID(id, Nctlqid); + break; + default: + if(omode != OREAD) + error(Ebadarg); + } + switch(NETTYPE(c->qid.path)){ + case Ndataqid: + case Nctlqid: + f = nif->f[id]; + if(netown(f, up->user, omode&7) < 0) + error(Eperm); + break; + } + } + c->mode = openmode(omode); + c->flag |= COPEN; + c->offset = 0; + c->iounit = qiomaxatomic; + return c; +} + +long +netifread(Netif *nif, Chan *c, void *a, long n, ulong offset) +{ + int i, j; + Netfile *f; + char *p; + + dprint("netifread %lud %lud\n", c->qid.path, NETTYPE(c->qid.path)); + if(c->qid.type&QTDIR) + return devdirread(c, a, n, (Dirtab*)nif, 0, netifgen); + + switch(NETTYPE(c->qid.path)){ + case Ndataqid: + f = nif->f[NETID(c->qid.path)]; + return qread(f->in, a, n); + case Nctlqid: + return readnum(offset, a, n, NETID(c->qid.path), NUMSIZE); + case Nstatqid: + dprint("netstatqid\n"); + p = smalloc(READSTR); + j = snprint(p, READSTR, "in: %llud\n", nif->inpackets); + j += snprint(p+j, READSTR-j, "link: %d\n", nif->link); + j += snprint(p+j, READSTR-j, "out: %llud\n", nif->outpackets); + j += snprint(p+j, READSTR-j, "crc errs: %d\n", nif->crcs); + j += snprint(p+j, READSTR-j, "overflows: %d\n", nif->overflows); + j += snprint(p+j, READSTR-j, "soft overflows: %d\n", nif->soverflows); + j += snprint(p+j, READSTR-j, "framing errs: %d\n", nif->frames); + j += snprint(p+j, READSTR-j, "buffer errs: %d\n", nif->buffs); + j += snprint(p+j, READSTR-j, "output errs: %d\n", nif->oerrs); + j += snprint(p+j, READSTR-j, "prom: %d\n", nif->prom); + j += snprint(p+j, READSTR-j, "mbps: %d\n", nif->mbps); + j += snprint(p+j, READSTR-j, "addr: "); + for(i = 0; i < nif->alen; i++) + j += snprint(p+j, READSTR-j, "%2.2ux", nif->addr[i]); + snprint(p+j, 
READSTR-j, "\n"); + n = readstr(offset, a, n, p); + free(p); + return n; + case Naddrqid: + p = malloc(READSTR); + j = 0; + for(i = 0; i < nif->alen; i++) + j += snprint(p+j, READSTR-j, "%2.2ux", nif->addr[i]); + n = readstr(offset, a, n, p); + free(p); + return n; + case Ntypeqid: + f = nif->f[NETID(c->qid.path)]; + return readnum(offset, a, n, f->type, NUMSIZE); + case Nifstatqid: + return 0; + } + error(Ebadarg); + return -1; /* not reached */ +} + +Block* +netifbread(Netif *nif, Chan *c, long n, ulong offset) +{ + if((c->qid.type & QTDIR) || NETTYPE(c->qid.path) != Ndataqid) + return devbread(c, n, offset); + + return qbread(nif->f[NETID(c->qid.path)]->in, n); +} + +/* + * make sure this type isn't already in use on this device + */ +static int +typeinuse(Netif *nif, int type) +{ + Netfile *f, **fp, **efp; + + if(type <= 0) + return 0; + + efp = &nif->f[nif->nfile]; + for(fp = nif->f; fp < efp; fp++){ + f = *fp; + if(f == 0) + continue; + if(f->type == type) + return 1; + } + return 0; +} + +/* + * the devxxx.c that calls us handles writing data, it knows best + */ +long +netifwrite(Netif *nif, Chan *c, void *a, long n) +{ + Netfile *f; + int type; + char *p, buf[64]; + uchar binaddr[Nmaxaddr]; + + if(NETTYPE(c->qid.path) != Nctlqid) + error(Eperm); + + if(n >= sizeof(buf)) + n = sizeof(buf)-1; + memmove(buf, a, n); + buf[n] = 0; + + if(waserror()){ + QUNLOCK(nif); + nexterror(); + } + + QLOCK(nif); + f = nif->f[NETID(c->qid.path)]; + if((p = matchtoken(buf, "connect")) != 0){ + type = atoi(p); + if(typeinuse(nif, type)) + error(Einuse); + f->type = type; + if(f->type < 0) + nif->all++; + } else if(matchtoken(buf, "promiscuous")){ + if(f->prom == 0){ + if(nif->prom == 0 && nif->promiscuous != nil) + nif->promiscuous(nif->arg, 1); + f->prom = 1; + nif->prom++; + } + } else if((p = matchtoken(buf, "scanbs")) != 0){ + /* scan for base stations */ + if(f->scan == 0){ + type = atoi(p); + if(type < 5) + type = 5; + if(nif->scanbs != nil) + nif->scanbs(nif->arg, 
type); + f->scan = type; + nif->scan++; + } + } else if(matchtoken(buf, "bridge")){ + f->bridge = 1; + } else if(matchtoken(buf, "headersonly")){ + f->headersonly = 1; + } else if((p = matchtoken(buf, "addmulti")) != 0){ + if(parseaddr(binaddr, p, nif->alen) < 0) + error("bad address"); + p = netmulti(nif, f, binaddr, 1); + if(p) + error(p); + } else if((p = matchtoken(buf, "remmulti")) != 0){ + if(parseaddr(binaddr, p, nif->alen) < 0) + error("bad address"); + p = netmulti(nif, f, binaddr, 0); + if(p) + error(p); + } else + n = -1; + QUNLOCK(nif); + poperror(); + return n; +} + +int +netifwstat(Netif *nif, Chan *c, uchar *db, int n) +{ + Dir *dir; + Netfile *f; + int m; + + f = nif->f[NETID(c->qid.path)]; + if(f == 0) + error(Enonexist); + + if(netown(f, up->user, OWRITE) < 0) + error(Eperm); + + dir = smalloc(sizeof(Dir)+n); + m = convM2D(db, n, &dir[0], (char*)&dir[1]); + if(m == 0){ + free(dir); + error(Eshortstat); + } + if(!emptystr(dir[0].uid)) + strncpy(f->owner, dir[0].uid, KNAMELEN); + if(dir[0].mode != ~0UL) + f->mode = dir[0].mode; + free(dir); + return m; +} + +int +netifstat(Netif *nif, Chan *c, uchar *db, int n) +{ + dprint("netifstat %s nfile %d %lld type=%d\n", nif->name, nif->nfile, c->qid.path, c->type); + return devstat(c, db, n, (Dirtab *)nif, 0, netifgen); +} + +void +netifclose(Netif *nif, Chan *c) +{ + Netfile *f; + int t; + Netaddr *ap; + + if((c->flag & COPEN) == 0) + return; + + t = NETTYPE(c->qid.path); + if(t != Ndataqid && t != Nctlqid) + return; + + f = nif->f[NETID(c->qid.path)]; + QLOCK(f); + if(--(f->inuse) == 0){ + if(f->prom){ + QLOCK(nif); + if(--(nif->prom) == 0 && nif->promiscuous != nil) + nif->promiscuous(nif->arg, 0); + QUNLOCK(nif); + f->prom = 0; + } + if(f->scan){ + QLOCK(nif); + if(--(nif->scan) == 0 && nif->scanbs != nil) + nif->scanbs(nif->arg, 0); + QUNLOCK(nif); + f->prom = 0; + f->scan = 0; + } + if(f->nmaddr){ + QLOCK(nif); + t = 0; + for(ap = nif->maddr; ap; ap = ap->next){ + if(f->maddr[t/8] & (1<<(t%8))) + 
netmulti(nif, f, ap->addr, 0); + } + QUNLOCK(nif); + f->nmaddr = 0; + } + if(f->type < 0){ + QLOCK(nif); + --(nif->all); + QUNLOCK(nif); + } + f->owner[0] = 0; +print("drop type %.4ux\n", f->type); + f->type = 0; + f->bridge = 0; + f->headersonly = 0; + qclose(f->in); + } + QUNLOCK(f); +} + +Lock netlock; + +static int +netown(Netfile *p, char *o, int omode) +{ + static int access[] = { 0400, 0200, 0600, 0100 }; + int mode; + int t; + + lock(&netlock); + if(*p->owner){ + if(strncmp(o, p->owner, KNAMELEN) == 0) /* User */ + mode = p->mode; + else if(strncmp(o, eve, KNAMELEN) == 0) /* Bootes is group */ + mode = p->mode<<3; + else + mode = p->mode<<6; /* Other */ + + t = access[omode&3]; + if((t & mode) == t){ + unlock(&netlock); + return 0; + } else { + unlock(&netlock); + return -1; + } + } + strncpy(p->owner, o, KNAMELEN); + p->mode = 0660; + unlock(&netlock); + return 0; +} + +/* + * Increment the reference count of a network device. + * If id < 0, return an unused ether device. + */ +static int +openfile(Netif *nif, int id) +{ + Netfile *f, **fp, **efp; + + if(id >= 0){ + f = nif->f[id]; + if(f == 0) + error(Enodev); + QLOCK(f); + qreopen(f->in); + f->inuse++; + QUNLOCK(f); + return id; + } + + QLOCK(nif); + if(waserror()){ + QUNLOCK(nif); + nexterror(); + } + efp = &nif->f[nif->nfile]; + for(fp = nif->f; fp < efp; fp++){ + f = *fp; + if(f == 0){ + f = malloc(sizeof(Netfile)); + if(f == 0) + exhausted("memory"); + f->in = qopen(nif->limit, Qmsg, 0, 0); + if(f->in == nil){ + free(f); + exhausted("memory"); + } + *fp = f; + QLOCK(f); + } else { + QLOCK(f); + if(f->inuse){ + QUNLOCK(f); + continue; + } + } + f->inuse = 1; + qreopen(f->in); + netown(f, up->user, 0); + QUNLOCK(f); + QUNLOCK(nif); + poperror(); + return fp - nif->f; + } + error(Enodev); + return -1; /* not reached */ +} + +/* + * look for a token starting a string, + * return a pointer to first non-space char after it + */ +static char* +matchtoken(char *p, char *token) +{ + int n; + + n = 
strlen(token); + if(strncmp(p, token, n)) + return 0; + p += n; + if(*p == 0) + return p; + if(*p != ' ' && *p != '\t' && *p != '\n') + return 0; + while(*p == ' ' || *p == '\t' || *p == '\n') + p++; + return p; +} + +void +hnputv(void *p, uvlong v) +{ + uchar *a; + + a = p; + hnputl(a, v>>32); + hnputl(a+4, v); +} + +void +hnputl(void *p, uint v) +{ + uchar *a; + + a = p; + a[0] = v>>24; + a[1] = v>>16; + a[2] = v>>8; + a[3] = v; +} + +void +hnputs(void *p, ushort v) +{ + uchar *a; + + a = p; + a[0] = v>>8; + a[1] = v; +} + +/* read a big-endian 64-bit value from p */ +uvlong +nhgetv(void *p) +{ + uchar *a; + + a = p; + /* shift as unsigned so the top bit never lands in a sign bit */ + return ((uvlong)nhgetl(a) << 32) | nhgetl(a+4); +} + +/* read a big-endian 32-bit value from p */ +uint +nhgetl(void *p) +{ + uchar *a; + + a = p; + /* a[0] promotes to int; cast before <<24 to avoid signed overflow */ + return ((uint)a[0]<<24)|(a[1]<<16)|(a[2]<<8)|(a[3]<<0); +} + +ushort +nhgets(void *p) +{ + uchar *a; + + a = p; + return (a[0]<<8)|(a[1]<<0); +} + +static ulong +hash(uchar *a, int len) +{ + ulong sum = 0; + + while(len-- > 0) + sum = (sum << 1) + *a++; + return sum%Nmhash; +} + +int +activemulti(Netif *nif, uchar *addr, int alen) +{ + Netaddr *hp; + + for(hp = nif->mhash[hash(addr, alen)]; hp; hp = hp->hnext) + if(memcmp(addr, hp->addr, alen) == 0){ + if(hp->ref) + return 1; + else + break; + } + return 0; +} + +static int +parseaddr(uchar *to, char *from, int alen) +{ + char nip[4]; + char *p; + int i; + + p = from; + for(i = 0; i < alen; i++){ + if(*p == 0) + return -1; + nip[0] = *p++; + if(*p == 0) + return -1; + nip[1] = *p++; + nip[2] = 0; + to[i] = strtoul(nip, 0, 16); + if(*p == ':') + p++; + } + return 0; +} + +/* + * keep track of multicast addresses + */ +static char* +netmulti(Netif *nif, Netfile *f, uchar *addr, int add) +{ + Netaddr **l, *ap; + int i; + ulong h; + + if(nif->multicast == nil) + return "interface does not support multicast"; + + l = &nif->maddr; + i = 0; + for(ap = *l; ap; ap = *l){ + if(memcmp(addr, ap->addr, nif->alen) == 0) + break; + i++; + l = &ap->next; + } + + if(add){ + if(ap == 0){ + *l = ap = smalloc(sizeof(*ap)); + memmove(ap->addr, addr, nif->alen); + 
ap->next = 0; + ap->ref = 1; + h = hash(addr, nif->alen); + ap->hnext = nif->mhash[h]; + nif->mhash[h] = ap; + } else { + ap->ref++; + } + if(ap->ref == 1){ + nif->nmaddr++; + nif->multicast(nif->arg, addr, 1); + } + if(i < 8*sizeof(f->maddr)){ + if((f->maddr[i/8] & (1<<(i%8))) == 0) + f->nmaddr++; + f->maddr[i/8] |= 1<<(i%8); + } + } else { + if(ap == 0 || ap->ref == 0) + return 0; + ap->ref--; + if(ap->ref == 0){ + nif->nmaddr--; + nif->multicast(nif->arg, addr, 0); + } + if(i < 8*sizeof(f->maddr)){ + if((f->maddr[i/8] & (1<<(i%8))) != 0) + f->nmaddr--; + f->maddr[i/8] &= ~(1<<(i%8)); + } + } + return 0; +} diff --git a/src/9vx/a/netif.h b/src/9vx/a/netif.h @@ -31,7 +31,7 @@ enum */ struct Netfile { - QLock lk; + QLock qlock; int inuse; ulong mode; @@ -64,7 +64,7 @@ struct Netaddr */ struct Netif { - QLock lk; + QLock qlock; /* multiplexing */ char name[KNAMELEN]; /* for top level directory */ @@ -87,8 +87,8 @@ struct Netif /* statistics */ int misses; - int inpackets; - int outpackets; + uvlong inpackets; + uvlong outpackets; int crcs; /* input crc errors */ int oerrs; /* output errors */ int frames; /* framing errors */ diff --git a/src/9vx/a/part.c b/src/9vx/a/part.c @@ -0,0 +1,341 @@ +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +#include "sd.h" +#include "fs.h" + +enum { + Npart = 32 +}; + +uchar *mbrbuf, *partbuf; +int nbuf; +#define trace 0 + +int +tsdbio(SDunit *unit, SDpart *part, void *a, vlong off, int mbr) +{ + uchar *b; + + if(unit->dev->ifc->bio(unit, 0, 0, a, 1, (off/unit->secsize) + part->start) != unit->secsize){ + if(trace) + print("%s: read %lud at %lld failed\n", unit->dev->name, + unit->secsize, (vlong)part->start*unit->secsize+off); + return -1; + } + b = a; + if(mbr && (b[0x1FE] != 0x55 || b[0x1FF] != 0xAA)){ + if(trace) + print("%s: bad magic %.2ux %.2ux at %lld\n", + unit->dev->name, b[0x1FE], b[0x1FF], + (vlong)part->start*unit->secsize+off); + return -1; + } + return 0; +} + +/* + * read 
partition table. The partition table is just ascii strings. + */ +#define MAGIC "plan9 partitions" +static void +oldp9part(SDunit *unit) +{ + SDpart *pp; + char *field[3], *line[Npart+1]; + ulong n, start, end; + int i; + + /* + * We have some partitions already. + */ + pp = &unit->part[unit->npart]; + + /* + * We prefer partition tables on the second to last sector, + * but some old disks use the last sector instead. + */ + pp->start = unit->sectors - 2; + pp->end = unit->sectors - 1; + + if(tsdbio(unit, pp, partbuf, 0, 0) < 0) + return; + + if(strncmp((char*)partbuf, MAGIC, sizeof(MAGIC)-1) != 0) { + /* not found on 2nd last sector; look on last sector */ + pp->start++; + pp->end++; + if(tsdbio(unit, pp, partbuf, 0, 0) < 0) + return; + if(strncmp((char*)partbuf, MAGIC, sizeof(MAGIC)-1) != 0) + return; + print("%s: using old plan9 partition table on last sector\n", unit->dev->name); + }else + print("%s: using old plan9 partition table on 2nd-to-last sector\n", unit->dev->name); + + /* we found a partition table, so add a partition partition */ + unit->npart++; + partbuf[unit->secsize-1] = '\0'; + + /* + * parse partition table + */ + n = getfields((char*)partbuf, line, Npart+1, 0, "\n"); + if(n && strncmp(line[0], MAGIC, sizeof(MAGIC)-1) == 0){ + for(i = 1; i < n && unit->npart < SDnpart; i++){ + if(getfields(line[i], field, 3, 0, " ") != 3) + break; + start = strtoull(field[1], 0, 0); + end = strtoull(field[2], 0, 0); + if(start >= end || end > unit->sectors) + break; + sdaddpart(unit, field[0], start, end); + } + } +} + +static void +p9part(SDunit *unit, char *name) +{ + SDpart *p; + char *field[4], *line[Npart+1]; + uvlong start, end; + int i, n; + + p = sdfindpart(unit, name); + if(p == nil) + return; + + if(tsdbio(unit, p, partbuf, unit->secsize, 0) < 0) + return; + partbuf[unit->secsize-1] = '\0'; + + if(strncmp((char*)partbuf, "part ", 5) != 0) + return; + + n = getfields((char*)partbuf, line, Npart+1, 0, "\n"); + if(n == 0) + return; + for(i = 0; i < n /* 
&& unit->npart < SDnpart */; i++){ + if(strncmp(line[i], "part ", 5) != 0) + break; + if(getfields(line[i], field, 4, 0, " ") != 4) + break; + start = strtoull(field[2], 0, 0); + end = strtoull(field[3], 0, 0); + if(start >= end || end > unit->sectors) + break; + sdaddpart(unit, field[1], p->start+start, p->start+end); + } +} + +int +isdos(int t) +{ + return t==FAT12 || t==FAT16 || t==FATHUGE || t==FAT32 || t==FAT32X; +} + +int +isextend(int t) +{ + return t==EXTEND || t==EXTHUGE || t==LEXTEND; +} + +/* + * Fetch the first dos and all plan9 partitions out of the MBR partition table. + * We return -1 if we did not find a plan9 partition. + */ +static int +mbrpart(SDunit *unit) +{ + Dospart *dp; + ulong taboffset, start, end; + ulong firstxpart, nxtxpart; + int havedos, i, nplan9; + char name[10]; + + taboffset = 0; + dp = (Dospart*)&mbrbuf[0x1BE]; + if(1) { + /* get the MBR (allowing for DMDDO) */ + if(tsdbio(unit, &unit->part[0], mbrbuf, (vlong)taboffset*unit->secsize, 1) < 0) + return -1; + for(i=0; i<4; i++) + if(dp[i].type == DMDDO) { + if(trace) + print("DMDDO partition found\n"); + taboffset = 63; + if(tsdbio(unit, &unit->part[0], mbrbuf, (vlong)taboffset*unit->secsize, 1) < 0) + return -1; + i = -1; /* start over */ + } + } + + /* + * Read the partitions, first from the MBR and then + * from successive extended partition tables. 
+ */ + nplan9 = 0; + havedos = 0; + firstxpart = 0; + for(;;) { + if(tsdbio(unit, &unit->part[0], mbrbuf, (vlong)taboffset*unit->secsize, 1) < 0) + return -1; + if(trace) { + if(firstxpart) + print("%s ext %lud ", unit->dev->name, taboffset); + else + print("%s mbr ", unit->dev->name); + } + nxtxpart = 0; + for(i=0; i<4; i++) { + if(trace) + print("dp %d...", dp[i].type); + start = taboffset+GLONG(dp[i].start); + end = start+GLONG(dp[i].len); + + if(dp[i].type == PLAN9) { + if(nplan9 == 0) + strcpy(name, "plan9"); + else + sprint(name, "plan9.%d", nplan9); + sdaddpart(unit, name, start, end); + p9part(unit, name); + nplan9++; + } + + /* + * We used to take the active partition (and then the first + * when none are active). We have to take the first here, + * so that the partition we call ``dos'' agrees with the + * partition disk/fdisk calls ``dos''. + */ + if(havedos==0 && isdos(dp[i].type)){ + havedos = 1; + sdaddpart(unit, "dos", start, end); + } + + /* nxtxpart is relative to firstxpart (or 0), not taboffset */ + if(isextend(dp[i].type)){ + nxtxpart = start-taboffset+firstxpart; + if(trace) + print("link %lud...", nxtxpart); + } + } + if(trace) + print("\n"); + + if(!nxtxpart) + break; + if(!firstxpart) + firstxpart = nxtxpart; + taboffset = nxtxpart; + } + return nplan9 ? 0 : -1; +} + +/* + * To facilitate booting from CDs, we create a partition for + * the boot floppy image embedded in a bootable CD. 
+ */ +static int +part9660(SDunit *unit) +{ + uchar buf[2048]; + ulong a, n; + uchar *p; + + if(unit->secsize != 2048) + return -1; + + if(unit->dev->ifc->bio(unit, 0, 0, buf, 2048/unit->secsize, (17*2048)/unit->secsize) < 0) + return -1; + + if(buf[0] || strcmp((char*)buf+1, "CD001\x01EL TORITO SPECIFICATION") != 0) + return -1; + + + p = buf+0x47; + a = p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24); + + if(unit->dev->ifc->bio(unit, 0, 0, buf, 2048/unit->secsize, (a*2048)/unit->secsize) < 0) + return -1; + + if(memcmp(buf, "\x01\x00\x00\x00", 4) != 0 + || memcmp(buf+30, "\x55\xAA", 2) != 0 + || buf[0x20] != 0x88) + return -1; + + p = buf+0x28; + a = p[0] | (p[1]<<8) | (p[2]<<16) | (p[3]<<24); + + switch(buf[0x21]){ + case 0x01: + n = 1200*1024; + break; + case 0x02: + n = 1440*1024; + break; + case 0x03: + n = 2880*1024; + break; + default: + return -1; + } + n /= 2048; + + print("found partition %s!cdboot; %lud+%lud\n", unit->dev->name, a, n); + sdaddpart(unit, "cdboot", a, a+n); + return 0; +} + +enum { + NEW = 1<<0, + OLD = 1<<1 +}; + +void +partition(SDunit *unit) +{ + int type; + char *p; + + if(unit->part == 0) + return; + + if(part9660(unit) == 0) + return; + + p = "new"; + + if(p != nil && strncmp(p, "new", 3) == 0) + type = NEW; + else if(p != nil && strncmp(p, "old", 3) == 0) + type = OLD; + else + type = NEW|OLD; + + if(nbuf < unit->secsize) { + free(mbrbuf); + free(partbuf); + mbrbuf = malloc(unit->secsize); + partbuf = malloc(unit->secsize); + if(mbrbuf==nil || partbuf==nil) { + free(mbrbuf); + free(partbuf); + partbuf = mbrbuf = nil; + nbuf = 0; + return; + } + nbuf = unit->secsize; + } + + if((type & NEW) && mbrpart(unit) >= 0){ + /* nothing to do */; + } + else if(type & OLD) + oldp9part(unit); +} diff --git a/src/9vx/a/pgrp.c b/src/9vx/a/pgrp.c @@ -180,7 +180,7 @@ dupfgrp(Fgrp *f) lock(&f->ref.lk); /* Make new fd list shorter if possible, preserving quantization */ new->nfd = f->maxfd+1; - i = new->nfd%DELTAFD; + i = (uint)new->nfd%DELTAFD; if(i != 
0) new->nfd += DELTAFD - i; new->fd = malloc(new->nfd*sizeof(Chan*)); diff --git a/src/9vx/a/portfns.h b/src/9vx/a/portfns.h @@ -32,8 +32,8 @@ void callwithureg(void(*)(Ureg*)); char* chanpath(Chan*); int canlock(Lock*); int canpage(Proc*); -int canqlock(QLock*); -int canrlock(RWlock*); +int __canqlock(QLock*); +int __canrlock(RWlock*); void chandevinit(void); void chandevreset(void); void chandevshutdown(void); @@ -166,7 +166,7 @@ void ksetenv(char*, char*, int); void kstrcpy(char*, char*, int); void kstrdup(char**, char*); long latin1(Rune*, int); -int lock(Lock*); +int __lock(Lock*); void logopen(Log*); void logclose(Log*); char* logctl(Log*, int, char**, Logflag*); @@ -277,7 +277,7 @@ void qhangup(Queue*, char*); int qisclosed(Queue*); int qiwrite(Queue*, void*, int); int qlen(Queue*); -void qlock(QLock*); +void __qlock(QLock*); Queue* qopen(int, int, void (*)(void*), void*); int qpass(Queue*, Block*); int qpassnolim(Queue*, Block*); @@ -287,7 +287,7 @@ long qread(Queue*, void*, int); Block* qremove(Queue*); void qreopen(Queue*); void qsetlimit(Queue*, int); -void qunlock(QLock*); +void __qunlock(QLock*); int qwindow(Queue*); int qwrite(Queue*, void*, int); void qnoblock(Queue*, int); @@ -305,9 +305,9 @@ void renameuser(char*, char*); void resched(char*); void resrcwait(char*); int return0(void*); -void rlock(RWlock*); +void __rlock(RWlock*); long rtctime(void); -void runlock(RWlock*); +void __runlock(RWlock*); Proc* runproc(void); void savefpregs(FPsave*); void sched(void); @@ -361,7 +361,7 @@ int uartstageoutput(Uart*); void unbreak(Proc*); void uncachepage(Page*); long unionread(Chan*, void*, long); -void unlock(Lock*); +void __unlock(Lock*); uvlong us2fastticks(uvlong); void userinit(void); ulong userpc(void); @@ -372,8 +372,8 @@ void validstat(uchar*, int); void* vmemchr(void*, int, int); Proc* wakeup(Rendez*); int walk(Chan**, char**, int, int, int*); -void wlock(RWlock*); -void wunlock(RWlock*); +void __wlock(RWlock*); +void __wunlock(RWlock*); void* 
xalloc(ulong); void* xallocz(ulong, int); void xfree(void*); diff --git a/src/9vx/a/qlock.c b/src/9vx/a/qlock.c @@ -5,6 +5,8 @@ #include "dat.h" #include "fns.h" +int tracelock = 0; + struct { ulong rlock; ulong rlockq; @@ -15,7 +17,7 @@ struct { } rwstats; void -qlock(QLock *q) +__qlock(QLock *q) { Proc *p; @@ -50,7 +52,7 @@ qlock(QLock *q) } int -canqlock(QLock *q) +__canqlock(QLock *q) { if(!canlock(&q->use)) return 0; @@ -64,7 +66,7 @@ canqlock(QLock *q) } void -qunlock(QLock *q) +__qunlock(QLock *q) { Proc *p; @@ -86,7 +88,7 @@ qunlock(QLock *q) } void -rlock(RWlock *q) +__rlock(RWlock *q) { Proc *p; @@ -115,7 +117,7 @@ rlock(RWlock *q) } void -runlock(RWlock *q) +__runlock(RWlock *q) { Proc *p; @@ -138,7 +140,7 @@ runlock(RWlock *q) } void -wlock(RWlock *q) +__wlock(RWlock *q) { Proc *p; @@ -170,7 +172,7 @@ wlock(RWlock *q) } void -wunlock(RWlock *q) +__wunlock(RWlock *q) { Proc *p; @@ -209,7 +211,7 @@ wunlock(RWlock *q) /* same as rlock but punts if there are any writers waiting */ int -canrlock(RWlock *q) +__canrlock(RWlock *q) { lock(&q->use); rwstats.rlock++; diff --git a/src/9vx/a/sd.h b/src/9vx/a/sd.h @@ -129,9 +129,14 @@ extern void sdadddevs(SDev*); extern int sdsetsense(SDreq*, int, int, int, int); extern int sdmodesense(SDreq*, uchar*, void*, int); extern int sdfakescsi(SDreq*, void*, int); +extern void sdaddpart(SDunit*, char*, uvlong, uvlong); +extern SDpart* sdfindpart(SDunit*, char*); /* sdscsi.c */ extern int scsiverify(SDunit*); extern int scsionline(SDunit*); extern long scsibio(SDunit*, int, int, void*, long, uvlong); extern SDev* scsiid(SDev*, SDifc*); + +/* part.c */ +extern void partition(SDunit*); diff --git a/src/9vx/a/sdaoe.c b/src/9vx/a/sdaoe.c @@ -0,0 +1,652 @@ +/* + * aoe sd driver, copyright © 2007 coraid + */ + +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "io.h" +#include "error.h" +#include "sd.h" +#include "netif.h" +#include "aoe.h" + +extern char Echange[]; +extern char 
Enotup[]; + +#define uprint(...) snprint(up->genbuf, sizeof up->genbuf, __VA_ARGS__); + +enum { + Nctlr = 32, + Maxpath = 128, +}; + +enum { + /* sync with ahci.h */ + Dllba = 1<<0, + Dsmart = 1<<1, + Dpower = 1<<2, + Dnop = 1<<3, + Datapi = 1<<4, + Datapi16= 1<<5, +}; + +static char *flagname[] = { + "llba", + "smart", + "power", + "nop", + "atapi", + "atapi16", +}; + +typedef struct Ctlr Ctlr; +struct Ctlr{ + QLock qlock; + + Ctlr *next; + SDunit *unit; + + char path[Maxpath]; + Chan *c; + + ulong vers; + uchar mediachange; + uchar flag; + uchar smart; + uchar smartrs; + uchar feat; + + uvlong sectors; + char serial[20+1]; + char firmware[8+1]; + char model[40+1]; + char ident[0x100]; +}; + +static Lock ctlrlock; +static Ctlr *head; +static Ctlr *tail; + +SDifc sdaoeifc; + +static void +idmove(char *p, ushort *a, int n) +{ + int i; + char *op, *e; + + op = p; + for(i = 0; i < n/2; i++){ + *p++ = a[i] >> 8; + *p++ = a[i]; + } + *p = 0; + while(p > op && *--p == ' ') + *p = 0; + e = p; + p = op; + while(*p == ' ') + p++; + memmove(op, p, n - (e - p)); +} + +static ushort +gbit16(void *a) +{ + uchar *i; + + i = a; + return i[1] << 8 | i[0]; +} + +static ulong +gbit32(void *a) +{ + ulong j; + uchar *i; + + i = a; + j = i[3] << 24; + j |= i[2] << 16; + j |= i[1] << 8; + j |= i[0]; + return j; +} + +static uvlong +gbit64(void *a) +{ + uchar *i; + + i = a; + return (uvlong)gbit32(i+4)<<32 | gbit32(i); +} + +static int +identify(Ctlr *c, ushort *id) +{ + int i; + uchar oserial[21]; + uvlong osectors, s; + + osectors = c->sectors; + memmove(oserial, c->serial, sizeof c->serial); + + c->feat &= ~(Dllba|Dpower|Dsmart|Dnop); + i = gbit16(id+83) | gbit16(id+86); + if(i & (1<<10)){ + c->feat |= Dllba; + s = gbit64(id+100); + }else + s = gbit32(id+60); + + i = gbit16(id+83); + if((i>>14) == 1) { + if(i & (1<<3)) + c->feat |= Dpower; + i = gbit16(id+82); + if(i & 1) + c->feat |= Dsmart; + if(i & (1<<14)) + c->feat |= Dnop; + } + + idmove(c->serial, id+10, 20); + 
idmove(c->firmware, id+23, 8); + idmove(c->model, id+27, 40); + + if((osectors == 0 || osectors != s) && + memcmp(oserial, c->serial, sizeof oserial) != 0){ + c->sectors = s; + c->mediachange = 1; + c->vers++; + } + return 0; +} + +/* must call with d qlocked */ +static int +aoeidentify(Ctlr *d, SDunit *u) +{ + Chan *c; + + c = nil; + if(waserror()){ + if(c) + cclose(c); + iprint("aoeidentify: %s\n", up->errstr); + nexterror(); + } + + uprint("%s/ident", d->path); + c = namec(up->genbuf, Aopen, OREAD, 0); + devtab[c->type]->read(c, d->ident, sizeof d->ident, 0); + + poperror(); + cclose(c); + + d->feat = 0; + d->smart = 0; + identify(d, (ushort*)d->ident); + + memset(u->inquiry, 0, sizeof u->inquiry); + u->inquiry[2] = 2; + u->inquiry[3] = 2; + u->inquiry[4] = sizeof u->inquiry - 4; + memmove(u->inquiry+8, d->model, 40); + + return 0; +} + +static Ctlr* +ctlrlookup(char *path) +{ + Ctlr *c; + + lock(&ctlrlock); + for(c = head; c; c = c->next) + if(strcmp(c->path, path) == 0) + break; + unlock(&ctlrlock); + return c; +} + +static Ctlr* +newctlr(char *path) +{ + Ctlr *c; + + /* race? 
*/ + if(ctlrlookup(path)) + error(Eexist); + + if((c = malloc(sizeof *c)) == nil) + return 0; + kstrcpy(c->path, path, sizeof c->path); + lock(&ctlrlock); + if(head != nil) + tail->next = c; + else + head = c; + tail = c; + unlock(&ctlrlock); + return c; +} + +static void +delctlr(Ctlr *c) +{ + Ctlr *x, *prev; + + lock(&ctlrlock); + + for(prev = 0, x = head; x; prev = x, x = c->next) + if(strcmp(c->path, x->path) == 0) + break; + if(x == 0){ + unlock(&ctlrlock); + error(Enonexist); + } + + if(prev) + prev->next = x->next; + else + head = x->next; + if(x->next == nil) + tail = prev; + unlock(&ctlrlock); + + if(x->c) + cclose(x->c); + free(x); +} + +static SDev* +aoeprobe(char *path, SDev *s) +{ + int n, i; + char *p; + Chan *c; + Ctlr *ctlr; + + if((p = strrchr(path, '/')) == 0) + error(Ebadarg); + *p = 0; + uprint("%s/ctl", path); + *p = '/'; + + c = namec(up->genbuf, Aopen, OWRITE, 0); + if(waserror()) { + cclose(c); + nexterror(); + } + n = uprint("discover %s", p+1); + devtab[c->type]->write(c, up->genbuf, n, 0); + poperror(); + cclose(c); + + for(i = 0;; i += 200){ + if(i > 8000 || waserror()) + error(Etimedout); + tsleep(&up->sleep, return0, 0, 200); + poperror(); + + uprint("%s/ident", path); + if(waserror()) + continue; + c = namec(up->genbuf, Aopen, OREAD, 0); + poperror(); + cclose(c); + + ctlr = newctlr(path); + break; + } + + if(s == nil && (s = malloc(sizeof *s)) == nil) + return nil; + s->ctlr = ctlr; + s->ifc = &sdaoeifc; + s->nunit = 1; + return s; +} + +static char *probef[32]; +static int nprobe; + +static int +pnpprobeid(char *s) +{ + int id; + + if(strlen(s) < 2) + return 0; + id = 'e'; + if(s[1] == '!') + id = s[0]; + return id; +} + +static SDev* +aoepnp(void) +{ + int i, id; + char *p; + SDev *h, *t, *s; + +// if((p = getconf("aoedev")) == 0) + if(1) + return 0; + nprobe = tokenize(p, probef, nelem(probef)); + h = t = 0; + for(i = 0; i < nprobe; i++){ + id = pnpprobeid(probef[i]); + if(id == 0) + continue; + s = malloc(sizeof *s); + if(s == 
nil) + break; + s->ctlr = 0; + s->idno = id; + s->ifc = &sdaoeifc; + s->nunit = 1; + + if(h) + t->next = s; + else + h = s; + t = s; + } + return h; +} + +static Ctlr* +pnpprobe(SDev *sd) +{ + int j; + char *p; + static int i; + + if(i > nprobe) + return 0; + p = probef[i++]; + if(strlen(p) < 2) + return 0; + if(p[1] == '!') + p += 2; + + for(j = 0;; j += 200){ + if(j > 8000){ + print("#æ: pnpprobe: %s: %s\n", probef[i-1], up->errstr); + return 0; + } + if(waserror()){ + tsleep(&up->sleep, return0, 0, 200); + continue; + } + sd = aoeprobe(p, sd); + poperror(); + break; + } + print("#æ: pnpprobe establishes %sin %dms\n", probef[i-1], j); + return sd->ctlr; +} + + +static int +aoeverify(SDunit *u) +{ + SDev *s; + Ctlr *c; + + s = u->dev; + c = s->ctlr; + if(c == nil && (s->ctlr = c = pnpprobe(s)) == nil) + return 0; + c->mediachange = 1; + return 1; +} + +static int +aoeconnect(SDunit *u, Ctlr *c) +{ + QLOCK(c); + if(waserror()){ + QUNLOCK(c); + return -1; + } + + aoeidentify(u->dev->ctlr, u); + if(c->c) + cclose(c->c); + c->c = 0; + uprint("%s/data", c->path); + c->c = namec(up->genbuf, Aopen, ORDWR, 0); + QUNLOCK(c); + poperror(); + + return 0; +} + +static int +aoeonline(SDunit *u) +{ + Ctlr *c; + int r; + + c = u->dev->ctlr; + r = 0; + + if((c->feat&Datapi) && c->mediachange){ + if(aoeconnect(u, c) == 0 && (r = scsionline(u)) > 0) + c->mediachange = 0; + return r; + } + + if(c->mediachange){ + if(aoeconnect(u, c) == -1) + return 0; + r = 2; + c->mediachange = 0; + u->sectors = c->sectors; + u->secsize = Aoesectsz; + } else + r = 1; + + return r; +} + +static int +aoerio(SDreq *r) +{ + int i, count; + uvlong lba; + char *name; + uchar *cmd; + long (*rio)(Chan*, void*, long, vlong); + Ctlr *c; + SDunit *unit; + + unit = r->unit; + c = unit->dev->ctlr; +// if(c->feat & Datapi) +// return aoeriopkt(r, d); + + cmd = r->cmd; + name = unit->perm.name; + + if(r->cmd[0] == 0x35 || r->cmd[0] == 0x91){ +// QLOCK(c); +// i = flushcache(); +// QUNLOCK(c); +// if(i == 0) +// 
return sdsetsense(r, SDok, 0, 0, 0); + return sdsetsense(r, SDcheck, 3, 0xc, 2); + } + + if((i = sdfakescsi(r, c->ident, sizeof c->ident)) != SDnostatus){ + r->status = i; + return i; + } + + switch(*cmd){ + case 0x88: + case 0x28: + rio = devtab[c->c->type]->read; + break; + case 0x8a: + case 0x2a: + rio = devtab[c->c->type]->write; + break; + default: + print("%s: bad cmd %#.2ux\n", name, cmd[0]); + r->status = SDcheck; + return SDcheck; + } + + if(r->data == nil) + return SDok; + + if(r->clen == 16){ + if(cmd[2] || cmd[3]) + return sdsetsense(r, SDcheck, 3, 0xc, 2); + lba = (uvlong)cmd[4]<<40 | (uvlong)cmd[5]<<32; + lba |= cmd[6]<<24 | cmd[7]<<16 | cmd[8]<<8 | cmd[9]; + count = cmd[10]<<24 | cmd[11]<<16 | cmd[12]<<8 | cmd[13]; + }else{ + lba = cmd[2]<<24 | cmd[3]<<16 | cmd[4]<<8 | cmd[5]; + count = cmd[7]<<8 | cmd[8]; + } + + count *= Aoesectsz; + + if(r->dlen < count) + count = r->dlen & ~0x1ff; + + if(waserror()){ + if(strcmp(up->errstr, Echange) == 0 || + strcmp(up->errstr, Enotup) == 0) + unit->sectors = 0; + nexterror(); + } + r->rlen = rio(c->c, r->data, count, Aoesectsz * lba); + poperror(); + r->status = SDok; + return SDok; +} + +static char *smarttab[] = { + "unset", + "error", + "threshold exceeded", + "normal" +}; + +static char * +pflag(char *s, char *e, uchar f) +{ + uchar i, m; + + for(i = 0; i < 8; i++){ + m = 1 << i; + if(f & m) + s = seprint(s, e, "%s ", flagname[i]); + } + return seprint(s, e, "\n"); +} + +static int +aoerctl(SDunit *u, char *p, int l) +{ + Ctlr *c; + char *e, *op; + + if((c = u->dev->ctlr) == nil) + return 0; + e = p+l; + op = p; + + p = seprint(p, e, "model\t%s\n", c->model); + p = seprint(p, e, "serial\t%s\n", c->serial); + p = seprint(p, e, "firm %s\n", c->firmware); + if(c->smartrs == 0xff) + p = seprint(p, e, "smart\tenable error\n"); + else if(c->smartrs == 0) + p = seprint(p, e, "smart\tdisabled\n"); + else + p = seprint(p, e, "smart\t%s\n", smarttab[c->smart]); + p = seprint(p, e, "flag "); + p = pflag(p, e, c->feat); 
+ p = seprint(p, e, "geometry %llud %d\n", c->sectors, Aoesectsz); + return p-op; +} + +static int +aoewctl(SDunit *d1, Cmdbuf *cmd) +{ + cmderror(cmd, Ebadarg); + return 0; +} + +static SDev* +aoeprobew(DevConf *c) +{ + char *p; + + p = strchr(c->type, '/'); + if(p == nil || strlen(p) > Maxpath - 11) + error(Ebadarg); + if(p[1] == '#') + p++; /* hack */ + if(ctlrlookup(p)) + error(Einuse); + return aoeprobe(p, 0); +} + +static void +aoeclear(SDev *s) +{ + delctlr((Ctlr *)s->ctlr); +} + +static char* +aoertopctl(SDev *s, char *p, char *e) +{ + Ctlr *c; + + c = s->ctlr; + return seprint(p, e, "%s aoe %s\n", s->name, c->path); +} + +static int +aoewtopctl(SDev *d1, Cmdbuf *cmd) +{ + switch(cmd->nf){ + default: + cmderror(cmd, Ebadarg); + } + return 0; +} + +SDifc sdaoeifc = { + "aoe", + + aoepnp, + nil, /* legacy */ + nil, /* enable */ + nil, /* disable */ + + aoeverify, + aoeonline, + aoerio, + aoerctl, + aoewctl, + + scsibio, + aoeprobew, /* probe */ + aoeclear, /* clear */ + aoertopctl, + aoewtopctl, +}; diff --git a/src/9vx/bootcode.9 b/src/9vx/bootcode.9 Binary files differ. 
diff --git a/src/9vx/devip.c b/src/9vx/devip.c @@ -883,7 +883,7 @@ cswrite(Chan *c, void *a, long n, vlong offset) return n; } -Dev ipdevtab = +Dev pipdevtab = { 'I', "ip", diff --git a/src/9vx/devtab.c b/src/9vx/devtab.c @@ -5,6 +5,7 @@ #include "fns.h" #include "error.h" +extern Dev aoedevtab; extern Dev consdevtab; extern Dev rootdevtab; extern Dev pipedevtab; @@ -24,14 +25,18 @@ extern Dev mntloopdevtab; extern Dev dupdevtab; extern Dev sddevtab; extern Dev capdevtab; +extern Dev etherdevtab; Dev *devtab[] = { &rootdevtab, /* must be first */ + &aoedevtab, &audiodevtab, + &capdevtab, &consdevtab, &drawdevtab, &dupdevtab, &envdevtab, + &etherdevtab, &fsdevtab, &ipdevtab, &mntdevtab, @@ -40,11 +45,9 @@ Dev *devtab[] = { &pipedevtab, &procdevtab, &ramdevtab, + &sddevtab, &srvdevtab, &ssldevtab, &tlsdevtab, - &sddevtab, - &capdevtab, 0 }; - diff --git a/src/9vx/etherpcap.c b/src/9vx/etherpcap.c @@ -0,0 +1,189 @@ +/* + * etherpcap - portable Virtual Ethernet driver for 9vx. + * + * Copyright (c) 2008 Devon H. O'Dell + * copyright © 2008 erik quanstrom + * copyright © 2010 Jesus Galan Lopez + * + * Released under 2-clause BSD license. 
+ */ + +#include "u.h" + +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "io.h" +#include "error.h" +#include "netif.h" +#include "etherif.h" +#include "vether.h" + +#include <pcap.h> + +static uvlong txerrs; + +extern int eafrom(char *ma, uchar ea[6]); + +typedef struct Ctlr Ctlr; +struct Ctlr { + pcap_t *pd; +}; + +static void * +veerror(char* err) +{ + iprint("ve: %s\n", err); + return nil; +} + +static pcap_t * +setup(char *dev, uchar *ea) +{ + char filter[30]; + char errbuf[PCAP_ERRBUF_SIZE]; + pcap_t *pd; + struct bpf_program prog; + bpf_u_int32 net; + bpf_u_int32 mask; + + if(sprint(filter, "ether dst %2.2ux:%2.2ux:%2.2ux:%2.2ux:%2.2ux:%2.2ux", + ea[0], ea[1], ea[2],ea[3], ea[4], ea[5]) == -1) + return veerror("cannot create pcap filter"); + + if (!dev && (dev = pcap_lookupdev(errbuf)) == nil) + return veerror("cannot find network device"); + +// if ((pd = pcap_open_live(netdev, 1514, 1, 1, errbuf)) == nil) + if ((pd = pcap_open_live(dev, 65000, 1, 1, errbuf)) == nil) + return nil; + + pcap_lookupnet(dev, &net, &mask, errbuf); + pcap_compile(pd, &prog, filter, 0, net); + + if (pcap_setfilter(pd, &prog) == -1) + return nil; + + pcap_freecode(&prog); + + return pd; +} + +static Block * +pcappkt(Ctlr *c) +{ + struct pcap_pkthdr hdr; + uchar *p; + Block *b; + + while ((p = pcap_next(c->pd, &hdr)) == nil); + + b = allocb(hdr.caplen); + memcpy(b->rp, p, hdr.caplen); + b->wp += hdr.caplen; + b->flag |= Btcpck|Budpck|Bpktck; + +/* + iprint("+++++++++++ packet %d (len %d):\n", ++fn, hdr.caplen); + int i=0; uchar* u; + static int fn=0; + + for(u=b->rp; u<b->wp; u++){ + if (i%16 == 0) iprint("%.4ux", i); + if (i%8 == 0) iprint(" "); + iprint("%2.2ux ", *u); + if (++i%16 == 0) iprint("\n"); + } + iprint("\n-------------\n"); +*/ + + return b; + +} + +static void +pcaprecvkproc(void *v) +{ + Ether *e; + Block *b; + + e = v; + while ((b = pcappkt(e->ctlr))) + if (b != nil) + etheriq(e, b, 1); +} + +static void +pcaptransmit(Ether* e) +{ + 
const u_char *u; + Block *b; + Ctlr *c; + + c = e->ctlr; + while ((b = qget(e->oq)) != nil) { + int wlen; + + u = (const u_char*)b->rp; + + wlen = pcap_inject(c->pd, u, BLEN(b)); + // iprint("injected packet len %d\n", wlen); + if (wlen == -1) + txerrs++; + + freeb(b); + } +} + +static long +pcapifstat(Ether *e, void *a, long n, ulong offset) +{ + char buf[128]; + + snprint(buf, sizeof buf, "txerrors: %lud\n", txerrs); + return readstr(offset, a, n, buf); +} + +static void +pcapattach(Ether* e) +{ + kproc("pcaprecv", pcaprecvkproc, e); +} + +static int +pcappnp(Ether* e) +{ + Ctlr c; + static int cve = 0; + + while(cve < nve && ve[cve].tap == 1) + cve++; + if(cve >= nve) + return -1; + + memset(&c, 0, sizeof(c)); + c.pd = setup(ve[cve].dev, ve[cve].ea); + if (c.pd == nil) { + iprint("ve: pcap failed to initialize\n"); + cve++; + return -1; + } + e->ctlr = malloc(sizeof(c)); + memcpy(e->ctlr, &c, sizeof(c)); + e->tbdf = BUSUNKNOWN; + memcpy(e->ea, ve[cve].ea, Eaddrlen); + e->attach = pcapattach; + e->transmit = pcaptransmit; + e->ifstat = pcapifstat; + e->ni.arg = e; + e->ni.link = 1; + cve++; + return 0; +} + +void +etherpcaplink(void) +{ + addethercard("pcap", pcappnp); +} diff --git a/src/9vx/ethertap.c b/src/9vx/ethertap.c @@ -0,0 +1,185 @@ +/* + * ethertap: tap device ethernet driver + * copyright © 2008 erik quanstrom + * copyright © 2010 Tully Gray + * copyright © 2010 Jesus Galan Lopez + */ + +#include "u.h" +#include "lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "io.h" +#include "error.h" +#include "netif.h" +#include "etherif.h" +#include "vether.h" + +#include <net/if.h> +#include <sys/ioctl.h> + +#ifdef linux +#include <netpacket/packet.h> +#include <linux/if_tun.h> +#elif defined(__FreeBSD__) +#include <net/if_tun.h> +#endif + +typedef struct Ctlr Ctlr; +struct Ctlr { + int fd; + int txerrs; + uchar ea[Eaddrlen]; +}; + +static uchar anyea[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff,}; + +#ifdef linux +static int +opentap(char *dev) 
+{ + int fd; + char *tap0 = "tap0"; + struct ifreq ifr; + + if(dev == nil) + dev = tap0; + if((fd = open("/dev/net/tun", O_RDWR)) < 0) + return -1; + memset(&ifr, 0, sizeof ifr); + strncpy(ifr.ifr_name, dev, sizeof ifr.ifr_name); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + if(ioctl(fd, TUNSETIFF, &ifr) < 0){ + close(fd); + return -1; + } + return fd; +} +#elif defined(__FreeBSD__) +static int +opentap(char *dev) +{ + int fd; + struct stat s; + + if((fd = open("/dev/tap", O_RDWR)) < 0) + return -1; + return fd; +} +#endif + +static int +setup(char *dev) +{ + return opentap(dev); +} + +Block* +tappkt(Ctlr *c) +{ + int n; + Block *b; + + b = allocb(1514); + for(;;){ + n = read(c->fd, b->rp, BALLOC(b)); + if(n <= 0) + panic("fd %d read %d", c->fd, n); + if(memcmp(b->rp + 0, anyea, 6) == 0 + || memcmp(b->rp + 0, c->ea, 6) == 0) + break; + } + b->wp += n; + b->flag |= Btcpck|Budpck|Bpktck; + return b; +} + +static void +taprecvkproc(void *v) +{ + Block *b; + Ether *e; + + e = v; + while((b = tappkt(e->ctlr))) + etheriq(e, b, 1); + pexit("read fail", 1); +} + +static void +taptransmit(Ether* e) +{ + Block *b, *h; + Ctlr *c; + + c = e->ctlr; + while ((b = qget(e->oq)) != nil) { + if(memcmp(b->rp + 6, anyea, 6) == 0 || + memcmp(b->rp + 0, c->ea, 6) == 0){ + h = allocb(BLEN(b)); + memcpy(h->rp, b->wp, BLEN(b)); + h->wp += BLEN(b); + h->flag |= Btcpck|Budpck|Bpktck; + etheriq(e, h, 1); + } + if(write(c->fd, b->rp, BLEN(b)) == -1) + c->txerrs++; + freeb(b); + } +} + +static long +tapifstat(Ether *e, void *a, long n, ulong offset) +{ + char buf[128]; + Ctlr *c; + + c = a; + snprint(buf, sizeof buf, "txerrors: %lud\n", c->txerrs); + return readstr(offset, a, n, buf); +} + +static void +tapattach(Ether* e) +{ + kproc("taprecv", taprecvkproc, e); +} + +static int +tappnp(Ether* e) +{ + Ctlr c; + static int cve = 0; + + while(cve < nve && ve[cve].tap == 0) + cve++; + if(cve == nve) + return -1; + + memset(&c, 0, sizeof c); + c.fd = setup(ve[cve].dev); + memcpy(c.ea, ve[cve].ea, 
Eaddrlen); + if(c.fd== -1){ + iprint("ve: tap failed to initialize\n"); + cve++; + return -1; + } + e->ctlr = malloc(sizeof c); + memcpy(e->ctlr, &c, sizeof c); + e->tbdf = BUSUNKNOWN; + memcpy(e->ea, ve[cve].ea, Eaddrlen); + e->attach = tapattach; + e->transmit = taptransmit; + e->ifstat = tapifstat; + e->ni.arg = e; + e->ni.link = 1; + cve++; + return 0; +} + +void +ethertaplink(void) +{ + addethercard("tap", tappnp); +} diff --git a/src/9vx/fossil.9 b/src/9vx/fossil.9 Binary files differ. diff --git a/src/9vx/main.c b/src/9vx/main.c @@ -25,13 +25,24 @@ #include "arg.h" #include "tos.h" +#include "fs.h" + +#include "netif.h" +#include "etherif.h" +#include "vether.h" + #define Image IMAGE #include "draw.h" #include "memdraw.h" #include "cursor.h" #include "screen.h" +#define BOOTLINELEN 64 +#define BOOTARGSLEN (3584-0x200-BOOTLINELEN) +#define MAXCONF 100 + extern Dev ipdevtab; +extern Dev pipdevtab; extern Dev drawdevtab; extern Dev fsdevtab; extern Dev audiodevtab; @@ -42,8 +53,14 @@ char* argv0; char* conffile = "9vx"; Conf conf; +static char* inifile; +static char inibuf[BOOTARGSLEN]; +static char *iniline[MAXCONF]; static int bootboot; /* run /boot/boot instead of bootscript */ +static int nofork; /* do not fork at init */ static int initrc; /* run rc instead of init */ +static int nogui; /* do not start the gui */ +static int usetty; /* use tty for input/output */ static char* username; static Mach mach0; @@ -56,13 +73,19 @@ static int singlethread; static void bootinit(void); static void siginit(void); +static int readini(char *fn); +static void inifields(void (*fp)(char*, char*)); +static void iniopt(char *name, char *value); +static void inienv(char *name, char *value); + static char* getuser(void); static char* findroot(void); void usage(void) { - fprint(2, "usage: 9vx [-gt] [-r root] [-u user]\n"); + // TODO(yy): add debug and other options by ron + fprint(2, "usage: 9vx [-p file.ini] [-bfgit] [-n [tap] [netdev]] [-m macaddr] [-r root] [-u user]\n"); 
exit(1); } @@ -74,9 +97,8 @@ nop(void) int main(int argc, char **argv) { - int usetty; - int nogui; - int nofork; + int vetap; + char *vedev; char buf[1024]; /* Minimal set up to make print work. */ @@ -87,6 +109,7 @@ main(int argc, char **argv) nogui = 0; nofork = 0; usetty = 0; + nve = 0; localroot = nil; ARGBEGIN{ /* debugging options */ @@ -102,9 +125,6 @@ main(int argc, char **argv) case 'K': tracekdev++; break; - case 'F': - nofork = 1; - break; case 'M': tracemmu++; break; @@ -125,6 +145,9 @@ main(int argc, char **argv) case 'b': bootboot = 1; break; + case 'f': + nofork = 1; + break; case 'g': nogui = 1; usetty = 1; @@ -132,6 +155,26 @@ main(int argc, char **argv) case 'i': initrc = 1; break; + case 'p': + inifile = EARGF(usage()); + break; + case 'm': + setmac(EARGF(usage())); + break; + case 'n': + vetap = 0; + vedev = ARGF(); + if(vedev != nil && strcmp(vedev, "tap") == 0){ + vetap = 1; + vedev = ARGF(); + } + if(vedev != nil && vedev[0] == '-'){ + vedev = nil; + argc++; + argv--; + } + addve(vedev, vetap); + break; case 'r': localroot = EARGF(usage()); break; @@ -148,6 +191,13 @@ main(int argc, char **argv) if(argc != 0) usage(); + if(inifile){ + if(readini(inifile) != 0) + panic("error reading config file %s", inifile); + conffile=inifile; + inifields(&iniopt); + } + if(!bootboot){ if(localroot == nil && (localroot = findroot()) == nil) panic("cannot find plan 9 root; use -r"); @@ -188,14 +238,34 @@ main(int argc, char **argv) /* * Debugging: tell user what options we guessed. */ - print("9vx %s-r %s -u %s\n", usetty ? "-t " : "", localroot, username); + print("9vx "); + if(inifile) + print("-p %s ", inifile); + if(bootboot | nofork | nogui | initrc | usetty) + print("-%s%s%s%s%s ", bootboot ? "b" : "", nofork ? "f " : "", + nogui ? "g" : "", initrc ? "i " : "", usetty ? "t " : ""); + for(int i=0; i<nve; i++){ + print("-n %s", ve[i].tap ? 
"tap ": ""); + if(ve[i].dev != nil) + print("%s ", ve[i].dev); + if(ve[i].mac != nil) + print("-m %s ", ve[i].mac); + } + print("-r %s -u %s\n", localroot, username); + + if(nve == 0) + ipdevtab = pipdevtab; printinit(); procinit0(); initseg(); + if(nve > 0) + links(); + chandevreset(); if(!singlethread){ - makekprocdev(&ipdevtab); + if(nve == 0) + makekprocdev(&ipdevtab); makekprocdev(&fsdevtab); makekprocdev(&drawdevtab); makekprocdev(&audiodevtab); @@ -218,6 +288,144 @@ main(int argc, char **argv) } /* + * read configuration file + */ +int +readini(char *fn) +{ + int blankline, incomment, inspace, n, fd; + char *cp, *p, *q; + + if(strcmp(fn, "-") == 0) + fd = stdin; + else if((fd = open(fn, OREAD)) < 0) + return -1; + + cp = inibuf; + *cp = 0; + n = read(fd, cp, BOOTARGSLEN-1); + close(fd); + if(n <= 0) + return -1; + + cp[n] = 0; + + /* + * Strip out '\r', change '\t' -> ' '. + * Change runs of spaces into single spaces. + * Strip out trailing spaces, blank lines. + * + * We do this before we make the copy so that if we + * need to change the copy, it is already fairly clean. + * The main need is in the case when plan9.ini has been + * padded with lots of trailing spaces, as is the case + * for those created during a distribution install. 
+ */ + p = cp; + blankline = 1; + incomment = inspace = 0; + for(q = cp; *q; q++){ + if(*q == '\r') + continue; + if(*q == '\t') + *q = ' '; + if(*q == ' '){ + inspace = 1; + continue; + } + if(*q == '\n'){ + if(!blankline){ + if(!incomment) + *p++ = '\n'; + blankline = 1; + } + incomment = inspace = 0; + continue; + } + if(inspace){ + if(!blankline && !incomment) + *p++ = ' '; + inspace = 0; + } + if(blankline && *q == '#') + incomment = 1; + blankline = 0; + if(!incomment) + *p++ = *q; + } + if(p > cp && p[-1] != '\n') + *p++ = '\n'; + *p++ = 0; + + getfields(cp, iniline, MAXCONF, 0, "\n"); + + return 0; +} + +void +inifields(void (*fp)(char*, char*)) +{ + int i; + char *cp; + + for(i = 0; i < MAXCONF; i++){ + if(!iniline[i]) + break; + cp = strchr(iniline[i], '='); + if(cp == 0) + continue; + *cp++ = 0; + if(cp - iniline[i] >= NAMELEN+1) + *(iniline[i]+NAMELEN-1) = 0; + (fp)(iniline[i], cp); + *(cp-1) = '='; + } +} + +void +iniopt(char *name, char *value) +{ + char *vedev; + int vetap; + + if(*name == '*') + name++; + if(strcmp(name, "bootboot") == 0) + bootboot = 1; + else if(strcmp(name, "initrc") == 0) + initrc = 1; + else if(strcmp(name, "nofork") == 0) + nofork = 1; + else if(strcmp(name, "localroot") == 0 && !localroot) + localroot = value; + else if(strcmp(name, "user") == 0 && !username) + username = value; + else if(strcmp(name, "usetty") == 0) + usetty = 1; + else if(strcmp(name, "macaddr") == 0) + setmac(value); + else if(strcmp(name, "netdev") == 0){ + if(strncmp(value, "tap", 3) == 0) { + vetap = 1; + value += 4; + } + vedev = value; + addve(vedev, vetap); + } + else if(strcmp(name, "nogui") == 0){ + nogui = 1; + usetty = 1; + } +} + +void +inienv(char *name, char *value) +{ + if(*name != '*') + ksetenv(name, value, 0); +} + +/* * Search for Plan 9 /386/bin/rc to find root. 
*/ static char* @@ -228,8 +436,7 @@ findroot(void) char buf[1024]; char *dir[] = { cwd, - "/Users/rsc/9vx", - "/home/rsc/plan9/4e" + "/usr/local/9vx" }; if(getcwd(cwd, sizeof cwd) == nil){ @@ -304,6 +511,10 @@ bootinit(void) */ extern uchar factotumcode[]; extern long factotumlen; + extern uchar fossilcode[]; + extern long fossillen; + extern uchar venticode[]; + extern long ventilen; if(bootboot){ extern uchar bootcode[]; @@ -314,6 +525,8 @@ bootinit(void) else addbootfile("boot", (uchar*)bootscript, strlen(bootscript)); addbootfile("factotum", factotumcode, factotumlen); + addbootfile("fossil", fossilcode, fossillen); + addbootfile("venti", venticode, ventilen); } static uchar *sp; /* user stack of init proc */ @@ -484,7 +697,8 @@ init0(void) ksetenv("service", "terminal", 0); ksetenv("user", username, 0); ksetenv("sysname", "vx32", 0); - + inifields(&inienv); + /* if we're not running /boot/boot, mount / and create /srv/boot */ if(!bootboot){ kbind("#Zplan9/", "/", MAFTER); @@ -556,8 +770,13 @@ sigsegv(int signo, siginfo_t *info, void *v) #elif defined(__FreeBSD__) mcontext_t *mc; mc = &uc->uc_mcontext; +#ifdef __i386__ eip = mc->mc_eip; esp = mc->mc_esp; +#elif defined(__amd64__) + eip = mc->mc_rip; + esp = mc->mc_rsp; +#endif addr = (ulong)info->si_addr; if(__FreeBSD__ < 7){ /* diff --git a/src/9vx/mmu.c b/src/9vx/mmu.c @@ -26,7 +26,7 @@ int tracemmu; * Plan 9 assumes this, and while it's not a ton of work to break that * assumption, it was easier not to. 
*/ -#define MEMSIZE (256<<20) +#define MEMSIZE (256<<20) // same as ../a/devether.c:13 (TODO: var) static int pagefile; static char* pagebase; @@ -35,6 +35,19 @@ static Uspace uspace[16]; static Uspace *ulist[nelem(uspace)]; int nuspace = 1; +#ifdef __i386__ +#define BIT32 0 +#define HINT nil +#elif defined(__amd64__) +#ifdef linux +#define BIT32 MAP_32BIT +#define HINT nil +#elif defined(__FreeBSD__) +#define BIT32 MAP_FIXED +#define HINT (caddr_t)0x40000000 +#endif +#endif + int isuaddr(void *v) { @@ -56,15 +69,14 @@ mapzero(void) { int fd, bit32; void *v; + void *hint; -#ifdef i386 - bit32 = 0; -#else - bit32 = MAP_32BIT; -#endif + bit32 = BIT32; + hint = HINT; + /* First try mmaping /dev/zero. Some OS'es don't allow this. */ if((fd = open("/dev/zero", O_RDONLY)) >= 0){ - v = mmap(nil, USTKTOP, PROT_NONE, bit32|MAP_PRIVATE, fd, 0); + v = mmap(hint, USTKTOP, PROT_NONE, bit32|MAP_PRIVATE, fd, 0); if(v != MAP_FAILED) { if((uint32_t)(uintptr)v != (uintptr)v) { iprint("mmap returned 64-bit pointer %p\n", v); @@ -75,7 +87,7 @@ mapzero(void) } /* Next try an anonymous map. 
*/ - v = mmap(nil, USTKTOP, PROT_NONE, bit32|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + v = mmap(hint, USTKTOP, PROT_NONE, bit32|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if(v != MAP_FAILED) { if((uint32_t)(uintptr)v != (uintptr)v) { iprint("mmap returned 64-bit pointer %p\n", v); diff --git a/src/9vx/sched.c b/src/9vx/sched.c @@ -174,7 +174,7 @@ struct Pwaiter }; void -plock(Psleep *p) +__plock(Psleep *p) { int r; @@ -193,7 +193,7 @@ plock(Psleep *p) } void -punlock(Psleep *p) +__punlock(Psleep *p) { int r; @@ -202,7 +202,7 @@ punlock(Psleep *p) } void -psleep(Psleep *p) +__psleep(Psleep *p) { int r; Pwaiter w; @@ -218,7 +218,7 @@ psleep(Psleep *p) } void -pwakeup(Psleep *p) +__pwakeup(Psleep *p) { int r; Pwaiter *w; diff --git a/src/9vx/sdloop.c b/src/9vx/sdloop.c @@ -22,6 +22,7 @@ struct Ctlr{ Chan *c; int mode; uvlong qidpath; + char fn[20]; }; static Lock ctlrlock; @@ -30,9 +31,47 @@ static Ctlr *ctlrtail; SDifc sdloopifc; +static void +loopopen(Ctlr *c) +{ + if(c->c == nil) + c->c = namec(c->fn, Aopen, c->mode, 0); +} + static SDev* looppnp(void) { + struct stat sbuf; + char c, c2; + char fn[20]; + + for(c = 'a'; c <= 'j'; ++c){ + sprint(fn, "#Z/dev/sd%c", c); + if(stat(fn+2, &sbuf) == 0) + loopdev(fn, ORDWR); + } + for(c = '0'; c <= '9'; ++c){ + sprintf(fn, "#Z/dev/sd%c",c); + if(stat(fn+2, &sbuf) == 0) + loopdev(fn, ORDWR); + } + for(c = 'a'; c <= 'j'; ++c){ + sprint(fn, "#Z/dev/hd%c", c); + if(stat(fn+2, &sbuf) == 0) + loopdev(fn, ORDWR); + } + for(c = '0'; c <= '9'; ++c){ + sprint(fn, "#Z/dev/wd%c", c); + if(stat(fn+2, &sbuf) == 0) + loopdev(fn, ORDWR); + } + for(c = '0'; c <= '8'; ++c){ + for(c2 = '0'; c2 <= '8'; ++c2){ + sprint(fn, "#Z/dev/cciss/c%cd%c", c, c2); + if(stat(fn+2, &sbuf) == 0) + loopdev(fn, ORDWR); + } + } return nil; } @@ -69,6 +108,7 @@ looponline(SDunit *unit) sdev = unit->dev; ctlr = sdev->ctlr; + loopopen(ctlr); c = ctlr->c; n = devtab[c->type]->stat(c, buf, sizeof buf); if(convM2D(buf, n, &dir, nil) == 0) @@ -99,6 +139,7 @@ looprio(SDreq *r) unit 
= r->unit; sdev = unit->dev; ctlr = sdev->ctlr; + loopopen(ctlr); cmd = r->cmd; if((status = sdfakescsi(r, nil, 0)) != SDnostatus){ @@ -141,6 +182,7 @@ looprctl(SDunit *unit, char *p, int l) char *e, *op; ctlr = unit->dev->ctlr; + loopopen(ctlr); e = p+l; op = p; @@ -170,7 +212,8 @@ loopclear1(Ctlr *ctlr) ctlrtail = ctlr->prev; unlock(&ctlrlock); - cclose(ctlr->c); + if(ctlr->c) + cclose(ctlr->c); free(ctlr); } @@ -187,6 +230,7 @@ looprtopctl(SDev *s, char *p, char *e) char *r; c = s->ctlr; + loopopen(c); r = "ro"; if(c->mode == ORDWR) r = "rw"; @@ -219,9 +263,9 @@ loopdev(char *name, int mode) Ctlr *volatile ctlr; SDev *volatile sdev; - c = namec(name, Aopen, mode, 0); ctlr = nil; sdev = nil; +/* if(waserror()){ cclose(c); if(ctlr) @@ -230,6 +274,7 @@ loopdev(char *name, int mode) free(sdev); nexterror(); } +*/ ctlr = smalloc(sizeof *ctlr); sdev = smalloc(sizeof *sdev); @@ -238,9 +283,11 @@ loopdev(char *name, int mode) sdev->nunit = 1; sdev->idno = '0'; ctlr->sdev = sdev; - ctlr->c = c; + strcpy(ctlr->fn, name); ctlr->mode = mode; +/* poperror(); +*/ lock(&ctlrlock); ctlr->next = nil; @@ -277,11 +324,5 @@ SDifc sdloopifc = { loopwtopctl, }; -SDifc *sdifc[] = -{ - &sdloopifc, - nil -}; - diff --git a/src/9vx/u.h b/src/9vx/u.h @@ -17,3 +17,4 @@ typedef int socklen_t; #define nil ((void*)0) #define sleep _ksleep #define syscall _ksyscall +#define atoi(x) strtol(x, 0, 0) diff --git a/src/9vx/venti.9 b/src/9vx/venti.9 Binary files differ. 
diff --git a/src/9vx/vether.c b/src/9vx/vether.c @@ -0,0 +1,122 @@ +#include "u.h" +#include "mem.h" +#include "lib.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include "ip/ip.h" +#include "netif.h" +#include "etherif.h" +#include "vether.h" +#include "sd.h" + +extern int nettap; +extern void ethertaplink(void); +extern void etherpcaplink(void); +extern void ethermediumlink(void); +extern void loopbackmediumlink(void); +extern void netdevmediumlink(void); + +extern void ilinit(Fs*); +extern void tcpinit(Fs*); +extern void udpinit(Fs*); +extern void ipifcinit(Fs*); +extern void icmpinit(Fs*); +extern void icmp6init(Fs*); +extern void greinit(Fs*); +extern void ipmuxinit(Fs*); +extern void espinit(Fs*); + +extern SDifc sdloopifc; +extern SDifc sdaoeifc; + +void +setmac(char *macaddr) +{ + int i; + char **nc = &macaddr; + + if(nve == 0) + return; + ve[nve-1].mac = macaddr; + for(i = 0; i < Eaddrlen; i++){ + ve[nve-1].ea[i] = (uchar)strtoul(macaddr, nc, 16); + macaddr = *nc+1; + } +} + +static int +eainuse(int n, uchar ea[Eaddrlen]) +{ + int i; + + for(i = 0; i < nve; i++) + if((i<n || ve[i].mac != nil) && memcmp(ea, ve[i].ea, Eaddrlen) == 0) + return -1; + return 0; +} + +void +addve(char *dev, int tap) +{ + if(nve == MaxEther) + panic("too many virtual ether cards"); + ve[nve].tap = tap; + ve[nve].dev = dev; + ve[nve].mac = nil; + nve++; +} + +void links(void) { + static uchar ea[Eaddrlen] = {0x00, 0x00, 0x09, 0x00, 0x00, 0x00}; + + ethermediumlink(); + loopbackmediumlink(); + netdevmediumlink(); + for(int i=0; i<nve; i++){ + if(ve[i].mac == nil){ + while(eainuse(i, ea)) + ea[5]++; + memcpy(ve[i].ea, ea, Eaddrlen); + } + if(ve[i].tap == 1) + ethertaplink(); + else + etherpcaplink(); + } +} + +void (*ipprotoinit[])(Fs*) = { + ilinit, + tcpinit, + udpinit, + ipifcinit, + icmpinit, + icmp6init, + greinit, + ipmuxinit, + espinit, + nil, +}; + +int +eafrom(char *ma, uchar ea[6]) +{ + int i; + char **nc = &ma; + + for(i = 0; i < 6; i++){ + if(!ma) + return 
-1; + ea[i] = (uchar)strtoul(ma, nc, 16); + ma = *nc+1; + } + return 0; +} + +SDifc *sdifc[] = +{ + &sdloopifc, + &sdaoeifc, + 0, +}; diff --git a/src/9vx/vether.h b/src/9vx/vether.h @@ -0,0 +1,15 @@ +typedef struct Vether Vether; +struct Vether +{ + int tap; + char *dev; + char *mac; + uchar ea[Eaddrlen]; +}; + +Vether ve[MaxEther+1]; +int nve; + +void setmac(char*); +void addve(char*, int); +void links(); diff --git a/src/libvx32/Makefrag b/src/libvx32/Makefrag @@ -1,8 +1,12 @@ ifeq ($(ARCH),x86_64) VX32_RUN = run64.o else +ifeq ($(ARCH),amd64) +VX32_RUN = run64.o +else VX32_RUN = run32.o endif +endif ifeq ($(OS),darwin) VX32_RUN := $(VX32_RUN) darwin-asm.o diff --git a/src/libvx32/freebsd.c b/src/libvx32/freebsd.c @@ -20,18 +20,34 @@ #warning "libvx32 and FreeBSD 5 and 6's libpthread are not compatible." #endif +#ifdef __i386__ static void setbase(struct segment_descriptor *desc, unsigned long base) +#elif defined __amd64__ +static void setbase(struct user_segment_descriptor *desc, unsigned long base) +#endif { desc->sd_lobase = base & 0xffffff; desc->sd_hibase = base >> 24; } +#ifdef __i386__ static void setlimit(struct segment_descriptor *desc, unsigned long limit) +#elif defined __amd64__ +static void setlimit(struct user_segment_descriptor *desc, unsigned long limit) +#endif { desc->sd_lolimit = limit & 0xffff; desc->sd_hilimit = limit >> 16; } +/* +#ifdef __amd64__ +union descriptor { + struct user_segment_descriptor sd; + struct gate_descriptor gd; +}; +#endif +*/ int vxemu_map(vxemu *emu, vxmmap *mm) { @@ -52,27 +68,44 @@ int vxemu_map(vxemu *emu, vxmmap *mm) desc.sd.sd_def32 = 1; desc.sd.sd_gran = 1; if(emu->datasel == 0){ +#ifdef __i386__ if ((s = i386_set_ldt(LDT_AUTO_ALLOC, &desc, 1)) < 0) +#elif defined __amd64__ + if ((s = sysarch(I386_SET_GSBASE, &desc)) < 0) +#endif return -1; emu->datasel = (s<<3) + 4 + 3; // 4=LDT, 3=RPL - }else if(i386_set_ldt(emu->datasel >> 3, &desc, 1) < 0) +#ifdef __i386__ + }else if (i386_set_ldt(emu->datasel >> 3, &desc, 
1) < 0) +#elif defined __amd64__ + }else if (sysarch(I386_SET_GSBASE, &desc) < 0) +#endif return -1; // Set up the process's vxemu segment selector (for FS). setbase(&desc.sd, (unsigned long)emu); setlimit(&desc.sd, (VXCODEBUFSIZE - 1) >> VXPAGESHIFT); if(emu->emusel == 0){ +#ifdef __i386__ if ((s = i386_set_ldt(LDT_AUTO_ALLOC, &desc, 1)) < 0) +#elif defined __amd64__ + if ((s = sysarch(I386_SET_GSBASE, &desc)) < 0) +#endif return -1; emu->emusel = (s<<3) + 4 + 3; // 4=LDT, 3=RPL - }else if(i386_set_ldt(emu->emusel >> 3, &desc, 1) < 0) +#ifdef __i386__ + }else if (i386_set_ldt(emu->emusel >> 3, &desc, 1) < 0) +#elif defined __amd64__ + }else if (sysarch(I386_SET_GSBASE, &desc) < 0) +#endif return -1; emu->ldt_base = (uintptr_t)mm->base; emu->ldt_size = mm->size; } -#ifdef __x86_64 +#ifdef __amd64__ +/* // Set up 32-bit mode code and data segments (not vxproc-specific), // giving access to the full low 32-bit of linear address space. // The code segment is necessary to get into 32-bit compatibility mode; @@ -80,11 +113,9 @@ int vxemu_map(vxemu *emu, vxmmap *mm) // doesn't give 64-bit processes a "real" data segment by default // but instead just loads zero into the data segment selectors! 
emu->runptr.sel = FLATCODE; - desc.entry_number = emu->runptr.sel / 8; - desc.base_addr = 0; - desc.limit = 0xfffff; - desc.contents = MODIFY_LDT_CONTENTS_CODE; - if (modify_ldt(1, &desc, sizeof(desc)) < 0) + setbase(&desc.sd, 0); + setlimit(&desc.sd, 0xfffff); + if ((s = sysarch(I386_SET_GSBASE, &desc)) < 0) return -1; desc.entry_number = FLATDATA / 8; @@ -97,6 +128,7 @@ int vxemu_map(vxemu *emu, vxmmap *mm) extern void vxrun_return(); asm volatile("movw %%cs,%0" : "=r" (emu->retptr.sel)); emu->retptr.ofs = (uint32_t)(intptr_t)vxrun_return; +*/ #endif return 0; @@ -122,28 +154,35 @@ static void dumpmcontext(mcontext_t *ctx, uint32_t cr2) "r12 %016lx r13 %016lx\nr14 %016lx r15 %016lx\n" "rip %016lx efl %016lx cs %04x ss %04x\n" "err %016lx trapno %016lx cr2 %016lx\n", - ctx->rax, ctx->rbx, ctx->rcx, ctx->rdx, - ctx->rsi, ctx->rdi, ctx->rbp, ctx->rsp, - ctx->r8, ctx->r9, ctx->r10, ctx->r11, - ctx->r12, ctx->r13, ctx->r14, ctx->r15, - ctx->rip, ctx->eflags, ctx->cs, ctx->__pad0, - ctx->err, ctx->trapno, ctx->cr2); + ctx->mc_rax, ctx->mc_rbx, ctx->mc_rcx, ctx->mc_rdx, + ctx->mc_rsi, ctx->mc_rdi, ctx->mc_rbp, ctx->mc_rsp, + ctx->mc_r8, ctx->mc_r9, ctx->mc_r10, ctx->mc_r11, + ctx->mc_r12, ctx->mc_r13, ctx->mc_r14, ctx->mc_r15, + ctx->mc_rip, ctx->mc_rflags, ctx->mc_cs, ctx->mc_ss, + ctx->mc_err, ctx->mc_trapno, cr2); #endif } static void fprestore(int *state, int fmt) { +#ifdef __i386__ if(fmt == _MC_FPFMT_387) asm volatile("frstor 0(%%eax); fwait\n" : : "a" (state) : "memory"); - else if(fmt == _MC_FPFMT_XMM){ + else +#endif + if(fmt == _MC_FPFMT_XMM){ /* Have to 16-align the 512-byte state */ char buf[512+16], *p; p = buf; if((long)p&15) p += 16 - (long)p&15; memmove(p, state, 512); +#ifdef __i386__ asm volatile("fxrstor 0(%%eax); fwait\n" : : "a" (p) : "memory"); +#elif defined(__amd64__) + asm volatile("fxrstor 0(%%rax); fwait\n" : : "a" (p) : "memory"); +#endif }else abort(); } @@ -167,12 +206,22 @@ int vx32_sighandler(int signo, siginfo_t *si, void *v) // First 
sanity check vxproc segment number. // FreeBSD reset the register before entering the handler! +#ifdef __i386__ asm("movw %"VSEGSTR",%0" : "=r" (oldvs)); vs = mc->mc_vs & 0xFFFF; /* mc_vs #defined in os.h */ +#elif defined(__amd64__) + if (sysarch(I386_GET_GSBASE, &vs) < 0) + return 0; +#endif +#ifdef __i386__ if(0) vxprint("vx32_sighandler signo=%d eip=%#x esp=%#x vs=%#x currentvs=%#x\n", signo, mc->mc_eip, mc->mc_esp, vs, oldvs); +#elif defined(__amd64__) + if(0) vxprint("vx32_sighandler signo=%d rip=%#x rsp=%#x vs=%#x currentvs=%#x\n", + signo, mc->mc_rip, mc->mc_rsp, vs, oldvs); +#endif if ((vs & 7) != 7) // LDT, RPL=3 return 0; @@ -192,12 +241,21 @@ int vx32_sighandler(int signo, siginfo_t *si, void *v) // Okay, we're convinced. // Find current vxproc and vxemu. +#ifdef __i386__ asm("movw %"VSEGSTR",%1\n" "movw %2,%"VSEGSTR"\n" "movl %"VSEGSTR":%3,%0\n" "movw %1,%"VSEGSTR"\n" : "=r" (vxp), "=r" (oldvs) : "r" (vs), "m" (((vxemu*)0)->proc)); +#elif defined(__amd64__) + asm("movw %"VSEGSTR",%1\n" + "movw %2,%"VSEGSTR"\n" + "movw %"VSEGSTR":%3,%0\n" + "movw %1,%"VSEGSTR"\n" + : "=r" (vxp), "=r" (oldvs) + : "r" (vs), "m" (((vxemu*)0)->proc)); +#endif emu = vxp->emu; // Get back our regular host segment register state, @@ -212,7 +270,11 @@ int vx32_sighandler(int signo, siginfo_t *si, void *v) switch(signo){ case SIGSEGV: newtrap = VXTRAP_PAGEFAULT; +#ifdef __i386__ addr = (uint32_t)si->si_addr; +#elif defined(__amd64__) + addr = (uint64_t)si->si_addr; +#endif break; case SIGBUS: /* @@ -242,7 +304,11 @@ int vx32_sighandler(int signo, siginfo_t *si, void *v) // before entering the signal handler. addr = 0; newtrap = VXTRAP_SINGLESTEP; +#ifdef __i386__ mc->mc_eflags &= ~EFLAGS_TF; // Just in case. +#elif defined(__amd64__) + mc->mc_rflags &= ~EFLAGS_TF; // Just in case. 
+#endif break; default: @@ -264,51 +330,111 @@ int vx32_sighandler(int signo, siginfo_t *si, void *v) } emu->cpu_trap = newtrap; +#ifdef __i386__ r = vxemu_sighandler(emu, mc->mc_eip); +#elif defined(__amd64__) + r = vxemu_sighandler(emu, mc->mc_rip); +#endif if (r == VXSIG_SINGLESTEP){ // Vxemu_sighandler wants us to single step. // Execution state is in intermediate state - don't touch. +#ifdef __i386__ mc->mc_eflags |= EFLAGS_TF; // x86 TF (single-step) bit +#elif defined(__amd64__) + mc->mc_rflags |= EFLAGS_TF; +#endif vxrun_setup(emu); return 1; } // Copy execution state into emu. if ((r & VXSIG_SAVE_ALL) == VXSIG_SAVE_ALL) { +#ifdef __i386__ emu->cpu.reg[EAX] = mc->mc_eax; emu->cpu.reg[EBX] = mc->mc_ebx; emu->cpu.reg[ECX] = mc->mc_ecx; emu->cpu.reg[EDX] = mc->mc_edx; - emu->cpu.reg[ESI] = mc->mc_esi; + emu->cpu.reg[ESI] = mc->mc_esi; emu->cpu.reg[EDI] = mc->mc_edi; emu->cpu.reg[ESP] = mc->mc_esp; // or esp_at_signal ??? emu->cpu.reg[EBP] = mc->mc_ebp; emu->cpu.eflags = mc->mc_eflags; +#elif defined(__amd64__) + emu->cpu.reg[EAX] = mc->mc_rax; + emu->cpu.reg[EBX] = mc->mc_rbx; + emu->cpu.reg[ECX] = mc->mc_rcx; + emu->cpu.reg[EDX] = mc->mc_rdx; + emu->cpu.reg[ESI] = mc->mc_rsi; + emu->cpu.reg[EDI] = mc->mc_rdi; + emu->cpu.reg[ESP] = mc->mc_rsp; // or esp_at_signal ??? 
+ emu->cpu.reg[EBP] = mc->mc_rbp; + emu->cpu.eflags = mc->mc_rflags; +#endif } else if (r & VXSIG_SAVE_ALL) { if (r & VXSIG_SAVE_EAX) +#ifdef __i386__ emu->cpu.reg[EAX] = mc->mc_eax; +#elif defined(__amd64__) + emu->cpu.reg[EAX] = mc->mc_rax; +#endif if (r & VXSIG_SAVE_EBX) +#ifdef __i386__ emu->cpu.reg[EBX] = mc->mc_ebx; +#elif defined(__amd64__) + emu->cpu.reg[EBX] = mc->mc_rbx; +#endif if (r & VXSIG_SAVE_ECX) +#ifdef __i386__ emu->cpu.reg[ECX] = mc->mc_ecx; +#elif defined(__amd64__) + emu->cpu.reg[ECX] = mc->mc_rcx; +#endif if (r & VXSIG_SAVE_EDX) +#ifdef __i386__ emu->cpu.reg[EDX] = mc->mc_edx; +#elif defined(__amd64__) + emu->cpu.reg[EDX] = mc->mc_rdx; +#endif if (r & VXSIG_SAVE_ESI) +#ifdef __i386__ emu->cpu.reg[ESI] = mc->mc_esi; +#elif defined(__amd64__) + emu->cpu.reg[ESI] = mc->mc_rsi; +#endif if (r & VXSIG_SAVE_EDI) +#ifdef __i386__ emu->cpu.reg[EDI] = mc->mc_edi; +#elif defined(__amd64__) + emu->cpu.reg[EDI] = mc->mc_rdi; +#endif if (r & VXSIG_SAVE_ESP) +#ifdef __i386__ emu->cpu.reg[ESP] = mc->mc_esp; // or esp_at_signal ??? +#elif defined(__amd64__) + emu->cpu.reg[ESP] = mc->mc_rsp; // or esp_at_signal ??? 
+#endif if (r & VXSIG_SAVE_EBP) +#ifdef __i386__ emu->cpu.reg[EBP] = mc->mc_ebp; +#elif defined(__amd64__) + emu->cpu.reg[EBP] = mc->mc_rbp; +#endif if (r & VXSIG_SAVE_EFLAGS) +#ifdef __i386__ emu->cpu.eflags = mc->mc_eflags; +#elif defined(__amd64__) + emu->cpu.eflags = mc->mc_rflags; +#endif } r &= ~VXSIG_SAVE_ALL; if (r & VXSIG_SAVE_EBX_AS_EIP) +#ifdef __i386__ emu->cpu.eip = mc->mc_ebx; +#elif defined(__amd64__) + emu->cpu.eip = mc->mc_rbx; +#endif r &= ~VXSIG_SAVE_EBX_AS_EIP; if (r & VXSIG_ADD_COUNT_TO_ESP) { @@ -327,7 +453,11 @@ int vx32_sighandler(int signo, siginfo_t *si, void *v) return 0; emu->cpu.traperr = mc->mc_err; emu->cpu.trapva = addr; +#ifdef __i386__ memmove(&mc->mc_gs, &emu->trapenv->mc_gs, 19*4); +#elif defined(__amd64__) + memmove(&mc->mc_onstack, &emu->trapenv->mc_onstack, sizeof(mcontext_t)); +#endif return 1; } diff --git a/src/libvx32/run64.S b/src/libvx32/run64.S @@ -79,7 +79,11 @@ vxrun: movl VXEMU_EDI(%r8),%edi // Run translated code +#ifndef __FreeBSD__ ljmpl *VXEMU_RUNPTR(%r8) // 'ljmpq' doesn't work - gas bug?? +#else + ljmpq *VXEMU_RUNPTR(%r8) +#endif // Return from running translated code to the normal host environment.