tcp.c (66246B)
#include	"u.h"
#include	"lib.h"
#include	"mem.h"
#include	"dat.h"
#include	"fns.h"
#include	"error.h"

#include	"ip.h"

/*
 *  Protocol constants: header geometry, timer parameters, TCP flag
 *  bits, option codes, Tcpctl.flags bits and connection states.
 */
enum
{
	QMAX		= 64*1024-1,
	IP_TCPPROTO	= 6,

	/*
	 *  TCPn_IPLEN bytes at the front of a Tcpnhdr are skipped when
	 *  checksumming; the following TCPn_PHDRSIZE bytes are the pseudo
	 *  header; TCPn_PKT is their sum, i.e. the offset of the TCP ports.
	 *  TCPn_TCBPHDRSZ is how much of the prototype header htontcpn copies.
	 */
	TCP4_IPLEN	= 8,
	TCP4_PHDRSIZE	= 12,
	TCP4_HDRSIZE	= 20,
	TCP4_TCBPHDRSZ	= 40,
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	TCP6_IPLEN	= 0,
	TCP6_PHDRSIZE	= 40,
	TCP6_HDRSIZE	= 20,
	TCP6_TCBPHDRSZ	= 60,
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* Tcptimer.state values */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,
	MAX_TIME 	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	/* flag bits in the TCP header */
	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	/* TCP option kinds and the total length of each option */
	EOLOPT		= 0,
	NOOPOPT		= 1,
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* length in bytes of the max segment size option */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* length in bytes of the window scale option */
	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment size */
	DEF_MSS6	= 1280,		/* Default maximum segment size (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshold for rxt */

	/* bits kept in Tcpctl.flags */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	LOGAGAIN	= 3,		/* srtt is initialised as tcp_irtt<<LOGAGAIN */
	LOGDGAIN	= 2,		/* presumably the matching shift for mdev -- TODO confirm */

	Closed		= 0,		/* Connection states */
	Listen,
	Syn_sent,
	Syn_received,
	Established,
	Finwait1,
	Finwait2,
	Close_wait,
	Closing,
	Last_ack,
	Time_wait,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	HaveWS		= 1<<8,		/* or'ed into a window scale value to mark the option present */
};

/* Must correspond to the enumeration above */
char *tcpstates[] =
{
	"Closed", 	"Listen", 	"Syn_sent", "Syn_received",
	"Established", 	"Finwait1",	"Finwait2", "Close_wait",
	"Closing", 	"Last_ack", 	"Time_wait"
};

/*
 *  A tick-driven timer.  Running timers are chained through next/prev
 *  on Tcppriv.timers; tcpackproc decrements count once per tick and
 *  calls func(arg) when it reaches zero.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next;		/* chain of running timers */
	Tcptimer	*prev;
	Tcptimer	*readynext;	/* list of timers that expired this tick */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* reload value in ticks; 0 means never start */
	int	count;			/* ticks left before expiry */
	void	(*func)(void*);		/* called on expiry, may be nil */
	void	*arg;
};

/*
 *  v4 and v6 pseudo headers used for
 *  checksumming tcp
 */
typedef struct Tcp4hdr Tcp4hdr;
struct Tcp4hdr
{
	/* first TCP4_IPLEN bytes: not covered by the checksum */
	uchar	vihl;		/* Version and header length */
	uchar	tos;		/* Type of service */
	uchar	length[2];	/* packet length */
	uchar	id[2];		/* Identification */
	uchar	frag[2];	/* Fragment information */
	/* next TCP4_PHDRSIZE bytes: the pseudo header */
	uchar	Unused;
	uchar	proto;
	uchar	tcplen[2];
	uchar	tcpsrc[4];
	uchar	tcpdst[4];
	/* TCP header proper, TCP4_HDRSIZE bytes */
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* (header length in bytes)<<10 | flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};

typedef struct Tcp6hdr Tcp6hdr;
struct Tcp6hdr
{
	/* TCP6_PHDRSIZE bytes; htontcp6 reuses vcf and ttl while checksumming */
	uchar	vcf[4];
	uchar	ploadlen[2];
	uchar	proto;
	uchar	ttl;
	uchar	tcpsrc[IPaddrlen];
	uchar	tcpdst[IPaddrlen];
	/* TCP header proper, TCP6_HDRSIZE bytes */
	uchar	tcpsport[2];
	uchar	tcpdport[2];
	uchar	tcpseq[4];
	uchar	tcpack[4];
	uchar	tcpflag[2];	/* (header length in bytes)<<10 | flag bits */
	uchar	tcpwin[2];
	uchar	tcpcksum[2];
	uchar	tcpurg[2];
	/* Options segment */
	uchar	tcpopt[1];
};

/*
 *  this represents the control info
 *  for a single packet.  It is derived from
 *  a packet in ntohtcp{4,6}() and stuck into
 *  a packet in htontcp{4,6}().
 */
typedef struct Tcp Tcp;
struct	Tcp
{
	ushort	source;		/* source port */
	ushort	dest;		/* destination port */
	ulong	seq;
	ulong	ack;
	uchar	flags;		/* URG, ACK, PSH, RST, SYN, FIN bits */
	ushort	ws;		/* window scale option (if not zero) */
	ulong	wnd;
	ushort	urg;
	ushort	mss;		/* max segment size option (if not zero) */
	ushort	len;		/* size of data */
};

/*
 *  this header is malloc'd to thread together fragments
 *  waiting to be coalesced
 */
typedef struct Reseq Reseq;
struct Reseq
{
	Reseq	*next;
	Tcp	seg;
	Block	*bp;
	ushort	length;
};

/*
 *  the QLOCK in the Conv locks this structure
 */
typedef struct Tcpctl Tcpctl;
struct Tcpctl
{
	uchar	state;			/* Connection state */
	uchar	type;			/* Listening or active connection */
	uchar	code;			/* Icmp code */
	struct {
		ulong	una;		/* Unacked data pointer */
		ulong	nxt;		/* Next sequence expected */
		ulong	ptr;		/* Data pointer */
		ulong	wnd;		/* Tcp send window */
		ulong	urg;		/* Urgent data pointer */
		ulong	wl2;		/* set to iss by tcpsndsyn; other uses not visible here */
		int	scale;		/* how much to right shift window in xmitted packets */
		/* to implement tahoe and reno TCP */
		ulong	dupacks;	/* number of duplicate acks rcvd */
		int	recovery;	/* loss recovery flag */
		ulong	rxt;		/* right window marker for recovery */
	} snd;
	struct {
		ulong	nxt;		/* Receive pointer to next uchar slot */
		ulong	wnd;		/* Receive window incoming */
		ulong	urg;		/* Urgent pointer */
		int	blocked;	/* set by tcprcvwin when the receive window closes to 0 */
		int	una;		/* unacked data segs */
		int	scale;		/* how much to left shift window in rcved packets */
	} rcv;
	ulong	iss;			/* Initial sequence number */
	int	sawwsopt;		/* true if we saw a wsopt on the incoming SYN */
	ulong	cwind;			/* Congestion window */
	int	scale;			/* desired snd.scale */
	ushort	ssthresh;		/* Slow start threshold */
	int	resent;			/* Bytes just resent */
	int	irs;			/* Initial received sequence */
	ushort	mss;			/* Maximum segment size */
	int	rerecv;			/* Overlap of data re-received */
	ulong	window;			/* Receive window */
	uchar	backoff;		/* Exponential backoff counter */
	int	backedoff;		/* ms we've backed off for rexmits */
	uchar	flags;			/* State flags (FORCE, CLONE, RETRAN, ACTIVE, SYNACK) */
	Reseq	*reseq;			/* Resequencing queue */
	Tcptimer	timer;			/* Activity timer */
	Tcptimer	acktimer;		/* Acknowledge timer */
	Tcptimer	rtt_timer;		/* Round trip timer */
	Tcptimer	katimer;		/* keep alive timer */
	ulong	rttseq;			/* Round trip sequence */
	int	srtt;			/* Smoothed round trip; initialised as tcp_irtt<<LOGAGAIN */
	int	mdev;			/* Mean deviation of round trip */
	int	kacounter;		/* count down for keep alive */
	uint	sndsyntime;		/* time syn sent */
	ulong	time;			/* time Finwait2 or Syn_received was sent */
	int	nochecksum;		/* non-zero means don't send checksums */
	int	flgcnt;			/* number of flags in the sequence (FIN,SYN) */

	union {
		Tcp4hdr	tcp4hdr;
		Tcp6hdr	tcp6hdr;
	} protohdr;		/* prototype header */
};

/*
 *  New calls are put in limbo rather than having a conversation structure
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
 *
 *  In particular they aren't on a listener's queue so that they don't figure
 *  in the input queue limit.
 *
 *  If 1/2 of a T3 was attacking SYN packets, we'd have a permanent queue
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
 *  there is no hashing of this list.
 *
 *  NOTE(review): Tcppriv.lht is an array of NLHT list heads and NLHT is
 *  described as a hash table size, so the "no hashing" remark looks
 *  stale -- confirm against the limbo code.
 */
typedef struct Limbo Limbo;
struct Limbo
{
	Limbo	*next;

	uchar	laddr[IPaddrlen];
	uchar	raddr[IPaddrlen];
	ushort	lport;
	ushort	rport;
	ulong	irs;		/* initial received sequence */
	ulong	iss;		/* initial sent sequence */
	ushort	mss;		/* mss from the other end */
	ushort	rcvscale;	/* how much to scale rcvd windows */
	ushort	sndscale;	/* how much to scale sent windows */
	ulong	lastsend;	/* last time we sent a synack */
	uchar	version;	/* v4 or v6 */
	uchar	rexmits;	/* number of retransmissions */
};

int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time */
ushort	tcp_mss = DEF_MSS;	/* Maximum segment size to be sent */

/* indices into Tcppriv.stats; statnames below is indexed the same way */
enum {
	/* MIB stats */
	MaxConn,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	OutOfOrder,

	Nstats
};

/* printable name of each statistic, indexed by the enumeration above */
static char *statnames[] =
{
[MaxConn]	"MaxConn",
[ActiveOpens]	"ActiveOpens",
[PassiveOpens]	"PassiveOpens",
[EstabResets]	"EstabResets",
[CurrEstab]	"CurrEstab",
[InSegs]	"InSegs",
[OutSegs]	"OutSegs",
[RetransSegs]	"RetransSegs",
[RetransTimeouts]	"RetransTimeouts",
[InErrs]	"InErrs",
[OutRsts]	"OutRsts",
[CsumErrs]	"CsumErrs",
[HlenErrs]	"HlenErrs",
[LenErrs]	"LenErrs",
[OutOfOrder]	"OutOfOrder",
};

/*
 *  per-protocol-instance state, reached through Proto.priv
 */
typedef struct Tcppriv Tcppriv;
struct Tcppriv
{
	/* List of active timers */
	QLock 	tl;		/* taken around every change to the timers list */
	Tcptimer *timers;

	/* hash table for matching conversations */
	Ipht	ht;

	/* calls in limbo waiting for an ACK to our SYN ACK */
	int	nlimbo;
	Limbo	*lht[NLHT];

	/* for keeping track of tcpackproc */
	QLock	apl;
	int	ackprocstarted;

	ulong	stats[Nstats];
};

/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
355 * solution to hijacked systems staking out port's as a form 356 * of DoS attack. 357 * 358 * To avoid stateless Conv hogs, we pick a sequence number at random. If 359 * that number gets acked by the other end, we shut down the connection. 360 * Look for tcpporthogdefense in the code. 361 */ 362 int tcpporthogdefense = 0; 363 364 int addreseq(Tcpctl*, Tcppriv*, Tcp*, Block*, ushort); 365 void getreseq(Tcpctl*, Tcp*, Block**, ushort*); 366 void localclose(Conv*, char*); 367 void procsyn(Conv*, Tcp*); 368 void tcpiput(Proto*, Ipifc*, Block*); 369 void tcpoutput(Conv*); 370 int tcptrim(Tcpctl*, Tcp*, Block**, ushort*); 371 void tcpstart(Conv*, int); 372 void tcptimeout(void*); 373 void tcpsndsyn(Conv*, Tcpctl*); 374 void tcprcvwin(Conv*); 375 void tcpacktimer(void*); 376 void tcpkeepalive(void*); 377 void tcpsetkacounter(Tcpctl*); 378 void tcprxmit(Conv*); 379 void tcpsettimer(Tcpctl*); 380 void tcpsynackrtt(Conv*); 381 void tcpsetscale(Conv*, Tcpctl*, ushort, ushort); 382 383 static void limborexmit(Proto*); 384 static void limbo(Conv*, uchar*, uchar*, Tcp*, int); 385 386 void 387 tcpsetstate(Conv *s, uchar newstate) 388 { 389 Tcpctl *tcb; 390 uchar oldstate; 391 Tcppriv *tpriv; 392 393 tpriv = s->p->priv; 394 395 tcb = (Tcpctl*)s->ptcl; 396 397 oldstate = tcb->state; 398 if(oldstate == newstate) 399 return; 400 401 if(oldstate == Established) 402 tpriv->stats[CurrEstab]--; 403 if(newstate == Established) 404 tpriv->stats[CurrEstab]++; 405 406 /** 407 print( "%d/%d %s->%s CurrEstab=%d\n", s->lport, s->rport, 408 tcpstates[oldstate], tcpstates[newstate], tpriv->tstats.tcpCurrEstab ); 409 **/ 410 411 switch(newstate) { 412 case Closed: 413 qclose(s->rq); 414 qclose(s->wq); 415 qclose(s->eq); 416 break; 417 418 case Close_wait: /* Remote closes */ 419 qhangup(s->rq, nil); 420 break; 421 } 422 423 tcb->state = newstate; 424 425 if(oldstate == Syn_sent && newstate != Closed) 426 Fsconnected(s, nil); 427 } 428 429 static char* 430 tcpconnect(Conv *c, char **argv, int 
argc) 431 { 432 char *e; 433 Tcpctl *tcb; 434 435 tcb = (Tcpctl*)(c->ptcl); 436 if(tcb->state != Closed) 437 return Econinuse; 438 439 e = Fsstdconnect(c, argv, argc); 440 if(e != nil) 441 return e; 442 tcpstart(c, TCP_CONNECT); 443 444 return nil; 445 } 446 447 static int 448 tcpstate(Conv *c, char *state, int n) 449 { 450 Tcpctl *s; 451 452 s = (Tcpctl*)(c->ptcl); 453 454 return snprint(state, n, 455 "%s qin %d qout %d srtt %d mdev %d cwin %lud swin %lud>>%d rwin %lud>>%d timer.start %d timer.count %d rerecv %d katimer.start %d katimer.count %d\n", 456 tcpstates[s->state], 457 c->rq ? qlen(c->rq) : 0, 458 c->wq ? qlen(c->wq) : 0, 459 s->srtt, s->mdev, 460 s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale, 461 s->timer.start, s->timer.count, s->rerecv, 462 s->katimer.start, s->katimer.count); 463 } 464 465 static int 466 tcpinuse(Conv *c) 467 { 468 Tcpctl *s; 469 470 s = (Tcpctl*)(c->ptcl); 471 return s->state != Closed; 472 } 473 474 static char* 475 tcpannounce(Conv *c, char **argv, int argc) 476 { 477 char *e; 478 Tcpctl *tcb; 479 480 tcb = (Tcpctl*)(c->ptcl); 481 if(tcb->state != Closed) 482 return Econinuse; 483 484 e = Fsstdannounce(c, argv, argc); 485 if(e != nil) 486 return e; 487 tcpstart(c, TCP_LISTEN); 488 Fsconnected(c, nil); 489 490 return nil; 491 } 492 493 /* 494 * tcpclose is always called with the q locked 495 */ 496 static void 497 tcpclose(Conv *c) 498 { 499 Tcpctl *tcb; 500 501 tcb = (Tcpctl*)c->ptcl; 502 503 qhangup(c->rq, nil); 504 qhangup(c->wq, nil); 505 qhangup(c->eq, nil); 506 qflush(c->rq); 507 508 switch(tcb->state) { 509 case Listen: 510 /* 511 * reset any incoming calls to this listener 512 */ 513 Fsconnected(c, "Hangup"); 514 515 localclose(c, nil); 516 break; 517 case Closed: 518 case Syn_sent: 519 localclose(c, nil); 520 break; 521 case Syn_received: 522 case Established: 523 tcb->flgcnt++; 524 tcb->snd.nxt++; 525 tcpsetstate(c, Finwait1); 526 tcpoutput(c); 527 break; 528 case Close_wait: 529 tcb->flgcnt++; 530 
tcb->snd.nxt++; 531 tcpsetstate(c, Last_ack); 532 tcpoutput(c); 533 break; 534 } 535 } 536 537 void 538 tcpkick(void *x) 539 { 540 Conv *s = x; 541 Tcpctl *tcb; 542 543 tcb = (Tcpctl*)s->ptcl; 544 545 if(waserror()){ 546 QUNLOCK(s); 547 nexterror(); 548 } 549 QLOCK(s); 550 551 switch(tcb->state) { 552 case Syn_sent: 553 case Syn_received: 554 case Established: 555 case Close_wait: 556 /* 557 * Push data 558 */ 559 tcprcvwin(s); 560 tcpoutput(s); 561 break; 562 default: 563 localclose(s, "Hangup"); 564 break; 565 } 566 567 QUNLOCK(s); 568 poperror(); 569 } 570 571 void 572 tcprcvwin(Conv *s) /* Call with tcb locked */ 573 { 574 int w; 575 Tcpctl *tcb; 576 577 tcb = (Tcpctl*)s->ptcl; 578 w = tcb->window - qlen(s->rq); 579 if(w < 0) 580 w = 0; 581 tcb->rcv.wnd = w; 582 if(w == 0) 583 tcb->rcv.blocked = 1; 584 } 585 586 void 587 tcpacktimer(void *v) 588 { 589 Tcpctl *tcb; 590 Conv *s; 591 592 s = v; 593 tcb = (Tcpctl*)s->ptcl; 594 595 if(waserror()){ 596 QUNLOCK(s); 597 nexterror(); 598 } 599 QLOCK(s); 600 if(tcb->state != Closed){ 601 tcb->flags |= FORCE; 602 tcprcvwin(s); 603 tcpoutput(s); 604 } 605 QUNLOCK(s); 606 poperror(); 607 } 608 609 static void 610 tcpcreate(Conv *c) 611 { 612 c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c); 613 c->wq = qopen((3*QMAX)/2, Qkick, tcpkick, c); 614 } 615 616 static void 617 timerstate(Tcppriv *priv, Tcptimer *t, int newstate) 618 { 619 if(newstate != TcptimerON){ 620 if(t->state == TcptimerON){ 621 /* unchain */ 622 if(priv->timers == t){ 623 priv->timers = t->next; 624 if(t->prev != nil) 625 panic("timerstate1"); 626 } 627 if(t->next) 628 t->next->prev = t->prev; 629 if(t->prev) 630 t->prev->next = t->next; 631 t->next = t->prev = nil; 632 } 633 } else { 634 if(t->state != TcptimerON){ 635 /* chain */ 636 if(t->prev != nil || t->next != nil) 637 panic("timerstate2"); 638 t->prev = nil; 639 t->next = priv->timers; 640 if(t->next) 641 t->next->prev = t; 642 priv->timers = t; 643 } 644 } 645 t->state = newstate; 646 } 647 648 void 
649 tcpackproc(void *a) 650 { 651 Tcptimer *t, *tp, *timeo; 652 Proto *tcp; 653 Tcppriv *priv; 654 int loop; 655 656 tcp = a; 657 priv = tcp->priv; 658 659 for(;;) { 660 tsleep(&up->sleep, return0, 0, MSPTICK); 661 662 qlock(&priv->tl); 663 timeo = nil; 664 loop = 0; 665 for(t = priv->timers; t != nil; t = tp) { 666 if(loop++ > 10000) 667 panic("tcpackproc1"); 668 tp = t->next; 669 if(t->state == TcptimerON) { 670 t->count--; 671 if(t->count == 0) { 672 timerstate(priv, t, TcptimerDONE); 673 t->readynext = timeo; 674 timeo = t; 675 } 676 } 677 } 678 qunlock(&priv->tl); 679 680 loop = 0; 681 for(t = timeo; t != nil; t = t->readynext) { 682 if(loop++ > 10000) 683 panic("tcpackproc2"); 684 if(t->state == TcptimerDONE && t->func != nil && !waserror()){ 685 (*t->func)(t->arg); 686 poperror(); 687 } 688 } 689 690 limborexmit(tcp); 691 } 692 } 693 694 void 695 tcpgo(Tcppriv *priv, Tcptimer *t) 696 { 697 if(t == nil || t->start == 0) 698 return; 699 700 qlock(&priv->tl); 701 t->count = t->start; 702 timerstate(priv, t, TcptimerON); 703 qunlock(&priv->tl); 704 } 705 706 void 707 tcphalt(Tcppriv *priv, Tcptimer *t) 708 { 709 if(t == nil) 710 return; 711 712 qlock(&priv->tl); 713 timerstate(priv, t, TcptimerOFF); 714 qunlock(&priv->tl); 715 } 716 717 int 718 backoff(int n) 719 { 720 return 1 << n; 721 } 722 723 void 724 localclose(Conv *s, char *reason) /* called with tcb locked */ 725 { 726 Tcpctl *tcb; 727 Reseq *rp,*rp1; 728 Tcppriv *tpriv; 729 730 tpriv = s->p->priv; 731 tcb = (Tcpctl*)s->ptcl; 732 733 iphtrem(&tpriv->ht, s); 734 735 tcphalt(tpriv, &tcb->timer); 736 tcphalt(tpriv, &tcb->rtt_timer); 737 tcphalt(tpriv, &tcb->acktimer); 738 tcphalt(tpriv, &tcb->katimer); 739 740 /* Flush reassembly queue; nothing more can arrive */ 741 for(rp = tcb->reseq; rp != nil; rp = rp1) { 742 rp1 = rp->next; 743 freeblist(rp->bp); 744 free(rp); 745 } 746 tcb->reseq = nil; 747 748 if(tcb->state == Syn_sent) 749 Fsconnected(s, reason); 750 if(s->state == Announced) 751 
wakeup(&s->listenr); 752 753 qhangup(s->rq, reason); 754 qhangup(s->wq, reason); 755 756 tcpsetstate(s, Closed); 757 } 758 759 /* mtu (- TCP + IP hdr len) of 1st hop */ 760 int 761 tcpmtu(Proto *tcp, uchar *addr, int version, int *scale) 762 { 763 Ipifc *ifc; 764 int mtu; 765 766 ifc = findipifc(tcp->f, addr, 0); 767 switch(version){ 768 default: 769 case V4: 770 mtu = DEF_MSS; 771 if(ifc != nil) 772 mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE); 773 break; 774 case V6: 775 mtu = DEF_MSS6; 776 if(ifc != nil) 777 mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE); 778 break; 779 } 780 if(ifc != nil){ 781 if(ifc->mbps > 1000) 782 *scale = HaveWS | 4; 783 else if(ifc->mbps > 100) 784 *scale = HaveWS | 3; 785 else if(ifc->mbps > 10) 786 *scale = HaveWS | 1; 787 else 788 *scale = HaveWS | 0; 789 } else 790 *scale = HaveWS | 0; 791 792 return mtu; 793 } 794 795 void 796 inittcpctl(Conv *s, int mode) 797 { 798 Tcpctl *tcb; 799 Tcp4hdr* h4; 800 Tcp6hdr* h6; 801 int mss; 802 803 tcb = (Tcpctl*)s->ptcl; 804 805 memset(tcb, 0, sizeof(Tcpctl)); 806 807 tcb->ssthresh = 65535; 808 tcb->srtt = tcp_irtt<<LOGAGAIN; 809 tcb->mdev = 0; 810 811 /* setup timers */ 812 tcb->timer.start = tcp_irtt / MSPTICK; 813 tcb->timer.func = tcptimeout; 814 tcb->timer.arg = s; 815 tcb->rtt_timer.start = MAX_TIME; 816 tcb->acktimer.start = TCP_ACK / MSPTICK; 817 tcb->acktimer.func = tcpacktimer; 818 tcb->acktimer.arg = s; 819 tcb->katimer.start = DEF_KAT / MSPTICK; 820 tcb->katimer.func = tcpkeepalive; 821 tcb->katimer.arg = s; 822 823 mss = DEF_MSS; 824 825 /* create a prototype(pseudo) header */ 826 if(mode != TCP_LISTEN){ 827 if(ipcmp(s->laddr, IPnoaddr) == 0) 828 findlocalip(s->p->f, s->laddr, s->raddr); 829 830 switch(s->ipversion){ 831 case V4: 832 h4 = &tcb->protohdr.tcp4hdr; 833 memset(h4, 0, sizeof(*h4)); 834 h4->proto = IP_TCPPROTO; 835 hnputs(h4->tcpsport, s->lport); 836 hnputs(h4->tcpdport, s->rport); 837 v6tov4(h4->tcpsrc, s->laddr); 838 v6tov4(h4->tcpdst, 
s->raddr); 839 break; 840 case V6: 841 h6 = &tcb->protohdr.tcp6hdr; 842 memset(h6, 0, sizeof(*h6)); 843 h6->proto = IP_TCPPROTO; 844 hnputs(h6->tcpsport, s->lport); 845 hnputs(h6->tcpdport, s->rport); 846 ipmove(h6->tcpsrc, s->laddr); 847 ipmove(h6->tcpdst, s->raddr); 848 mss = DEF_MSS6; 849 break; 850 default: 851 panic("inittcpctl: version %d", s->ipversion); 852 } 853 } 854 855 tcb->mss = tcb->cwind = mss; 856 857 /* default is no window scaling */ 858 tcb->window = QMAX; 859 tcb->rcv.wnd = QMAX; 860 tcb->rcv.scale = 0; 861 tcb->snd.scale = 0; 862 qsetlimit(s->rq, QMAX); 863 } 864 865 /* 866 * called with s QLOCKed 867 */ 868 void 869 tcpstart(Conv *s, int mode) 870 { 871 Tcpctl *tcb; 872 Tcppriv *tpriv; 873 char kpname[KNAMELEN]; 874 875 tpriv = s->p->priv; 876 877 if(tpriv->ackprocstarted == 0){ 878 qlock(&tpriv->apl); 879 if(tpriv->ackprocstarted == 0){ 880 sprint(kpname, "#I%dtcpack", s->p->f->dev); 881 kproc(kpname, tcpackproc, s->p); 882 tpriv->ackprocstarted = 1; 883 } 884 qunlock(&tpriv->apl); 885 } 886 887 tcb = (Tcpctl*)s->ptcl; 888 889 inittcpctl(s, mode); 890 891 iphtadd(&tpriv->ht, s); 892 switch(mode) { 893 case TCP_LISTEN: 894 tpriv->stats[PassiveOpens]++; 895 tcb->flags |= CLONE; 896 tcpsetstate(s, Listen); 897 break; 898 899 case TCP_CONNECT: 900 tpriv->stats[ActiveOpens]++; 901 tcb->flags |= ACTIVE; 902 tcpsndsyn(s, tcb); 903 tcpsetstate(s, Syn_sent); 904 tcpoutput(s); 905 break; 906 } 907 } 908 909 static char* 910 tcpflag(ushort flag) 911 { 912 static char buf[128]; 913 914 sprint(buf, "%d", flag>>10); /* Head len */ 915 if(flag & URG) 916 strcat(buf, " URG"); 917 if(flag & ACK) 918 strcat(buf, " ACK"); 919 if(flag & PSH) 920 strcat(buf, " PSH"); 921 if(flag & RST) 922 strcat(buf, " RST"); 923 if(flag & SYN) 924 strcat(buf, " SYN"); 925 if(flag & FIN) 926 strcat(buf, " FIN"); 927 928 return buf; 929 } 930 931 Block * 932 htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb) 933 { 934 int dlen; 935 Tcp6hdr *h; 936 ushort csum; 937 ushort 
hdrlen, optpad = 0; 938 uchar *opt; 939 940 hdrlen = TCP6_HDRSIZE; 941 if(tcph->flags & SYN){ 942 if(tcph->mss) 943 hdrlen += MSS_LENGTH; 944 if(tcph->ws) 945 hdrlen += WS_LENGTH; 946 optpad = hdrlen & 3; 947 if(optpad) 948 optpad = 4 - optpad; 949 hdrlen += optpad; 950 } 951 952 if(data) { 953 dlen = blocklen(data); 954 data = padblock(data, hdrlen + TCP6_PKT); 955 if(data == nil) 956 return nil; 957 } 958 else { 959 dlen = 0; 960 data = allocb(hdrlen + TCP6_PKT + 64); /* the 64 pad is to meet mintu's */ 961 if(data == nil) 962 return nil; 963 data->wp += hdrlen + TCP6_PKT; 964 } 965 966 /* copy in pseudo ip header plus port numbers */ 967 h = (Tcp6hdr *)(data->rp); 968 memmove(h, ph, TCP6_TCBPHDRSZ); 969 970 /* compose pseudo tcp header, do cksum calculation */ 971 hnputl(h->vcf, hdrlen + dlen); 972 h->ploadlen[0] = h->ploadlen[1] = h->proto = 0; 973 h->ttl = ph->proto; 974 975 /* copy in variable bits */ 976 hnputl(h->tcpseq, tcph->seq); 977 hnputl(h->tcpack, tcph->ack); 978 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 979 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? 
tcb->snd.scale : 0)); 980 hnputs(h->tcpurg, tcph->urg); 981 982 if(tcph->flags & SYN){ 983 opt = h->tcpopt; 984 if(tcph->mss != 0){ 985 *opt++ = MSSOPT; 986 *opt++ = MSS_LENGTH; 987 hnputs(opt, tcph->mss); 988 opt += 2; 989 } 990 if(tcph->ws != 0){ 991 *opt++ = WSOPT; 992 *opt++ = WS_LENGTH; 993 *opt++ = tcph->ws; 994 } 995 while(optpad-- > 0) 996 *opt++ = NOOPOPT; 997 } 998 999 if(tcb != nil && tcb->nochecksum){ 1000 h->tcpcksum[0] = h->tcpcksum[1] = 0; 1001 } else { 1002 csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE); 1003 hnputs(h->tcpcksum, csum); 1004 } 1005 1006 /* move from pseudo header back to normal ip header */ 1007 memset(h->vcf, 0, 4); 1008 h->vcf[0] = IP_VER6; 1009 hnputs(h->ploadlen, hdrlen+dlen); 1010 h->proto = ph->proto; 1011 1012 return data; 1013 } 1014 1015 Block * 1016 htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb) 1017 { 1018 int dlen; 1019 Tcp4hdr *h; 1020 ushort csum; 1021 ushort hdrlen, optpad = 0; 1022 uchar *opt; 1023 1024 hdrlen = TCP4_HDRSIZE; 1025 if(tcph->flags & SYN){ 1026 if(tcph->mss) 1027 hdrlen += MSS_LENGTH; 1028 if(tcph->ws) 1029 hdrlen += WS_LENGTH; 1030 optpad = hdrlen & 3; 1031 if(optpad) 1032 optpad = 4 - optpad; 1033 hdrlen += optpad; 1034 } 1035 1036 if(data) { 1037 dlen = blocklen(data); 1038 data = padblock(data, hdrlen + TCP4_PKT); 1039 if(data == nil) 1040 return nil; 1041 } 1042 else { 1043 dlen = 0; 1044 data = allocb(hdrlen + TCP4_PKT + 64); /* the 64 pad is to meet mintu's */ 1045 if(data == nil) 1046 return nil; 1047 data->wp += hdrlen + TCP4_PKT; 1048 } 1049 1050 /* copy in pseudo ip header plus port numbers */ 1051 h = (Tcp4hdr *)(data->rp); 1052 memmove(h, ph, TCP4_TCBPHDRSZ); 1053 1054 /* copy in variable bits */ 1055 hnputs(h->tcplen, hdrlen + dlen); 1056 hnputl(h->tcpseq, tcph->seq); 1057 hnputl(h->tcpack, tcph->ack); 1058 hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags); 1059 hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? 
tcb->snd.scale : 0)); 1060 hnputs(h->tcpurg, tcph->urg); 1061 1062 if(tcph->flags & SYN){ 1063 opt = h->tcpopt; 1064 if(tcph->mss != 0){ 1065 *opt++ = MSSOPT; 1066 *opt++ = MSS_LENGTH; 1067 hnputs(opt, tcph->mss); 1068 opt += 2; 1069 } 1070 if(tcph->ws != 0){ 1071 *opt++ = WSOPT; 1072 *opt++ = WS_LENGTH; 1073 *opt++ = tcph->ws; 1074 } 1075 while(optpad-- > 0) 1076 *opt++ = NOOPOPT; 1077 } 1078 1079 if(tcb != nil && tcb->nochecksum){ 1080 h->tcpcksum[0] = h->tcpcksum[1] = 0; 1081 } else { 1082 csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE); 1083 hnputs(h->tcpcksum, csum); 1084 } 1085 1086 return data; 1087 } 1088 1089 int 1090 ntohtcp6(Tcp *tcph, Block **bpp) 1091 { 1092 Tcp6hdr *h; 1093 uchar *optr; 1094 ushort hdrlen; 1095 ushort optlen; 1096 int n; 1097 1098 *bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE); 1099 if(*bpp == nil) 1100 return -1; 1101 1102 h = (Tcp6hdr *)((*bpp)->rp); 1103 tcph->source = nhgets(h->tcpsport); 1104 tcph->dest = nhgets(h->tcpdport); 1105 tcph->seq = nhgetl(h->tcpseq); 1106 tcph->ack = nhgetl(h->tcpack); 1107 hdrlen = (h->tcpflag[0]>>2) & ~3; 1108 if(hdrlen < TCP6_HDRSIZE) { 1109 freeblist(*bpp); 1110 return -1; 1111 } 1112 1113 tcph->flags = h->tcpflag[1]; 1114 tcph->wnd = nhgets(h->tcpwin); 1115 tcph->urg = nhgets(h->tcpurg); 1116 tcph->mss = 0; 1117 tcph->ws = 0; 1118 tcph->len = nhgets(h->ploadlen) - hdrlen; 1119 1120 *bpp = pullupblock(*bpp, hdrlen+TCP6_PKT); 1121 if(*bpp == nil) 1122 return -1; 1123 1124 optr = h->tcpopt; 1125 n = hdrlen - TCP6_HDRSIZE; 1126 while(n > 0 && *optr != EOLOPT) { 1127 if(*optr == NOOPOPT) { 1128 n--; 1129 optr++; 1130 continue; 1131 } 1132 optlen = optr[1]; 1133 if(optlen < 2 || optlen > n) 1134 break; 1135 switch(*optr) { 1136 case MSSOPT: 1137 if(optlen == MSS_LENGTH) 1138 tcph->mss = nhgets(optr+2); 1139 break; 1140 case WSOPT: 1141 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1142 tcph->ws = HaveWS | *(optr+2); 1143 break; 1144 } 1145 n -= optlen; 1146 optr += optlen; 1147 } 1148 return 
hdrlen; 1149 } 1150 1151 int 1152 ntohtcp4(Tcp *tcph, Block **bpp) 1153 { 1154 Tcp4hdr *h; 1155 uchar *optr; 1156 ushort hdrlen; 1157 ushort optlen; 1158 int n; 1159 1160 *bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE); 1161 if(*bpp == nil) 1162 return -1; 1163 1164 h = (Tcp4hdr *)((*bpp)->rp); 1165 tcph->source = nhgets(h->tcpsport); 1166 tcph->dest = nhgets(h->tcpdport); 1167 tcph->seq = nhgetl(h->tcpseq); 1168 tcph->ack = nhgetl(h->tcpack); 1169 1170 hdrlen = (h->tcpflag[0]>>2) & ~3; 1171 if(hdrlen < TCP4_HDRSIZE) { 1172 freeblist(*bpp); 1173 return -1; 1174 } 1175 1176 tcph->flags = h->tcpflag[1]; 1177 tcph->wnd = nhgets(h->tcpwin); 1178 tcph->urg = nhgets(h->tcpurg); 1179 tcph->mss = 0; 1180 tcph->ws = 0; 1181 tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT); 1182 1183 *bpp = pullupblock(*bpp, hdrlen+TCP4_PKT); 1184 if(*bpp == nil) 1185 return -1; 1186 1187 optr = h->tcpopt; 1188 n = hdrlen - TCP4_HDRSIZE; 1189 while(n > 0 && *optr != EOLOPT) { 1190 if(*optr == NOOPOPT) { 1191 n--; 1192 optr++; 1193 continue; 1194 } 1195 optlen = optr[1]; 1196 if(optlen < 2 || optlen > n) 1197 break; 1198 switch(*optr) { 1199 case MSSOPT: 1200 if(optlen == MSS_LENGTH) 1201 tcph->mss = nhgets(optr+2); 1202 break; 1203 case WSOPT: 1204 if(optlen == WS_LENGTH && *(optr+2) <= 14) 1205 tcph->ws = HaveWS | *(optr+2); 1206 break; 1207 } 1208 n -= optlen; 1209 optr += optlen; 1210 } 1211 return hdrlen; 1212 } 1213 1214 /* 1215 * For outgiing calls, generate an initial sequence 1216 * number and put a SYN on the send queue 1217 */ 1218 void 1219 tcpsndsyn(Conv *s, Tcpctl *tcb) 1220 { 1221 tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1222 tcb->rttseq = tcb->iss; 1223 tcb->snd.wl2 = tcb->iss; 1224 tcb->snd.una = tcb->iss; 1225 tcb->snd.ptr = tcb->rttseq; 1226 tcb->snd.nxt = tcb->rttseq; 1227 tcb->flgcnt++; 1228 tcb->flags |= FORCE; 1229 tcb->sndsyntime = NOW; 1230 1231 /* set desired mss and scale */ 1232 tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale); 1233 } 1234 1235 
void 1236 sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason) 1237 { 1238 Block *hbp; 1239 uchar rflags; 1240 Tcppriv *tpriv; 1241 Tcp4hdr ph4; 1242 Tcp6hdr ph6; 1243 1244 netlog(tcp->f, Logtcp, "sndrst: %s\n", reason); 1245 1246 tpriv = tcp->priv; 1247 1248 if(seg->flags & RST) 1249 return; 1250 1251 /* make pseudo header */ 1252 switch(version) { 1253 case V4: 1254 memset(&ph4, 0, sizeof(ph4)); 1255 ph4.vihl = IP_VER4; 1256 v6tov4(ph4.tcpsrc, dest); 1257 v6tov4(ph4.tcpdst, source); 1258 ph4.proto = IP_TCPPROTO; 1259 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1260 hnputs(ph4.tcpsport, seg->dest); 1261 hnputs(ph4.tcpdport, seg->source); 1262 break; 1263 case V6: 1264 memset(&ph6, 0, sizeof(ph6)); 1265 ph6.vcf[0] = IP_VER6; 1266 ipmove(ph6.tcpsrc, dest); 1267 ipmove(ph6.tcpdst, source); 1268 ph6.proto = IP_TCPPROTO; 1269 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1270 hnputs(ph6.tcpsport, seg->dest); 1271 hnputs(ph6.tcpdport, seg->source); 1272 break; 1273 default: 1274 panic("sndrst: version %d", version); 1275 } 1276 1277 tpriv->stats[OutRsts]++; 1278 rflags = RST; 1279 1280 /* convince the other end that this reset is in band */ 1281 if(seg->flags & ACK) { 1282 seg->seq = seg->ack; 1283 seg->ack = 0; 1284 } 1285 else { 1286 rflags |= ACK; 1287 seg->ack = seg->seq; 1288 seg->seq = 0; 1289 if(seg->flags & SYN) 1290 seg->ack++; 1291 seg->ack += length; 1292 if(seg->flags & FIN) 1293 seg->ack++; 1294 } 1295 seg->flags = rflags; 1296 seg->wnd = 0; 1297 seg->urg = 0; 1298 seg->mss = 0; 1299 seg->ws = 0; 1300 switch(version) { 1301 case V4: 1302 hbp = htontcp4(seg, nil, &ph4, nil); 1303 if(hbp == nil) 1304 return; 1305 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1306 break; 1307 case V6: 1308 hbp = htontcp6(seg, nil, &ph6, nil); 1309 if(hbp == nil) 1310 return; 1311 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1312 break; 1313 default: 1314 panic("sndrst2: version %d", version); 1315 } 1316 } 1317 1318 /* 1319 * send a reset to the 
remote side and close the conversation 1320 * called with s QLOCKed 1321 */ 1322 char* 1323 tcphangup(Conv *s) 1324 { 1325 Tcp seg; 1326 Tcpctl *tcb; 1327 Block *hbp; 1328 1329 tcb = (Tcpctl*)s->ptcl; 1330 if(waserror()) 1331 return commonerror(); 1332 if(ipcmp(s->raddr, IPnoaddr) != 0) { 1333 if(!waserror()){ 1334 seg.flags = RST | ACK; 1335 seg.ack = tcb->rcv.nxt; 1336 tcb->rcv.una = 0; 1337 seg.seq = tcb->snd.ptr; 1338 seg.wnd = 0; 1339 seg.urg = 0; 1340 seg.mss = 0; 1341 seg.ws = 0; 1342 switch(s->ipversion) { 1343 case V4: 1344 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 1345 hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb); 1346 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 1347 break; 1348 case V6: 1349 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 1350 hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb); 1351 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 1352 break; 1353 default: 1354 panic("tcphangup: version %d", s->ipversion); 1355 } 1356 poperror(); 1357 } 1358 } 1359 localclose(s, nil); 1360 poperror(); 1361 return nil; 1362 } 1363 1364 /* 1365 * (re)send a SYN ACK 1366 */ 1367 int 1368 sndsynack(Proto *tcp, Limbo *lp) 1369 { 1370 Block *hbp; 1371 Tcp4hdr ph4; 1372 Tcp6hdr ph6; 1373 Tcp seg; 1374 int scale; 1375 1376 /* make pseudo header */ 1377 switch(lp->version) { 1378 case V4: 1379 memset(&ph4, 0, sizeof(ph4)); 1380 ph4.vihl = IP_VER4; 1381 v6tov4(ph4.tcpsrc, lp->laddr); 1382 v6tov4(ph4.tcpdst, lp->raddr); 1383 ph4.proto = IP_TCPPROTO; 1384 hnputs(ph4.tcplen, TCP4_HDRSIZE); 1385 hnputs(ph4.tcpsport, lp->lport); 1386 hnputs(ph4.tcpdport, lp->rport); 1387 break; 1388 case V6: 1389 memset(&ph6, 0, sizeof(ph6)); 1390 ph6.vcf[0] = IP_VER6; 1391 ipmove(ph6.tcpsrc, lp->laddr); 1392 ipmove(ph6.tcpdst, lp->raddr); 1393 ph6.proto = IP_TCPPROTO; 1394 hnputs(ph6.ploadlen, TCP6_HDRSIZE); 1395 hnputs(ph6.tcpsport, lp->lport); 1396 hnputs(ph6.tcpdport, lp->rport); 1397 break; 1398 default: 1399 panic("sndrst: version %d", lp->version); 1400 } 1401 1402 seg.seq = 
lp->iss; 1403 seg.ack = lp->irs+1; 1404 seg.flags = SYN|ACK; 1405 seg.urg = 0; 1406 seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale); 1407 seg.wnd = QMAX; 1408 1409 /* if the other side set scale, we should too */ 1410 if(lp->rcvscale){ 1411 seg.ws = scale; 1412 lp->sndscale = scale; 1413 } else { 1414 seg.ws = 0; 1415 lp->sndscale = 0; 1416 } 1417 1418 switch(lp->version) { 1419 case V4: 1420 hbp = htontcp4(&seg, nil, &ph4, nil); 1421 if(hbp == nil) 1422 return -1; 1423 ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1424 break; 1425 case V6: 1426 hbp = htontcp6(&seg, nil, &ph6, nil); 1427 if(hbp == nil) 1428 return -1; 1429 ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil); 1430 break; 1431 default: 1432 panic("sndsnack: version %d", lp->version); 1433 } 1434 lp->lastsend = NOW; 1435 return 0; 1436 } 1437 1438 #define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + p )&LHTMASK ) 1439 1440 /* 1441 * put a call into limbo and respond with a SYN ACK 1442 * 1443 * called with proto locked 1444 */ 1445 static void 1446 limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version) 1447 { 1448 Limbo *lp, **l; 1449 Tcppriv *tpriv; 1450 int h; 1451 1452 tpriv = s->p->priv; 1453 h = hashipa(source, seg->source); 1454 1455 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1456 lp = *l; 1457 if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version) 1458 continue; 1459 if(ipcmp(lp->raddr, source) != 0) 1460 continue; 1461 if(ipcmp(lp->laddr, dest) != 0) 1462 continue; 1463 1464 /* each new SYN restarts the retransmits */ 1465 lp->irs = seg->seq; 1466 break; 1467 } 1468 lp = *l; 1469 if(lp == nil){ 1470 if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){ 1471 lp = tpriv->lht[h]; 1472 tpriv->lht[h] = lp->next; 1473 lp->next = nil; 1474 } else { 1475 lp = malloc(sizeof(*lp)); 1476 if(lp == nil) 1477 return; 1478 tpriv->nlimbo++; 1479 } 1480 *l = lp; 1481 lp->version = version; 1482 ipmove(lp->laddr, dest); 1483 ipmove(lp->raddr, source); 1484 
lp->lport = seg->dest; 1485 lp->rport = seg->source; 1486 lp->mss = seg->mss; 1487 lp->rcvscale = seg->ws; 1488 lp->irs = seg->seq; 1489 lp->iss = (nrand(1<<16)<<16)|nrand(1<<16); 1490 } 1491 1492 if(sndsynack(s->p, lp) < 0){ 1493 *l = lp->next; 1494 tpriv->nlimbo--; 1495 free(lp); 1496 } 1497 } 1498 1499 /* 1500 * resend SYN ACK's once every SYNACK_RXTIMER ms. 1501 */ 1502 static void 1503 limborexmit(Proto *tcp) 1504 { 1505 Tcppriv *tpriv; 1506 Limbo **l, *lp; 1507 int h; 1508 int seen; 1509 ulong now; 1510 1511 tpriv = tcp->priv; 1512 1513 if(!CANQLOCK(tcp)) 1514 return; 1515 seen = 0; 1516 now = NOW; 1517 for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){ 1518 for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){ 1519 lp = *l; 1520 seen++; 1521 if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER) 1522 continue; 1523 1524 /* time it out after 1 second */ 1525 if(++(lp->rexmits) > 5){ 1526 tpriv->nlimbo--; 1527 *l = lp->next; 1528 free(lp); 1529 continue; 1530 } 1531 1532 /* if we're being attacked, don't bother resending SYN ACK's */ 1533 if(tpriv->nlimbo > 100) 1534 continue; 1535 1536 if(sndsynack(tcp, lp) < 0){ 1537 tpriv->nlimbo--; 1538 *l = lp->next; 1539 free(lp); 1540 continue; 1541 } 1542 1543 l = &lp->next; 1544 } 1545 } 1546 QUNLOCK(tcp); 1547 } 1548 1549 /* 1550 * lookup call in limbo. if found, throw it out. 
1551 * 1552 * called with proto locked 1553 */ 1554 static void 1555 limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1556 { 1557 Limbo *lp, **l; 1558 int h; 1559 Tcppriv *tpriv; 1560 1561 tpriv = s->p->priv; 1562 1563 /* find a call in limbo */ 1564 h = hashipa(src, segp->source); 1565 for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){ 1566 lp = *l; 1567 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1568 continue; 1569 if(ipcmp(lp->laddr, dst) != 0) 1570 continue; 1571 if(ipcmp(lp->raddr, src) != 0) 1572 continue; 1573 1574 /* RST can only follow the SYN */ 1575 if(segp->seq == lp->irs+1){ 1576 tpriv->nlimbo--; 1577 *l = lp->next; 1578 free(lp); 1579 } 1580 break; 1581 } 1582 } 1583 1584 /* 1585 * come here when we finally get an ACK to our SYN-ACK. 1586 * lookup call in limbo. if found, create a new conversation 1587 * 1588 * called with proto locked 1589 */ 1590 static Conv* 1591 tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version) 1592 { 1593 Conv *new; 1594 Tcpctl *tcb; 1595 Tcppriv *tpriv; 1596 Tcp4hdr *h4; 1597 Tcp6hdr *h6; 1598 Limbo *lp, **l; 1599 int h; 1600 1601 /* unless it's just an ack, it can't be someone coming out of limbo */ 1602 if((segp->flags & SYN) || (segp->flags & ACK) == 0) 1603 return nil; 1604 1605 tpriv = s->p->priv; 1606 1607 /* find a call in limbo */ 1608 h = hashipa(src, segp->source); 1609 for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){ 1610 netlog(s->p->f, Logtcp, "tcpincoming s %I,%ux/%I,%ux d %I,%ux/%I,%ux v %d/%d\n", 1611 src, segp->source, lp->raddr, lp->rport, 1612 dst, segp->dest, lp->laddr, lp->lport, 1613 version, lp->version 1614 ); 1615 1616 if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version) 1617 continue; 1618 if(ipcmp(lp->laddr, dst) != 0) 1619 continue; 1620 if(ipcmp(lp->raddr, src) != 0) 1621 continue; 1622 1623 /* we're assuming no data with the initial SYN */ 1624 if(segp->seq != lp->irs+1 || segp->ack 
!= lp->iss+1){ 1625 netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n", 1626 segp->seq, lp->irs+1, segp->ack, lp->iss+1); 1627 lp = nil; 1628 } else { 1629 tpriv->nlimbo--; 1630 *l = lp->next; 1631 } 1632 break; 1633 } 1634 if(lp == nil) 1635 return nil; 1636 1637 new = Fsnewcall(s, src, segp->source, dst, segp->dest, version); 1638 if(new == nil) 1639 return nil; 1640 1641 memmove(new->ptcl, s->ptcl, sizeof(Tcpctl)); 1642 tcb = (Tcpctl*)new->ptcl; 1643 tcb->flags &= ~CLONE; 1644 tcb->timer.arg = new; 1645 tcb->timer.state = TcptimerOFF; 1646 tcb->acktimer.arg = new; 1647 tcb->acktimer.state = TcptimerOFF; 1648 tcb->katimer.arg = new; 1649 tcb->katimer.state = TcptimerOFF; 1650 tcb->rtt_timer.arg = new; 1651 tcb->rtt_timer.state = TcptimerOFF; 1652 1653 tcb->irs = lp->irs; 1654 tcb->rcv.nxt = tcb->irs+1; 1655 tcb->rcv.urg = tcb->rcv.nxt; 1656 1657 tcb->iss = lp->iss; 1658 tcb->rttseq = tcb->iss; 1659 tcb->snd.wl2 = tcb->iss; 1660 tcb->snd.una = tcb->iss+1; 1661 tcb->snd.ptr = tcb->iss+1; 1662 tcb->snd.nxt = tcb->iss+1; 1663 tcb->flgcnt = 0; 1664 tcb->flags |= SYNACK; 1665 1666 /* our sending max segment size cannot be bigger than what he asked for */ 1667 if(lp->mss != 0 && lp->mss < tcb->mss) 1668 tcb->mss = lp->mss; 1669 1670 /* window scaling */ 1671 tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale); 1672 1673 /* the congestion window always starts out as a single segment */ 1674 tcb->snd.wnd = segp->wnd; 1675 tcb->cwind = tcb->mss; 1676 1677 /* set initial round trip time */ 1678 tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER; 1679 tcpsynackrtt(new); 1680 1681 free(lp); 1682 1683 /* set up proto header */ 1684 switch(version){ 1685 case V4: 1686 h4 = &tcb->protohdr.tcp4hdr; 1687 memset(h4, 0, sizeof(*h4)); 1688 h4->proto = IP_TCPPROTO; 1689 hnputs(h4->tcpsport, new->lport); 1690 hnputs(h4->tcpdport, new->rport); 1691 v6tov4(h4->tcpsrc, dst); 1692 v6tov4(h4->tcpdst, src); 1693 break; 1694 case V6: 1695 h6 = &tcb->protohdr.tcp6hdr; 1696 
memset(h6, 0, sizeof(*h6)); 1697 h6->proto = IP_TCPPROTO; 1698 hnputs(h6->tcpsport, new->lport); 1699 hnputs(h6->tcpdport, new->rport); 1700 ipmove(h6->tcpsrc, dst); 1701 ipmove(h6->tcpdst, src); 1702 break; 1703 default: 1704 panic("tcpincoming: version %d", new->ipversion); 1705 } 1706 1707 tcpsetstate(new, Established); 1708 1709 iphtadd(&tpriv->ht, new); 1710 1711 return new; 1712 } 1713 1714 int 1715 seq_within(ulong x, ulong low, ulong high) 1716 { 1717 if(low <= high){ 1718 if(low <= x && x <= high) 1719 return 1; 1720 } 1721 else { 1722 if(x >= low || x <= high) 1723 return 1; 1724 } 1725 return 0; 1726 } 1727 1728 int 1729 seq_lt(ulong x, ulong y) 1730 { 1731 return (int)(x-y) < 0; 1732 } 1733 1734 int 1735 seq_le(ulong x, ulong y) 1736 { 1737 return (int)(x-y) <= 0; 1738 } 1739 1740 int 1741 seq_gt(ulong x, ulong y) 1742 { 1743 return (int)(x-y) > 0; 1744 } 1745 1746 int 1747 seq_ge(ulong x, ulong y) 1748 { 1749 return (int)(x-y) >= 0; 1750 } 1751 1752 /* 1753 * use the time between the first SYN and it's ack as the 1754 * initial round trip time 1755 */ 1756 void 1757 tcpsynackrtt(Conv *s) 1758 { 1759 Tcpctl *tcb; 1760 int delta; 1761 Tcppriv *tpriv; 1762 1763 tcb = (Tcpctl*)s->ptcl; 1764 tpriv = s->p->priv; 1765 1766 delta = NOW - tcb->sndsyntime; 1767 tcb->srtt = delta<<LOGAGAIN; 1768 tcb->mdev = delta<<LOGDGAIN; 1769 1770 /* halt round trip timer */ 1771 tcphalt(tpriv, &tcb->rtt_timer); 1772 } 1773 1774 void 1775 update(Conv *s, Tcp *seg) 1776 { 1777 int rtt, delta; 1778 Tcpctl *tcb; 1779 ulong acked; 1780 ulong expand; 1781 Tcppriv *tpriv; 1782 1783 tpriv = s->p->priv; 1784 tcb = (Tcpctl*)s->ptcl; 1785 1786 /* if everything has been acked, force output(?) 
*/ 1787 if(seq_gt(seg->ack, tcb->snd.nxt)) { 1788 tcb->flags |= FORCE; 1789 return; 1790 } 1791 1792 /* added by Dong Lin for fast retransmission */ 1793 if(seg->ack == tcb->snd.una 1794 && tcb->snd.una != tcb->snd.nxt 1795 && seg->len == 0 1796 && seg->wnd == tcb->snd.wnd) { 1797 1798 /* this is a pure ack w/o window update */ 1799 netlog(s->p->f, Logtcprxmt, "dupack %lud ack %lud sndwnd %d advwin %d\n", 1800 tcb->snd.dupacks, seg->ack, tcb->snd.wnd, seg->wnd); 1801 1802 if(++tcb->snd.dupacks == TCPREXMTTHRESH) { 1803 /* 1804 * tahoe tcp rxt the packet, half sshthresh, 1805 * and set cwnd to one packet 1806 */ 1807 tcb->snd.recovery = 1; 1808 tcb->snd.rxt = tcb->snd.nxt; 1809 netlog(s->p->f, Logtcprxmt, "fast rxt %lud, nxt %lud\n", tcb->snd.una, tcb->snd.nxt); 1810 tcprxmit(s); 1811 } else { 1812 /* do reno tcp here. */ 1813 } 1814 } 1815 1816 /* 1817 * update window 1818 */ 1819 if(seq_gt(seg->ack, tcb->snd.wl2) 1820 || (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){ 1821 tcb->snd.wnd = seg->wnd; 1822 tcb->snd.wl2 = seg->ack; 1823 } 1824 1825 if(!seq_gt(seg->ack, tcb->snd.una)){ 1826 /* 1827 * don't let us hangup if sending into a closed window and 1828 * we're still getting acks 1829 */ 1830 if((tcb->flags&RETRAN) && tcb->snd.wnd == 0){ 1831 tcb->backedoff = MAXBACKMS/4; 1832 } 1833 return; 1834 } 1835 1836 /* 1837 * any positive ack turns off fast rxt, 1838 * (should we do new-reno on partial acks?) 
1839 */ 1840 if(!tcb->snd.recovery || seq_ge(seg->ack, tcb->snd.rxt)) { 1841 tcb->snd.dupacks = 0; 1842 tcb->snd.recovery = 0; 1843 } else 1844 netlog(s->p->f, Logtcp, "rxt next %lud, cwin %ud\n", seg->ack, tcb->cwind); 1845 1846 /* Compute the new send window size */ 1847 acked = seg->ack - tcb->snd.una; 1848 1849 /* avoid slow start and timers for SYN acks */ 1850 if((tcb->flags & SYNACK) == 0) { 1851 tcb->flags |= SYNACK; 1852 acked--; 1853 tcb->flgcnt--; 1854 goto done; 1855 } 1856 1857 /* slow start as long as we're not recovering from lost packets */ 1858 if(tcb->cwind < tcb->snd.wnd && !tcb->snd.recovery) { 1859 if(tcb->cwind < tcb->ssthresh) { 1860 expand = tcb->mss; 1861 if(acked < expand) 1862 expand = acked; 1863 } 1864 else 1865 expand = ((int)tcb->mss * tcb->mss) / tcb->cwind; 1866 1867 if(tcb->cwind + expand < tcb->cwind) 1868 expand = tcb->snd.wnd - tcb->cwind; 1869 if(tcb->cwind + expand > tcb->snd.wnd) 1870 expand = tcb->snd.wnd - tcb->cwind; 1871 tcb->cwind += expand; 1872 } 1873 1874 /* Adjust the timers according to the round trip time */ 1875 if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) { 1876 tcphalt(tpriv, &tcb->rtt_timer); 1877 if((tcb->flags&RETRAN) == 0) { 1878 tcb->backoff = 0; 1879 tcb->backedoff = 0; 1880 rtt = tcb->rtt_timer.start - tcb->rtt_timer.count; 1881 if(rtt == 0) 1882 rtt = 1; /* otherwise all close systems will rexmit in 0 time */ 1883 rtt *= MSPTICK; 1884 if(tcb->srtt == 0) { 1885 tcb->srtt = rtt << LOGAGAIN; 1886 tcb->mdev = rtt << LOGDGAIN; 1887 } else { 1888 delta = rtt - (tcb->srtt>>LOGAGAIN); 1889 tcb->srtt += delta; 1890 if(tcb->srtt <= 0) 1891 tcb->srtt = 1; 1892 1893 delta = abs(delta) - (tcb->mdev>>LOGDGAIN); 1894 tcb->mdev += delta; 1895 if(tcb->mdev <= 0) 1896 tcb->mdev = 1; 1897 } 1898 tcpsettimer(tcb); 1899 } 1900 } 1901 1902 done: 1903 if(qdiscard(s->wq, acked) < acked) 1904 tcb->flgcnt--; 1905 1906 tcb->snd.una = seg->ack; 1907 if(seq_gt(seg->ack, tcb->snd.urg)) 1908 tcb->snd.urg = 
seg->ack; 1909 1910 if(tcb->snd.una != tcb->snd.nxt) 1911 tcpgo(tpriv, &tcb->timer); 1912 else 1913 tcphalt(tpriv, &tcb->timer); 1914 1915 if(seq_lt(tcb->snd.ptr, tcb->snd.una)) 1916 tcb->snd.ptr = tcb->snd.una; 1917 1918 tcb->flags &= ~RETRAN; 1919 tcb->backoff = 0; 1920 tcb->backedoff = 0; 1921 } 1922 1923 void 1924 tcpiput(Proto *tcp, Ipifc* _, Block *bp) 1925 { 1926 Tcp seg; 1927 Tcp4hdr *h4; 1928 Tcp6hdr *h6; 1929 int hdrlen; 1930 Tcpctl *tcb; 1931 ushort length, csum; 1932 uchar source[IPaddrlen], dest[IPaddrlen]; 1933 Conv *s; 1934 Fs *f; 1935 Tcppriv *tpriv; 1936 uchar version; 1937 1938 f = tcp->f; 1939 tpriv = tcp->priv; 1940 1941 tpriv->stats[InSegs]++; 1942 1943 h4 = (Tcp4hdr*)(bp->rp); 1944 h6 = (Tcp6hdr*)(bp->rp); 1945 1946 if((h4->vihl&0xF0)==IP_VER4) { 1947 version = V4; 1948 length = nhgets(h4->length); 1949 v4tov6(dest, h4->tcpdst); 1950 v4tov6(source, h4->tcpsrc); 1951 1952 h4->Unused = 0; 1953 hnputs(h4->tcplen, length-TCP4_PKT); 1954 if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) && 1955 ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) { 1956 tpriv->stats[CsumErrs]++; 1957 tpriv->stats[InErrs]++; 1958 netlog(f, Logtcp, "bad tcp proto cksum\n"); 1959 freeblist(bp); 1960 return; 1961 } 1962 1963 hdrlen = ntohtcp4(&seg, &bp); 1964 if(hdrlen < 0){ 1965 tpriv->stats[HlenErrs]++; 1966 tpriv->stats[InErrs]++; 1967 netlog(f, Logtcp, "bad tcp hdr len\n"); 1968 return; 1969 } 1970 1971 /* trim the packet to the size claimed by the datagram */ 1972 length -= hdrlen+TCP4_PKT; 1973 bp = trimblock(bp, hdrlen+TCP4_PKT, length); 1974 if(bp == nil){ 1975 tpriv->stats[LenErrs]++; 1976 tpriv->stats[InErrs]++; 1977 netlog(f, Logtcp, "tcp len < 0 after trim\n"); 1978 return; 1979 } 1980 } 1981 else { 1982 int ttl = h6->ttl; 1983 int proto = h6->proto; 1984 1985 version = V6; 1986 length = nhgets(h6->ploadlen); 1987 ipmove(dest, h6->tcpdst); 1988 ipmove(source, h6->tcpsrc); 1989 1990 h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0; 1991 h6->ttl = 
proto; 1992 hnputl(h6->vcf, length); 1993 if((h6->tcpcksum[0] || h6->tcpcksum[1]) && 1994 (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) { 1995 tpriv->stats[CsumErrs]++; 1996 tpriv->stats[InErrs]++; 1997 netlog(f, Logtcp, 1998 "bad tcpv6 proto cksum: got %#ux, computed %#ux\n", 1999 h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum); 2000 freeblist(bp); 2001 return; 2002 } 2003 h6->ttl = ttl; 2004 h6->proto = proto; 2005 hnputs(h6->ploadlen, length); 2006 2007 hdrlen = ntohtcp6(&seg, &bp); 2008 if(hdrlen < 0){ 2009 tpriv->stats[HlenErrs]++; 2010 tpriv->stats[InErrs]++; 2011 netlog(f, Logtcp, "bad tcpv6 hdr len\n"); 2012 return; 2013 } 2014 2015 /* trim the packet to the size claimed by the datagram */ 2016 length -= hdrlen; 2017 bp = trimblock(bp, hdrlen+TCP6_PKT, length); 2018 if(bp == nil){ 2019 tpriv->stats[LenErrs]++; 2020 tpriv->stats[InErrs]++; 2021 netlog(f, Logtcp, "tcpv6 len < 0 after trim\n"); 2022 return; 2023 } 2024 } 2025 2026 /* lock protocol while searching for a conversation */ 2027 QLOCK(tcp); 2028 2029 /* Look for a matching conversation */ 2030 s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest); 2031 if(s == nil){ 2032 netlog(f, Logtcp, "iphtlook failed\n"); 2033 reset: 2034 QUNLOCK(tcp); 2035 sndrst(tcp, source, dest, length, &seg, version, "no conversation"); 2036 freeblist(bp); 2037 return; 2038 } 2039 2040 /* if it's a listener, look for the right flags and get a new conv */ 2041 tcb = (Tcpctl*)s->ptcl; 2042 if(tcb->state == Listen){ 2043 if(seg.flags & RST){ 2044 limborst(s, &seg, source, dest, version); 2045 QUNLOCK(tcp); 2046 freeblist(bp); 2047 return; 2048 } 2049 2050 /* if this is a new SYN, put the call into limbo */ 2051 if((seg.flags & SYN) && (seg.flags & ACK) == 0){ 2052 limbo(s, source, dest, &seg, version); 2053 QUNLOCK(tcp); 2054 freeblist(bp); 2055 return; 2056 } 2057 2058 /* 2059 * if there's a matching call in limbo, tcpincoming will 2060 * return it in state Syn_received 2061 */ 2062 s = tcpincoming(s, 
&seg, source, dest, version); 2063 if(s == nil) 2064 goto reset; 2065 } 2066 2067 /* The rest of the input state machine is run with the control block 2068 * locked and implements the state machine directly out of the RFC. 2069 * Out-of-band data is ignored - it was always a bad idea. 2070 */ 2071 tcb = (Tcpctl*)s->ptcl; 2072 if(waserror()){ 2073 QUNLOCK(s); 2074 nexterror(); 2075 } 2076 QLOCK(s); 2077 QUNLOCK(tcp); 2078 2079 /* fix up window */ 2080 seg.wnd <<= tcb->rcv.scale; 2081 2082 /* every input packet in puts off the keep alive time out */ 2083 tcpsetkacounter(tcb); 2084 2085 switch(tcb->state) { 2086 case Closed: 2087 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2088 goto raise; 2089 case Syn_sent: 2090 if(seg.flags & ACK) { 2091 if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) { 2092 sndrst(tcp, source, dest, length, &seg, version, 2093 "bad seq in Syn_sent"); 2094 goto raise; 2095 } 2096 } 2097 if(seg.flags & RST) { 2098 if(seg.flags & ACK) 2099 localclose(s, Econrefused); 2100 goto raise; 2101 } 2102 2103 if(seg.flags & SYN) { 2104 procsyn(s, &seg); 2105 if(seg.flags & ACK){ 2106 update(s, &seg); 2107 tcpsynackrtt(s); 2108 tcpsetstate(s, Established); 2109 tcpsetscale(s, tcb, seg.ws, tcb->scale); 2110 } 2111 else { 2112 tcb->time = NOW; 2113 tcpsetstate(s, Syn_received); /* DLP - shouldn't this be a reset? */ 2114 } 2115 2116 if(length != 0 || (seg.flags & FIN)) 2117 break; 2118 2119 freeblist(bp); 2120 goto output; 2121 } 2122 else 2123 freeblist(bp); 2124 2125 QUNLOCK(s); 2126 poperror(); 2127 return; 2128 case Syn_received: 2129 /* doesn't matter if it's the correct ack, we're just trying to set timing */ 2130 if(seg.flags & ACK) 2131 tcpsynackrtt(s); 2132 break; 2133 } 2134 2135 /* 2136 * One DOS attack is to open connections to us and then forget about them, 2137 * thereby tying up a conv at no long term cost to the attacker. 2138 * This is an attempt to defeat these stateless DOS attacks. 
See 2139 * corresponding code in tcpsendka(). 2140 */ 2141 if(tcb->state != Syn_received && (seg.flags & RST) == 0){ 2142 if(tcpporthogdefense 2143 && seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){ 2144 print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n", 2145 source, seg.source, dest, seg.dest, seg.flags, 2146 tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29)); 2147 localclose(s, "stateless hog"); 2148 } 2149 } 2150 2151 /* Cut the data to fit the receive window */ 2152 if(tcptrim(tcb, &seg, &bp, &length) == -1) { 2153 netlog(f, Logtcp, "tcp len < 0, %lud %d\n", seg.seq, length); 2154 update(s, &seg); 2155 if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) { 2156 tcphalt(tpriv, &tcb->rtt_timer); 2157 tcphalt(tpriv, &tcb->acktimer); 2158 tcphalt(tpriv, &tcb->katimer); 2159 tcpsetstate(s, Time_wait); 2160 tcb->timer.start = MSL2*(1000 / MSPTICK); 2161 tcpgo(tpriv, &tcb->timer); 2162 } 2163 if(!(seg.flags & RST)) { 2164 tcb->flags |= FORCE; 2165 goto output; 2166 } 2167 QUNLOCK(s); 2168 poperror(); 2169 return; 2170 } 2171 2172 /* Cannot accept so answer with a rst */ 2173 if(length && tcb->state == Closed) { 2174 sndrst(tcp, source, dest, length, &seg, version, "sending to Closed"); 2175 goto raise; 2176 } 2177 2178 /* The segment is beyond the current receive pointer so 2179 * queue the data in the resequence queue 2180 */ 2181 if(seg.seq != tcb->rcv.nxt) 2182 if(length != 0 || (seg.flags & (SYN|FIN))) { 2183 update(s, &seg); 2184 if(addreseq(tcb, tpriv, &seg, bp, length) < 0) 2185 print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport, s->laddr, s->lport); 2186 tcb->flags |= FORCE; 2187 goto output; 2188 } 2189 2190 /* 2191 * keep looping till we've processed this packet plus any 2192 * adjacent packets in the resequence queue 2193 */ 2194 for(;;) { 2195 if(seg.flags & RST) { 2196 if(tcb->state == Established) { 2197 tpriv->stats[EstabResets]++; 2198 if(tcb->rcv.nxt != seg.seq) 2199 print("out of order RST rcvd: %I.%d -> %I.%d, 
rcv.nxt %lux seq %lux\n", s->raddr, s->rport, s->laddr, s->lport, tcb->rcv.nxt, seg.seq); 2200 } 2201 localclose(s, Econrefused); 2202 goto raise; 2203 } 2204 2205 if((seg.flags&ACK) == 0) 2206 goto raise; 2207 2208 switch(tcb->state) { 2209 case Syn_received: 2210 if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){ 2211 sndrst(tcp, source, dest, length, &seg, version, 2212 "bad seq in Syn_received"); 2213 goto raise; 2214 } 2215 update(s, &seg); 2216 tcpsetstate(s, Established); 2217 case Established: 2218 case Close_wait: 2219 update(s, &seg); 2220 break; 2221 case Finwait1: 2222 update(s, &seg); 2223 if(qlen(s->wq)+tcb->flgcnt == 0){ 2224 tcphalt(tpriv, &tcb->rtt_timer); 2225 tcphalt(tpriv, &tcb->acktimer); 2226 tcpsetkacounter(tcb); 2227 tcb->time = NOW; 2228 tcpsetstate(s, Finwait2); 2229 tcb->katimer.start = MSL2 * (1000 / MSPTICK); 2230 tcpgo(tpriv, &tcb->katimer); 2231 } 2232 break; 2233 case Finwait2: 2234 update(s, &seg); 2235 break; 2236 case Closing: 2237 update(s, &seg); 2238 if(qlen(s->wq)+tcb->flgcnt == 0) { 2239 tcphalt(tpriv, &tcb->rtt_timer); 2240 tcphalt(tpriv, &tcb->acktimer); 2241 tcphalt(tpriv, &tcb->katimer); 2242 tcpsetstate(s, Time_wait); 2243 tcb->timer.start = MSL2*(1000 / MSPTICK); 2244 tcpgo(tpriv, &tcb->timer); 2245 } 2246 break; 2247 case Last_ack: 2248 update(s, &seg); 2249 if(qlen(s->wq)+tcb->flgcnt == 0) { 2250 localclose(s, nil); 2251 goto raise; 2252 } 2253 case Time_wait: 2254 tcb->flags |= FORCE; 2255 if(tcb->timer.state != TcptimerON) 2256 tcpgo(tpriv, &tcb->timer); 2257 } 2258 2259 if((seg.flags&URG) && seg.urg) { 2260 if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) { 2261 tcb->rcv.urg = seg.urg + seg.seq; 2262 pullblock(&bp, seg.urg); 2263 } 2264 } 2265 else 2266 if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg)) 2267 tcb->rcv.urg = tcb->rcv.nxt; 2268 2269 if(length == 0) { 2270 if(bp != nil) 2271 freeblist(bp); 2272 } 2273 else { 2274 switch(tcb->state){ 2275 default: 2276 /* Ignore segment text */ 2277 if(bp != nil) 2278 
freeblist(bp); 2279 break; 2280 2281 case Syn_received: 2282 case Established: 2283 case Finwait1: 2284 /* If we still have some data place on 2285 * receive queue 2286 */ 2287 if(bp) { 2288 bp = packblock(bp); 2289 if(bp == nil) 2290 panic("tcp packblock"); 2291 qpassnolim(s->rq, bp); 2292 bp = nil; 2293 2294 /* 2295 * Force an ack every 2 data messages. This is 2296 * a hack for rob to make his home system run 2297 * faster. 2298 * 2299 * this also keeps the standard TCP congestion 2300 * control working since it needs an ack every 2301 * 2 max segs worth. This is not quite that, 2302 * but under a real stream is equivalent since 2303 * every packet has a max seg in it. 2304 */ 2305 if(++(tcb->rcv.una) >= 2) 2306 tcb->flags |= FORCE; 2307 } 2308 tcb->rcv.nxt += length; 2309 2310 /* 2311 * update our rcv window 2312 */ 2313 tcprcvwin(s); 2314 2315 /* 2316 * turn on the acktimer if there's something 2317 * to ack 2318 */ 2319 if(tcb->acktimer.state != TcptimerON) 2320 tcpgo(tpriv, &tcb->acktimer); 2321 2322 break; 2323 case Finwait2: 2324 /* no process to read the data, send a reset */ 2325 if(bp != nil) 2326 freeblist(bp); 2327 sndrst(tcp, source, dest, length, &seg, version, 2328 "send to Finwait2"); 2329 QUNLOCK(s); 2330 poperror(); 2331 return; 2332 } 2333 } 2334 2335 if(seg.flags & FIN) { 2336 tcb->flags |= FORCE; 2337 2338 switch(tcb->state) { 2339 case Syn_received: 2340 case Established: 2341 tcb->rcv.nxt++; 2342 tcpsetstate(s, Close_wait); 2343 break; 2344 case Finwait1: 2345 tcb->rcv.nxt++; 2346 if(qlen(s->wq)+tcb->flgcnt == 0) { 2347 tcphalt(tpriv, &tcb->rtt_timer); 2348 tcphalt(tpriv, &tcb->acktimer); 2349 tcphalt(tpriv, &tcb->katimer); 2350 tcpsetstate(s, Time_wait); 2351 tcb->timer.start = MSL2*(1000/MSPTICK); 2352 tcpgo(tpriv, &tcb->timer); 2353 } 2354 else 2355 tcpsetstate(s, Closing); 2356 break; 2357 case Finwait2: 2358 tcb->rcv.nxt++; 2359 tcphalt(tpriv, &tcb->rtt_timer); 2360 tcphalt(tpriv, &tcb->acktimer); 2361 tcphalt(tpriv, &tcb->katimer); 
2362 tcpsetstate(s, Time_wait); 2363 tcb->timer.start = MSL2 * (1000/MSPTICK); 2364 tcpgo(tpriv, &tcb->timer); 2365 break; 2366 case Close_wait: 2367 case Closing: 2368 case Last_ack: 2369 break; 2370 case Time_wait: 2371 tcpgo(tpriv, &tcb->timer); 2372 break; 2373 } 2374 } 2375 2376 /* 2377 * get next adjacent segment from the resequence queue. 2378 * dump/trim any overlapping segments 2379 */ 2380 for(;;) { 2381 if(tcb->reseq == nil) 2382 goto output; 2383 2384 if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0) 2385 goto output; 2386 2387 getreseq(tcb, &seg, &bp, &length); 2388 2389 if(tcptrim(tcb, &seg, &bp, &length) == 0) 2390 break; 2391 } 2392 } 2393 output: 2394 tcpoutput(s); 2395 QUNLOCK(s); 2396 poperror(); 2397 return; 2398 raise: 2399 QUNLOCK(s); 2400 poperror(); 2401 freeblist(bp); 2402 tcpkick(s); 2403 } 2404 2405 /* 2406 * always enters and exits with the s locked. We drop 2407 * the lock to ipoput the packet so some care has to be 2408 * taken by callers. 2409 */ 2410 void 2411 tcpoutput(Conv *s) 2412 { 2413 Tcp seg; 2414 int msgs; 2415 Tcpctl *tcb; 2416 Block *hbp, *bp; 2417 int sndcnt, n; 2418 ulong ssize, dsize, usable, sent; 2419 Fs *f; 2420 Tcppriv *tpriv; 2421 uchar version; 2422 2423 f = s->p->f; 2424 tpriv = s->p->priv; 2425 version = s->ipversion; 2426 2427 for(msgs = 0; msgs < 100; msgs++) { 2428 tcb = (Tcpctl*)s->ptcl; 2429 2430 switch(tcb->state) { 2431 case Listen: 2432 case Closed: 2433 case Finwait2: 2434 return; 2435 } 2436 2437 /* force an ack when a window has opened up */ 2438 if(tcb->rcv.blocked && tcb->rcv.wnd > 0){ 2439 tcb->rcv.blocked = 0; 2440 tcb->flags |= FORCE; 2441 } 2442 2443 sndcnt = qlen(s->wq)+tcb->flgcnt; 2444 sent = tcb->snd.ptr - tcb->snd.una; 2445 2446 /* Don't send anything else until our SYN has been acked */ 2447 if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0) 2448 break; 2449 2450 /* Compute usable segment based on offered window and limit 2451 * window probes to one 2452 */ 2453 if(tcb->snd.wnd == 
0){ 2454 if(sent != 0) { 2455 if((tcb->flags&FORCE) == 0) 2456 break; 2457 // tcb->snd.ptr = tcb->snd.una; 2458 } 2459 usable = 1; 2460 } 2461 else { 2462 usable = tcb->cwind; 2463 if(tcb->snd.wnd < usable) 2464 usable = tcb->snd.wnd; 2465 usable -= sent; 2466 } 2467 ssize = sndcnt-sent; 2468 if(ssize && usable < 2) 2469 netlog(s->p->f, Logtcp, "throttled snd.wnd %lud cwind %lud\n", 2470 tcb->snd.wnd, tcb->cwind); 2471 if(usable < ssize) 2472 ssize = usable; 2473 if(tcb->mss < ssize) 2474 ssize = tcb->mss; 2475 dsize = ssize; 2476 seg.urg = 0; 2477 2478 if(ssize == 0) 2479 if((tcb->flags&FORCE) == 0) 2480 break; 2481 2482 tcb->flags &= ~FORCE; 2483 tcprcvwin(s); 2484 2485 /* By default we will generate an ack */ 2486 tcphalt(tpriv, &tcb->acktimer); 2487 tcb->rcv.una = 0; 2488 seg.source = s->lport; 2489 seg.dest = s->rport; 2490 seg.flags = ACK; 2491 seg.mss = 0; 2492 seg.ws = 0; 2493 switch(tcb->state){ 2494 case Syn_sent: 2495 seg.flags = 0; 2496 if(tcb->snd.ptr == tcb->iss){ 2497 seg.flags |= SYN; 2498 dsize--; 2499 seg.mss = tcb->mss; 2500 seg.ws = tcb->scale; 2501 } 2502 break; 2503 case Syn_received: 2504 /* 2505 * don't send any data with a SYN/ACK packet 2506 * because Linux rejects the packet in its 2507 * attempt to solve the SYN attack problem 2508 */ 2509 if(tcb->snd.ptr == tcb->iss){ 2510 seg.flags |= SYN; 2511 dsize = 0; 2512 ssize = 1; 2513 seg.mss = tcb->mss; 2514 seg.ws = tcb->scale; 2515 } 2516 break; 2517 } 2518 seg.seq = tcb->snd.ptr; 2519 seg.ack = tcb->rcv.nxt; 2520 seg.wnd = tcb->rcv.wnd; 2521 2522 /* Pull out data to send */ 2523 bp = nil; 2524 if(dsize != 0) { 2525 bp = qcopy(s->wq, dsize, sent); 2526 if(BLEN(bp) != dsize) { 2527 seg.flags |= FIN; 2528 dsize--; 2529 } 2530 } 2531 2532 if(sent+dsize == sndcnt) 2533 seg.flags |= PSH; 2534 2535 /* keep track of balance of resent data */ 2536 if(seq_lt(tcb->snd.ptr, tcb->snd.nxt)) { 2537 n = tcb->snd.nxt - tcb->snd.ptr; 2538 if(ssize < n) 2539 n = ssize; 2540 tcb->resent += n; 2541 netlog(f, 
Logtcp, "rexmit: %I.%d -> %I.%d ptr %lux nxt %lux\n", 2542 s->raddr, s->rport, s->laddr, s->lport, tcb->snd.ptr, tcb->snd.nxt); 2543 tpriv->stats[RetransSegs]++; 2544 } 2545 2546 tcb->snd.ptr += ssize; 2547 2548 /* Pull up the send pointer so we can accept acks 2549 * for this window 2550 */ 2551 if(seq_gt(tcb->snd.ptr,tcb->snd.nxt)) 2552 tcb->snd.nxt = tcb->snd.ptr; 2553 2554 /* Build header, link data and compute cksum */ 2555 switch(version){ 2556 case V4: 2557 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2558 hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb); 2559 if(hbp == nil) { 2560 freeblist(bp); 2561 return; 2562 } 2563 break; 2564 case V6: 2565 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2566 hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb); 2567 if(hbp == nil) { 2568 freeblist(bp); 2569 return; 2570 } 2571 break; 2572 default: 2573 hbp = nil; /* to suppress a warning */ 2574 panic("tcpoutput: version %d", version); 2575 } 2576 2577 /* Start the transmission timers if there is new data and we 2578 * expect acknowledges 2579 */ 2580 if(ssize != 0){ 2581 if(tcb->timer.state != TcptimerON) 2582 tcpgo(tpriv, &tcb->timer); 2583 2584 /* If round trip timer isn't running, start it. 
2585 * measure the longest packet only in case the 2586 * transmission time dominates RTT 2587 */ 2588 if(tcb->rtt_timer.state != TcptimerON) 2589 if(ssize == tcb->mss) { 2590 tcpgo(tpriv, &tcb->rtt_timer); 2591 tcb->rttseq = tcb->snd.ptr; 2592 } 2593 } 2594 2595 tpriv->stats[OutSegs]++; 2596 2597 /* put off the next keep alive */ 2598 tcpgo(tpriv, &tcb->katimer); 2599 2600 switch(version){ 2601 case V4: 2602 if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){ 2603 /* a negative return means no route */ 2604 localclose(s, "no route"); 2605 } 2606 break; 2607 case V6: 2608 if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){ 2609 /* a negative return means no route */ 2610 localclose(s, "no route"); 2611 } 2612 break; 2613 default: 2614 panic("tcpoutput2: version %d", version); 2615 } 2616 if((uint)(msgs%4) == 1){ 2617 QUNLOCK(s); 2618 sched(); 2619 QLOCK(s); 2620 } 2621 } 2622 } 2623 2624 /* 2625 * the BSD convention (hack?) for keep alives. resend last uchar acked. 2626 */ 2627 void 2628 tcpsendka(Conv *s) 2629 { 2630 Tcp seg; 2631 Tcpctl *tcb; 2632 Block *hbp,*dbp; 2633 2634 tcb = (Tcpctl*)s->ptcl; 2635 2636 dbp = nil; 2637 seg.urg = 0; 2638 seg.source = s->lport; 2639 seg.dest = s->rport; 2640 seg.flags = ACK|PSH; 2641 seg.mss = 0; 2642 seg.ws = 0; 2643 if(tcpporthogdefense) 2644 seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20); 2645 else 2646 seg.seq = tcb->snd.una-1; 2647 seg.ack = tcb->rcv.nxt; 2648 tcb->rcv.una = 0; 2649 seg.wnd = tcb->rcv.wnd; 2650 if(tcb->state == Finwait2){ 2651 seg.flags |= FIN; 2652 } else { 2653 dbp = allocb(1); 2654 dbp->wp++; 2655 } 2656 2657 if(isv4(s->raddr)) { 2658 /* Build header, link data and compute cksum */ 2659 tcb->protohdr.tcp4hdr.vihl = IP_VER4; 2660 hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb); 2661 if(hbp == nil) { 2662 freeblist(dbp); 2663 return; 2664 } 2665 ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s); 2666 } 2667 else { 2668 /* Build header, link data and compute cksum */ 2669 tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6; 2670 
hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb); 2671 if(hbp == nil) { 2672 freeblist(dbp); 2673 return; 2674 } 2675 ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s); 2676 } 2677 } 2678 2679 /* 2680 * set connection to time out after 12 minutes 2681 */ 2682 void 2683 tcpsetkacounter(Tcpctl *tcb) 2684 { 2685 tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK); 2686 if(tcb->kacounter < 3) 2687 tcb->kacounter = 3; 2688 } 2689 2690 /* 2691 * if we've timed out, close the connection 2692 * otherwise, send a keepalive and restart the timer 2693 */ 2694 void 2695 tcpkeepalive(void *v) 2696 { 2697 Tcpctl *tcb; 2698 Conv *s; 2699 2700 s = v; 2701 tcb = (Tcpctl*)s->ptcl; 2702 if(waserror()){ 2703 QUNLOCK(s); 2704 nexterror(); 2705 } 2706 QLOCK(s); 2707 if(tcb->state != Closed){ 2708 if(--(tcb->kacounter) <= 0) { 2709 localclose(s, Etimedout); 2710 } else { 2711 tcpsendka(s); 2712 tcpgo(s->p->priv, &tcb->katimer); 2713 } 2714 } 2715 QUNLOCK(s); 2716 poperror(); 2717 } 2718 2719 /* 2720 * start keepalive timer 2721 */ 2722 char* 2723 tcpstartka(Conv *s, char **f, int n) 2724 { 2725 Tcpctl *tcb; 2726 int x; 2727 2728 tcb = (Tcpctl*)s->ptcl; 2729 if(tcb->state != Established) 2730 return "connection must be in Establised state"; 2731 if(n > 1){ 2732 x = atoi(f[1]); 2733 if(x >= MSPTICK) 2734 tcb->katimer.start = x/MSPTICK; 2735 } 2736 tcpsetkacounter(tcb); 2737 tcpgo(s->p->priv, &tcb->katimer); 2738 2739 return nil; 2740 } 2741 2742 /* 2743 * turn checksums on/off 2744 */ 2745 char* 2746 tcpsetchecksum(Conv *s, char **f, int _) 2747 { 2748 Tcpctl *tcb; 2749 2750 tcb = (Tcpctl*)s->ptcl; 2751 tcb->nochecksum = !atoi(f[1]); 2752 2753 return nil; 2754 } 2755 2756 void 2757 tcprxmit(Conv *s) 2758 { 2759 Tcpctl *tcb; 2760 2761 tcb = (Tcpctl*)s->ptcl; 2762 2763 tcb->flags |= RETRAN|FORCE; 2764 tcb->snd.ptr = tcb->snd.una; 2765 2766 /* 2767 * We should be halving the slow start threshhold (down to one 2768 * mss) but leaving it at mss seems to work well enough 2769 */ 
2770 tcb->ssthresh = tcb->mss; 2771 2772 /* 2773 * pull window down to a single packet 2774 */ 2775 tcb->cwind = tcb->mss; 2776 tcpoutput(s); 2777 } 2778 2779 void 2780 tcptimeout(void *arg) 2781 { 2782 Conv *s; 2783 Tcpctl *tcb; 2784 int maxback; 2785 Tcppriv *tpriv; 2786 2787 s = (Conv*)arg; 2788 tpriv = s->p->priv; 2789 tcb = (Tcpctl*)s->ptcl; 2790 2791 if(waserror()){ 2792 QUNLOCK(s); 2793 nexterror(); 2794 } 2795 QLOCK(s); 2796 switch(tcb->state){ 2797 default: 2798 tcb->backoff++; 2799 if(tcb->state == Syn_sent) 2800 maxback = MAXBACKMS/2; 2801 else 2802 maxback = MAXBACKMS; 2803 tcb->backedoff += tcb->timer.start * MSPTICK; 2804 if(tcb->backedoff >= maxback) { 2805 localclose(s, Etimedout); 2806 break; 2807 } 2808 netlog(s->p->f, Logtcprxmt, "timeout rexmit 0x%lux %d/%d\n", tcb->snd.una, tcb->timer.start, NOW); 2809 tcpsettimer(tcb); 2810 tcprxmit(s); 2811 tpriv->stats[RetransTimeouts]++; 2812 tcb->snd.dupacks = 0; 2813 break; 2814 case Time_wait: 2815 localclose(s, nil); 2816 break; 2817 case Closed: 2818 break; 2819 } 2820 QUNLOCK(s); 2821 poperror(); 2822 } 2823 2824 int 2825 inwindow(Tcpctl *tcb, int seq) 2826 { 2827 return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1); 2828 } 2829 2830 /* 2831 * set up state for a received SYN (or SYN ACK) packet 2832 */ 2833 void 2834 procsyn(Conv *s, Tcp *seg) 2835 { 2836 Tcpctl *tcb; 2837 2838 tcb = (Tcpctl*)s->ptcl; 2839 tcb->flags |= FORCE; 2840 2841 tcb->rcv.nxt = seg->seq + 1; 2842 tcb->rcv.urg = tcb->rcv.nxt; 2843 tcb->irs = seg->seq; 2844 2845 /* our sending max segment size cannot be bigger than what he asked for */ 2846 if(seg->mss != 0 && seg->mss < tcb->mss) 2847 tcb->mss = seg->mss; 2848 2849 /* the congestion window always starts out as a single segment */ 2850 tcb->snd.wnd = seg->wnd; 2851 tcb->cwind = tcb->mss; 2852 } 2853 2854 int 2855 addreseq(Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length) 2856 { 2857 Reseq *rp, *rp1; 2858 int i, rqlen, qmax; 2859 2860 rp = 
malloc(sizeof(Reseq)); 2861 if(rp == nil){ 2862 freeblist(bp); /* bp always consumed by add_reseq */ 2863 return 0; 2864 } 2865 2866 rp->seg = *seg; 2867 rp->bp = bp; 2868 rp->length = length; 2869 2870 /* Place on reassembly list sorting by starting seq number */ 2871 rp1 = tcb->reseq; 2872 if(rp1 == nil || seq_lt(seg->seq, rp1->seg.seq)) { 2873 rp->next = rp1; 2874 tcb->reseq = rp; 2875 if(rp->next != nil) 2876 tpriv->stats[OutOfOrder]++; 2877 return 0; 2878 } 2879 2880 rqlen = 0; 2881 for(i = 0;; i++) { 2882 rqlen += rp1->length; 2883 if(rp1->next == nil || seq_lt(seg->seq, rp1->next->seg.seq)) { 2884 rp->next = rp1->next; 2885 rp1->next = rp; 2886 if(rp->next != nil) 2887 tpriv->stats[OutOfOrder]++; 2888 break; 2889 } 2890 rp1 = rp1->next; 2891 } 2892 qmax = QMAX<<tcb->rcv.scale; 2893 if(rqlen > qmax){ 2894 print("resequence queue > window: %d > %d\n", rqlen, qmax); 2895 i = 0; 2896 for(rp1 = tcb->reseq; rp1 != nil; rp1 = rp1->next){ 2897 print("%#lux %#lux %#ux\n", rp1->seg.seq, 2898 rp1->seg.ack, rp1->seg.flags); 2899 if(i++ > 10){ 2900 print("...\n"); 2901 break; 2902 } 2903 } 2904 2905 /* 2906 * delete entire reassembly queue; wait for retransmit. 2907 * - should we be smarter and only delete the tail? 
2908 */ 2909 for(rp = tcb->reseq; rp != nil; rp = rp1){ 2910 rp1 = rp->next; 2911 freeblist(rp->bp); 2912 free(rp); 2913 } 2914 tcb->reseq = nil; 2915 2916 return -1; 2917 } 2918 return 0; 2919 } 2920 2921 void 2922 getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2923 { 2924 Reseq *rp; 2925 2926 rp = tcb->reseq; 2927 if(rp == nil) 2928 return; 2929 2930 tcb->reseq = rp->next; 2931 2932 *seg = rp->seg; 2933 *bp = rp->bp; 2934 *length = rp->length; 2935 2936 free(rp); 2937 } 2938 2939 int 2940 tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length) 2941 { 2942 ushort len; 2943 uchar accept; 2944 int dupcnt, excess; 2945 2946 accept = 0; 2947 len = *length; 2948 if(seg->flags & SYN) 2949 len++; 2950 if(seg->flags & FIN) 2951 len++; 2952 2953 if(tcb->rcv.wnd == 0) { 2954 if(len == 0 && seg->seq == tcb->rcv.nxt) 2955 return 0; 2956 } 2957 else { 2958 /* Some part of the segment should be in the window */ 2959 if(inwindow(tcb,seg->seq)) 2960 accept++; 2961 else 2962 if(len != 0) { 2963 if(inwindow(tcb, seg->seq+len-1) || 2964 seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1)) 2965 accept++; 2966 } 2967 } 2968 if(!accept) { 2969 freeblist(*bp); 2970 return -1; 2971 } 2972 dupcnt = tcb->rcv.nxt - seg->seq; 2973 if(dupcnt > 0){ 2974 tcb->rerecv += dupcnt; 2975 if(seg->flags & SYN){ 2976 seg->flags &= ~SYN; 2977 seg->seq++; 2978 2979 if(seg->urg > 1) 2980 seg->urg--; 2981 else 2982 seg->flags &= ~URG; 2983 dupcnt--; 2984 } 2985 if(dupcnt > 0){ 2986 pullblock(bp, (ushort)dupcnt); 2987 seg->seq += dupcnt; 2988 *length -= dupcnt; 2989 2990 if(seg->urg > dupcnt) 2991 seg->urg -= dupcnt; 2992 else { 2993 seg->flags &= ~URG; 2994 seg->urg = 0; 2995 } 2996 } 2997 } 2998 excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd); 2999 if(excess > 0) { 3000 tcb->rerecv += excess; 3001 *length -= excess; 3002 *bp = trimblock(*bp, 0, *length); 3003 if(*bp == nil) 3004 panic("presotto is a boofhead"); 3005 seg->flags &= ~FIN; 3006 } 3007 return 0; 3008 } 3009 3010 void 
3011 tcpadvise(Proto *tcp, Block *bp, char *msg) 3012 { 3013 Tcp4hdr *h4; 3014 Tcp6hdr *h6; 3015 Tcpctl *tcb; 3016 uchar source[IPaddrlen]; 3017 uchar dest[IPaddrlen]; 3018 ushort psource, pdest; 3019 Conv *s, **p; 3020 3021 h4 = (Tcp4hdr*)(bp->rp); 3022 h6 = (Tcp6hdr*)(bp->rp); 3023 3024 if((h4->vihl&0xF0)==IP_VER4) { 3025 v4tov6(dest, h4->tcpdst); 3026 v4tov6(source, h4->tcpsrc); 3027 psource = nhgets(h4->tcpsport); 3028 pdest = nhgets(h4->tcpdport); 3029 } 3030 else { 3031 ipmove(dest, h6->tcpdst); 3032 ipmove(source, h6->tcpsrc); 3033 psource = nhgets(h6->tcpsport); 3034 pdest = nhgets(h6->tcpdport); 3035 } 3036 3037 /* Look for a connection */ 3038 QLOCK(tcp); 3039 for(p = tcp->conv; *p; p++) { 3040 s = *p; 3041 tcb = (Tcpctl*)s->ptcl; 3042 if(s->rport == pdest) 3043 if(s->lport == psource) 3044 if(tcb->state != Closed) 3045 if(ipcmp(s->raddr, dest) == 0) 3046 if(ipcmp(s->laddr, source) == 0){ 3047 QLOCK(s); 3048 QUNLOCK(tcp); 3049 switch(tcb->state){ 3050 case Syn_sent: 3051 localclose(s, msg); 3052 break; 3053 } 3054 QUNLOCK(s); 3055 freeblist(bp); 3056 return; 3057 } 3058 } 3059 QUNLOCK(tcp); 3060 freeblist(bp); 3061 } 3062 3063 static char* 3064 tcpporthogdefensectl(char *val) 3065 { 3066 if(strcmp(val, "on") == 0) 3067 tcpporthogdefense = 1; 3068 else if(strcmp(val, "off") == 0) 3069 tcpporthogdefense = 0; 3070 else 3071 return "unknown value for tcpporthogdefense"; 3072 return nil; 3073 } 3074 3075 /* called with c QLOCKed */ 3076 char* 3077 tcpctl(Conv* c, char** f, int n) 3078 { 3079 if(n == 1 && strcmp(f[0], "hangup") == 0) 3080 return tcphangup(c); 3081 if(n >= 1 && strcmp(f[0], "keepalive") == 0) 3082 return tcpstartka(c, f, n); 3083 if(n >= 1 && strcmp(f[0], "checksum") == 0) 3084 return tcpsetchecksum(c, f, n); 3085 if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0) 3086 return tcpporthogdefensectl(f[1]); 3087 return "unknown control request"; 3088 } 3089 3090 int 3091 tcpstats(Proto *tcp, char *buf, int len) 3092 { 3093 Tcppriv *priv; 3094 char 
*p, *e; 3095 int i; 3096 3097 priv = tcp->priv; 3098 p = buf; 3099 e = p+len; 3100 for(i = 0; i < Nstats; i++) 3101 p = seprint(p, e, "%s: %lud\n", statnames[i], priv->stats[i]); 3102 return p - buf; 3103 } 3104 3105 /* 3106 * garbage collect any stale conversations: 3107 * - SYN received but no SYN-ACK after 5 seconds (could be the SYN attack) 3108 * - Finwait2 after 5 minutes 3109 * 3110 * this is called whenever we run out of channels. Both checks are 3111 * of questionable validity so we try to use them only when we're 3112 * up against the wall. 3113 */ 3114 int 3115 tcpgc(Proto *tcp) 3116 { 3117 Conv *c, **pp, **ep; 3118 int n; 3119 Tcpctl *tcb; 3120 3121 3122 n = 0; 3123 ep = &tcp->conv[tcp->nc]; 3124 for(pp = tcp->conv; pp < ep; pp++) { 3125 c = *pp; 3126 if(c == nil) 3127 break; 3128 if(!CANQLOCK(c)) 3129 continue; 3130 tcb = (Tcpctl*)c->ptcl; 3131 switch(tcb->state){ 3132 case Syn_received: 3133 if(NOW - tcb->time > 5000){ 3134 localclose(c, "timed out"); 3135 n++; 3136 } 3137 break; 3138 case Finwait2: 3139 if(NOW - tcb->time > 5*60*1000){ 3140 localclose(c, "timed out"); 3141 n++; 3142 } 3143 break; 3144 } 3145 QUNLOCK(c); 3146 } 3147 return n; 3148 } 3149 3150 void 3151 tcpsettimer(Tcpctl *tcb) 3152 { 3153 int x; 3154 3155 /* round trip dependency */ 3156 x = backoff(tcb->backoff) * 3157 (tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK; 3158 3159 /* bounded twixt 1/2 and 64 seconds */ 3160 if(x < 500/MSPTICK) 3161 x = 500/MSPTICK; 3162 else if(x > (64000/MSPTICK)) 3163 x = 64000/MSPTICK; 3164 tcb->timer.start = x; 3165 } 3166 3167 void 3168 tcpinit(Fs *fs) 3169 { 3170 Proto *tcp; 3171 Tcppriv *tpriv; 3172 3173 tcp = smalloc(sizeof(Proto)); 3174 tpriv = tcp->priv = smalloc(sizeof(Tcppriv)); 3175 tcp->name = "tcp"; 3176 tcp->connect = tcpconnect; 3177 tcp->announce = tcpannounce; 3178 tcp->ctl = tcpctl; 3179 tcp->state = tcpstate; 3180 tcp->create = tcpcreate; 3181 tcp->close = tcpclose; 3182 tcp->rcv = tcpiput; 3183 tcp->advise = tcpadvise; 3184 
tcp->stats = tcpstats; 3185 tcp->inuse = tcpinuse; 3186 tcp->gc = tcpgc; 3187 tcp->ipproto = IP_TCPPROTO; 3188 tcp->nc = scalednconv(); 3189 tcp->ptclsize = sizeof(Tcpctl); 3190 tpriv->stats[MaxConn] = tcp->nc; 3191 3192 Fsproto(fs, tcp); 3193 } 3194 3195 void 3196 tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale) 3197 { 3198 if(rcvscale){ 3199 tcb->rcv.scale = rcvscale & 0xff; 3200 tcb->snd.scale = sndscale & 0xff; 3201 tcb->window = QMAX<<tcb->snd.scale; 3202 qsetlimit(s->rq, tcb->window); 3203 } else { 3204 tcb->rcv.scale = 0; 3205 tcb->snd.scale = 0; 3206 tcb->window = QMAX; 3207 qsetlimit(s->rq, tcb->window); 3208 } 3209 }