aboutsummaryrefslogtreecommitdiffstats
path: root/gpxe/src/net
diff options
context:
space:
mode:
author: H. Peter Anvin <hpa@zytor.com> 2008-03-26 16:25:35 -0700
committer: H. Peter Anvin <hpa@zytor.com> 2008-03-26 16:25:35 -0700
commit9eddd22a7b53b1d02fbae0d987df8af122924248 (patch)
tree882f5152880b0b1aa2d7a0619d30065acc69fb16 /gpxe/src/net
parentbbb8f15936b851e6a0ef6f7bb2c95197bff35994 (diff)
downloadsyslinux.git-9eddd22a7b53b1d02fbae0d987df8af122924248.tar.gz
syslinux.git-9eddd22a7b53b1d02fbae0d987df8af122924248.tar.xz
syslinux.git-9eddd22a7b53b1d02fbae0d987df8af122924248.zip
Add gPXE into the source tree; build unified image (tag: syslinux-3.70-pre7)
Diffstat (limited to 'gpxe/src/net')
-rw-r--r--gpxe/src/net/aoe.c375
-rw-r--r--gpxe/src/net/arp.c294
-rw-r--r--gpxe/src/net/dhcpopts.c434
-rw-r--r--gpxe/src/net/dhcppkt.c172
-rw-r--r--gpxe/src/net/ethernet.c116
-rw-r--r--gpxe/src/net/fakedhcp.c205
-rw-r--r--gpxe/src/net/icmpv6.c128
-rw-r--r--gpxe/src/net/infiniband.c437
-rw-r--r--gpxe/src/net/iobpad.c66
-rw-r--r--gpxe/src/net/ipv4.c633
-rw-r--r--gpxe/src/net/ipv6.c380
-rw-r--r--gpxe/src/net/ndp.c180
-rw-r--r--gpxe/src/net/netdev_settings.c90
-rw-r--r--gpxe/src/net/netdevice.c513
-rw-r--r--gpxe/src/net/nullnet.c58
-rw-r--r--gpxe/src/net/rarp.c68
-rw-r--r--gpxe/src/net/retry.c184
-rw-r--r--gpxe/src/net/tcp.c1073
-rw-r--r--gpxe/src/net/tcp/ftp.c467
-rw-r--r--gpxe/src/net/tcp/http.c534
-rw-r--r--gpxe/src/net/tcp/https.c49
-rw-r--r--gpxe/src/net/tcp/iscsi.c1726
-rw-r--r--gpxe/src/net/tcpip.c145
-rw-r--r--gpxe/src/net/tls.c1731
-rw-r--r--gpxe/src/net/udp.c465
-rw-r--r--gpxe/src/net/udp/dhcp.c825
-rw-r--r--gpxe/src/net/udp/dns.c547
-rw-r--r--gpxe/src/net/udp/tftp.c1149
28 files changed, 13044 insertions, 0 deletions
diff --git a/gpxe/src/net/aoe.c b/gpxe/src/net/aoe.c
new file mode 100644
index 00000000..e3f84e5a
--- /dev/null
+++ b/gpxe/src/net/aoe.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+#include <byteswap.h>
+#include <gpxe/list.h>
+#include <gpxe/if_ether.h>
+#include <gpxe/ethernet.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/uaccess.h>
+#include <gpxe/ata.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/process.h>
+#include <gpxe/features.h>
+#include <gpxe/aoe.h>
+
+/** @file
+ *
+ * AoE protocol
+ *
+ */
+
+FEATURE ( FEATURE_PROTOCOL, "AoE", DHCP_EB_FEATURE_AOE, 1 );
+
+struct net_protocol aoe_protocol;
+
+/** List of all AoE sessions */
+static LIST_HEAD ( aoe_sessions );
+
+/**
+ * Free AoE session
+ *
+ * @v refcnt		Reference counter embedded in the AoE session
+ *
+ * Calls netdev_put() on the session's network device and then frees
+ * the session structure itself.
+ */
+static void aoe_free ( struct refcnt *refcnt ) {
+	struct aoe_session *session;
+
+	session = container_of ( refcnt, struct aoe_session, refcnt );
+	netdev_put ( session->netdev );
+	free ( session );
+}
+
+/**
+ * Mark current AoE command complete
+ *
+ * @v aoe		AoE session
+ * @v rc		Return status code
+ *
+ * Copies the accumulated ATA status into the command block of the
+ * command in progress, detaches that command from the session, and
+ * stores @c rc as the session's completion status.
+ */
+static void aoe_done ( struct aoe_session *aoe, int rc ) {
+	struct ata_command *finished = aoe->command;
+
+	/* Hand the accumulated status back to the command's owner */
+	finished->cb.cmd_stat = aoe->status;
+
+	/* No command is in progress any more */
+	aoe->command = NULL;
+
+	/* Publish the completion status */
+	aoe->rc = rc;
+}
+
+/**
+ * Send AoE command
+ *
+ * @v aoe		AoE session
+ * @ret rc		Return status code
+ *
+ * This transmits an AoE command packet for the next portion (at most
+ * AOE_MAX_COUNT sectors) of the current ATA command.  It does not
+ * wait for a response.
+ *
+ * If the session has no network device, the command is completed
+ * immediately with -ENETUNREACH.  Otherwise the retry timer is
+ * started before anything else that can fail, so that a failed
+ * buffer allocation is retried via aoe_timer_expired() instead of
+ * leaving the command pending with no timer running.
+ */
+static int aoe_send_command ( struct aoe_session *aoe ) {
+	struct ata_command *command = aoe->command;
+	struct io_buffer *iobuf;
+	struct aoehdr *aoehdr;
+	struct aoecmd *aoecmd;
+	unsigned int count;
+	unsigned int data_out_len;
+
+	/* Fail immediately if we have no netdev to send on */
+	if ( ! aoe->netdev ) {
+		aoe_done ( aoe, -ENETUNREACH );
+		return -ENETUNREACH;
+	}
+
+	/* Start the retry timer before attempting to allocate the
+	 * I/O buffer, in case the allocation itself fails.
+	 */
+	start_timer ( &aoe->timer );
+
+	/* Calculate count and data_out_len for this subcommand */
+	count = command->cb.count.native;
+	if ( count > AOE_MAX_COUNT )
+		count = AOE_MAX_COUNT;
+	data_out_len = ( command->data_out ? ( count * ATA_SECTOR_SIZE ) : 0 );
+
+	/* Create outgoing I/O buffer */
+	iobuf = alloc_iob ( ETH_HLEN + sizeof ( *aoehdr ) + sizeof ( *aoecmd ) +
+			    data_out_len );
+	if ( ! iobuf )
+		return -ENOMEM;
+	iob_reserve ( iobuf, ETH_HLEN );
+	aoehdr = iob_put ( iobuf, sizeof ( *aoehdr ) );
+	aoecmd = iob_put ( iobuf, sizeof ( *aoecmd ) );
+	memset ( aoehdr, 0, ( sizeof ( *aoehdr ) + sizeof ( *aoecmd ) ) );
+
+	/* Fill AoE header.  A fresh tag is used for every packet so
+	 * that aoe_rx() only accepts the response to the most recent
+	 * transmission.
+	 */
+	aoehdr->ver_flags = AOE_VERSION;
+	aoehdr->major = htons ( aoe->major );
+	aoehdr->minor = aoe->minor;
+	aoehdr->tag = htonl ( ++aoe->tag );
+
+	/* Fill AoE command */
+	linker_assert ( AOE_FL_DEV_HEAD == ATA_DEV_SLAVE, __fix_ata_h__ );
+	aoecmd->aflags = ( ( command->cb.lba48 ? AOE_FL_EXTENDED : 0 ) |
+			   ( command->cb.device & ATA_DEV_SLAVE ) |
+			   ( data_out_len ? AOE_FL_WRITE : 0 ) );
+	aoecmd->err_feat = command->cb.err_feat.bytes.cur;
+	aoecmd->count = count;
+	aoecmd->cmd_stat = command->cb.cmd_stat;
+	aoecmd->lba.u64 = cpu_to_le64 ( command->cb.lba.native );
+	if ( ! command->cb.lba48 )
+		aoecmd->lba.bytes[3] |= ( command->cb.device & ATA_DEV_MASK );
+
+	/* Fill data payload (zero-length for commands with no data_out) */
+	copy_from_user ( iob_put ( iobuf, data_out_len ), command->data_out,
+			 aoe->command_offset, data_out_len );
+
+	/* Send packet */
+	return net_tx ( iobuf, aoe->netdev, &aoe_protocol, aoe->target );
+}
+
+/**
+ * Handle AoE retry timer expiry
+ *
+ * @v timer		AoE retry timer
+ * @v fail		Failure indicator
+ *
+ * When @c fail is set the current command is completed with
+ * -ETIMEDOUT; otherwise the current subcommand is sent again.
+ */
+static void aoe_timer_expired ( struct retry_timer *timer, int fail ) {
+	struct aoe_session *session;
+
+	session = container_of ( timer, struct aoe_session, timer );
+
+	/* Give up once the timer reports failure */
+	if ( fail ) {
+		aoe_done ( session, -ETIMEDOUT );
+		return;
+	}
+
+	/* Otherwise retransmit */
+	aoe_send_command ( session );
+}
+
+/**
+ * Handle AoE response
+ *
+ * @v aoe		AoE session
+ * @v aoehdr		AoE header
+ * @v len		Length of received data, starting at the AoE header
+ * @ret rc		Return status code
+ *
+ * Merges the response into the current ATA command: accumulates the
+ * ATA status, copies any returned data to the command's data_in
+ * buffer, advances the LBA/count/offset, and then either completes
+ * the command or transmits the next subcommand.
+ */
+static int aoe_rx_response ( struct aoe_session *aoe, struct aoehdr *aoehdr,
+			     unsigned int len ) {
+	struct aoecmd *aoecmd = aoehdr->arg.command;
+	struct ata_command *command = aoe->command;
+	unsigned int rx_data_len;
+	unsigned int count;
+	unsigned int data_len;
+
+	/* Sanity check */
+	if ( len < ( sizeof ( *aoehdr ) + sizeof ( *aoecmd ) ) ) {
+		/* Ignore packet; allow timer to trigger retransmit */
+		return -EINVAL;
+	}
+	rx_data_len = ( len - sizeof ( *aoehdr ) - sizeof ( *aoecmd ) );
+
+	/* Ignore responses that arrive when no command is in
+	 * progress.  The session tag is unchanged after the final
+	 * subcommand completes, so a duplicate of the final response
+	 * still matches in aoe_rx(); without this check it would
+	 * dereference the NULL command pointer left by aoe_done().
+	 */
+	if ( ! command )
+		return -ENOENT;
+
+	/* Stop retry timer.  After this point, every code path must
+	 * either terminate the AoE operation via aoe_done(), or
+	 * transmit a new packet.
+	 */
+	stop_timer ( &aoe->timer );
+
+	/* Check for fatal errors */
+	if ( aoehdr->ver_flags & AOE_FL_ERROR ) {
+		aoe_done ( aoe, -EIO );
+		return 0;
+	}
+
+	/* Calculate count and data_len for this subcommand */
+	count = command->cb.count.native;
+	if ( count > AOE_MAX_COUNT )
+		count = AOE_MAX_COUNT;
+	data_len = count * ATA_SECTOR_SIZE;
+
+	/* Merge into overall ATA status */
+	aoe->status |= aoecmd->cmd_stat;
+
+	/* Copy data payload, never more than this subcommand expects */
+	if ( command->data_in ) {
+		if ( rx_data_len > data_len )
+			rx_data_len = data_len;
+		copy_to_user ( command->data_in, aoe->command_offset,
+			       aoecmd->data, rx_data_len );
+	}
+
+	/* Update ATA command and offset */
+	aoe->command_offset += data_len;
+	command->cb.lba.native += count;
+	command->cb.count.native -= count;
+
+	/* Check for operation complete */
+	if ( ! command->cb.count.native ) {
+		aoe_done ( aoe, 0 );
+		return 0;
+	}
+
+	/* Transmit next portion of request */
+	aoe_send_command ( aoe );
+
+	return 0;
+}
+
+/**
+ * Process incoming AoE packets
+ *
+ * @v iobuf		I/O buffer
+ * @v netdev		Network device
+ * @v ll_source		Link-layer source address
+ * @ret rc		Return status code
+ *
+ * Validates the AoE header, ignores anything that is not a response,
+ * and hands the packet to the first session whose major, minor and
+ * tag all match.  The I/O buffer is always freed before returning.
+ */
+static int aoe_rx ( struct io_buffer *iobuf, struct net_device *netdev __unused,
+		    const void *ll_source ) {
+	struct aoehdr *hdr = iobuf->data;
+	unsigned int pkt_len = iob_len ( iobuf );
+	struct aoe_session *session;
+	int rc = 0;
+
+	if ( pkt_len < sizeof ( *hdr ) ) {
+		/* Too short to contain an AoE header */
+		rc = -EINVAL;
+	} else if ( ( hdr->ver_flags & AOE_VERSION_MASK ) != AOE_VERSION ) {
+		/* Unsupported protocol version */
+		rc = -EPROTONOSUPPORT;
+	} else if ( hdr->ver_flags & AOE_FL_RESPONSE ) {
+		/* Demultiplex amongst active AoE sessions; requests
+		 * that we happen to see are silently ignored.
+		 */
+		list_for_each_entry ( session, &aoe_sessions, list ) {
+			if ( ( ntohs ( hdr->major ) == session->major ) &&
+			     ( hdr->minor == session->minor ) &&
+			     ( ntohl ( hdr->tag ) == session->tag ) ) {
+				/* Remember who answered */
+				memcpy ( session->target, ll_source,
+					 sizeof ( session->target ) );
+				rc = aoe_rx_response ( session, hdr, pkt_len );
+				break;
+			}
+		}
+	}
+
+	free_iob ( iobuf );
+	return rc;
+}
+
+/** AoE protocol
+ *
+ * Network-layer protocol descriptor for AoE: packets carrying the
+ * ETH_P_AOE protocol number are handed to aoe_rx().
+ */
+struct net_protocol aoe_protocol __net_protocol = {
+ .name = "AoE",
+ .net_proto = htons ( ETH_P_AOE ),
+ .rx = aoe_rx,
+};
+
+/**
+ * Issue ATA command via an open AoE session
+ *
+ * @v ata		ATA device
+ * @v command		ATA command
+ * @ret rc		Return status code
+ *
+ * Starts transmission of the command and then polls step() until
+ * aoe_done() records a final status for the session.
+ */
+static int aoe_command ( struct ata_device *ata,
+			 struct ata_command *command ) {
+	struct aoe_session *aoe =
+		container_of ( ata->backend, struct aoe_session, refcnt );
+
+	aoe->command = command;
+	aoe->status = 0;
+	aoe->command_offset = 0;
+
+	/* Mark the operation as in progress before transmitting:
+	 * aoe_send_command() may complete the command synchronously
+	 * via aoe_done() (when the session has no network device),
+	 * and that result must not be overwritten afterwards or the
+	 * polling loop below would never terminate.
+	 */
+	aoe->rc = -EINPROGRESS;
+	aoe_send_command ( aoe );
+
+	/* Wait for completion */
+	while ( aoe->rc == -EINPROGRESS )
+		step();
+
+	return aoe->rc;
+}
+
+/**
+ * Issue ATA command via a detached AoE session
+ *
+ * @v ata		ATA device (unused)
+ * @v command		ATA command (unused)
+ * @ret rc		Always -ENODEV
+ *
+ * Installed as the ATA command method by aoe_detach().
+ */
+static int aoe_detached_command ( struct ata_device *ata __unused,
+				  struct ata_command *command __unused ) {
+	const int rc = -ENODEV;
+
+	return rc;
+}
+
+/**
+ * Detach AoE session from an ATA device
+ *
+ * @v ata		ATA device
+ *
+ * Stops the session's retry timer, redirects further ATA commands to
+ * aoe_detached_command(), removes the session from the session list
+ * and drops the ATA device's reference to it.
+ */
+void aoe_detach ( struct ata_device *ata ) {
+	struct aoe_session *session;
+
+	session = container_of ( ata->backend, struct aoe_session, refcnt );
+
+	stop_timer ( &session->timer );
+	ata->command = aoe_detached_command;
+	list_del ( &session->list );
+	ref_put ( ata->backend );
+	ata->backend = NULL;
+}
+
+/**
+ * Parse AoE root path
+ *
+ * @v aoe		AoE session
+ * @v root_path		Root path, of the form "aoe:e<major>.<minor>"
+ * @ret rc		Return status code
+ *
+ * Fills in the session's major and minor numbers.  Returns -EINVAL
+ * (leaving the session untouched) if the path does not have exactly
+ * the expected form, including when either number has no digits.
+ */
+static int aoe_parse_root_path ( struct aoe_session *aoe,
+				 const char *root_path ) {
+	const char *ptr;
+	char *end;
+	unsigned long major;
+	unsigned long minor;
+
+	if ( strncmp ( root_path, "aoe:", 4 ) != 0 )
+		return -EINVAL;
+	ptr = ( root_path + 4 );
+
+	if ( *ptr++ != 'e' )
+		return -EINVAL;
+
+	/* Major number: at least one digit, terminated by '.' */
+	major = strtoul ( ptr, &end, 10 );
+	if ( ( end == ptr ) || ( *end != '.' ) )
+		return -EINVAL;
+	ptr = ( end + 1 );
+
+	/* Minor number: at least one digit, terminated by end of string */
+	minor = strtoul ( ptr, &end, 10 );
+	if ( ( end == ptr ) || ( *end != '\0' ) )
+		return -EINVAL;
+
+	aoe->major = major;
+	aoe->minor = minor;
+
+	return 0;
+}
+
+/**
+ * Attach AoE session to an ATA device
+ *
+ * @v ata		ATA device
+ * @v netdev		Network device to send AoE packets on
+ * @v root_path		Root path identifying the AoE target
+ * @ret rc		Return status code
+ *
+ * Allocates a session aimed at the Ethernet broadcast address, parses
+ * the target's major/minor numbers from the root path, and on success
+ * installs the session as the ATA device's backend and adds it to the
+ * list of active sessions.
+ */
+int aoe_attach ( struct ata_device *ata, struct net_device *netdev,
+		 const char *root_path ) {
+	struct aoe_session *session;
+	int rc;
+
+	/* Allocate and initialise structure */
+	session = zalloc ( sizeof ( *session ) );
+	if ( session == NULL )
+		return -ENOMEM;
+	session->refcnt.free = aoe_free;
+	session->netdev = netdev_get ( netdev );
+	memcpy ( session->target, ethernet_protocol.ll_broadcast,
+		 sizeof ( session->target ) );
+	session->tag = AOE_TAG_MAGIC;
+	session->timer.expired = aoe_timer_expired;
+
+	/* Parse root path; drop the session again on failure */
+	rc = aoe_parse_root_path ( session, root_path );
+	if ( rc != 0 ) {
+		ref_put ( &session->refcnt );
+		return rc;
+	}
+
+	/* Attach parent interface, transfer reference to connection
+	 * list, and return
+	 */
+	ata->backend = ref_get ( &session->refcnt );
+	ata->command = aoe_command;
+	list_add ( &session->list, &aoe_sessions );
+	return 0;
+}
diff --git a/gpxe/src/net/arp.c b/gpxe/src/net/arp.c
new file mode 100644
index 00000000..011d4fef
--- /dev/null
+++ b/gpxe/src/net/arp.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <byteswap.h>
+#include <errno.h>
+#include <gpxe/if_ether.h>
+#include <gpxe/if_arp.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/arp.h>
+
+/** @file
+ *
+ * Address Resolution Protocol
+ *
+ * This file implements the address resolution protocol as defined in
+ * RFC826. The implementation is media-independent and
+ * protocol-independent; it is not limited to Ethernet or to IPv4.
+ *
+ */
+
+/** Registered ARP protocols
+ *
+ * Zero-length start and end markers for the table of ARP-capable
+ * network-layer protocols; arp_find_protocol() iterates from the
+ * start marker up to (but not including) the end marker.
+ */
+static struct arp_net_protocol arp_net_protocols[0]
+ __table_start ( struct arp_net_protocol, arp_net_protocols );
+static struct arp_net_protocol arp_net_protocols_end[0]
+ __table_end ( struct arp_net_protocol, arp_net_protocols );
+
+/** An ARP cache entry
+ *
+ * Maps a network-layer address to a link-layer address for one
+ * particular pair of network-layer and link-layer protocols.
+ */
+struct arp_entry {
+ /** Network-layer protocol */
+ struct net_protocol *net_protocol;
+ /** Link-layer protocol */
+ struct ll_protocol *ll_protocol;
+ /** Network-layer address (first net_addr_len bytes are compared) */
+ uint8_t net_addr[MAX_NET_ADDR_LEN];
+ /** Link-layer address */
+ uint8_t ll_addr[MAX_LL_ADDR_LEN];
+};
+
+/** Number of entries in the ARP cache
+ *
+ * This is a global cache, covering all network interfaces,
+ * network-layer protocols and link-layer protocols.
+ */
+#define NUM_ARP_ENTRIES 4
+
+/** The ARP cache */
+static struct arp_entry arp_table[NUM_ARP_ENTRIES];
+
+/** One-past-the-end pointer for the ARP cache
+ *
+ * Parenthesised so that it expands safely inside any expression.
+ */
+#define arp_table_end ( &arp_table[NUM_ARP_ENTRIES] )
+
+/** Running count of ARP cache entries created
+ *
+ * arp_rx() uses this value modulo NUM_ARP_ENTRIES to choose the
+ * slot for a new entry.
+ */
+static unsigned int next_new_arp_entry = 0;
+
+struct net_protocol arp_protocol;
+
+/**
+ * Find entry in the ARP cache
+ *
+ * @v ll_protocol	Link-layer protocol
+ * @v net_protocol	Network-layer protocol
+ * @v net_addr		Network-layer address
+ * @ret arp		ARP cache entry, or NULL if not found
+ *
+ * An entry matches when both protocols are identical and the first
+ * @c net_protocol->net_addr_len bytes of the stored network-layer
+ * address equal @c net_addr.
+ */
+static struct arp_entry *
+arp_find_entry ( struct ll_protocol *ll_protocol,
+		 struct net_protocol *net_protocol,
+		 const void *net_addr ) {
+	unsigned int i;
+
+	for ( i = 0 ; i < NUM_ARP_ENTRIES ; i++ ) {
+		struct arp_entry *candidate = &arp_table[i];
+
+		if ( candidate->ll_protocol != ll_protocol )
+			continue;
+		if ( candidate->net_protocol != net_protocol )
+			continue;
+		if ( memcmp ( candidate->net_addr, net_addr,
+			      net_protocol->net_addr_len ) != 0 )
+			continue;
+		return candidate;
+	}
+	return NULL;
+}
+
+/**
+ * Look up media-specific link-layer address in the ARP cache
+ *
+ * @v netdev		Network device
+ * @v net_protocol	Network-layer protocol
+ * @v dest_net_addr	Destination network-layer address
+ * @v source_net_addr	Source network-layer address
+ * @ret dest_ll_addr	Destination link layer address
+ * @ret rc		Return status code
+ *
+ * On a cache hit the cached link-layer address is copied to @c
+ * dest_ll_addr and zero is returned.
+ *
+ * On a cache miss an ARP request for @c dest_net_addr is broadcast
+ * on @c netdev and -ENOENT is returned (or the transmit error, if
+ * sending the request failed); the caller is expected to try again
+ * later.
+ */
+int arp_resolve ( struct net_device *netdev, struct net_protocol *net_protocol,
+		  const void *dest_net_addr, const void *source_net_addr,
+		  void *dest_ll_addr ) {
+	struct ll_protocol *ll_protocol = netdev->ll_protocol;
+	const struct arp_entry *cached;
+	struct io_buffer *request;
+	struct arphdr *hdr;
+	void *field;
+	int rc;
+
+	/* Use the cached mapping if there is one */
+	cached = arp_find_entry ( ll_protocol, net_protocol, dest_net_addr );
+	if ( cached ) {
+		DBG ( "ARP cache hit: %s %s => %s %s\n",
+		      net_protocol->name,
+		      net_protocol->ntoa ( cached->net_addr ),
+		      ll_protocol->name,
+		      ll_protocol->ntoa ( cached->ll_addr ) );
+		memcpy ( dest_ll_addr, cached->ll_addr,
+			 ll_protocol->ll_addr_len );
+		return 0;
+	}
+	DBG ( "ARP cache miss: %s %s\n", net_protocol->name,
+	      net_protocol->ntoa ( dest_net_addr ) );
+
+	/* Allocate a buffer large enough for any ARP request */
+	request = alloc_iob ( MAX_LL_HEADER_LEN + sizeof ( *hdr ) +
+			      2 * ( MAX_LL_ADDR_LEN + MAX_NET_ADDR_LEN ) );
+	if ( ! request )
+		return -ENOMEM;
+	iob_reserve ( request, MAX_LL_HEADER_LEN );
+
+	/* Fixed header */
+	hdr = iob_put ( request, sizeof ( *hdr ) );
+	hdr->ar_hrd = ll_protocol->ll_proto;
+	hdr->ar_hln = ll_protocol->ll_addr_len;
+	hdr->ar_pro = net_protocol->net_proto;
+	hdr->ar_pln = net_protocol->net_addr_len;
+	hdr->ar_op = htons ( ARPOP_REQUEST );
+
+	/* Sender hardware address */
+	field = iob_put ( request, ll_protocol->ll_addr_len );
+	memcpy ( field, netdev->ll_addr, ll_protocol->ll_addr_len );
+
+	/* Sender protocol address */
+	field = iob_put ( request, net_protocol->net_addr_len );
+	memcpy ( field, source_net_addr, net_protocol->net_addr_len );
+
+	/* Target hardware address (unknown, so zeroed) */
+	field = iob_put ( request, ll_protocol->ll_addr_len );
+	memset ( field, 0, ll_protocol->ll_addr_len );
+
+	/* Target protocol address */
+	field = iob_put ( request, net_protocol->net_addr_len );
+	memcpy ( field, dest_net_addr, net_protocol->net_addr_len );
+
+	/* Broadcast the request */
+	rc = net_tx ( request, netdev, &arp_protocol,
+		      ll_protocol->ll_broadcast );
+	if ( rc != 0 )
+		return rc;
+
+	return -ENOENT;
+}
+
+/**
+ * Identify ARP protocol
+ *
+ * @v net_proto			Network-layer protocol, in network-endian order
+ * @ret arp_net_protocol	ARP protocol, or NULL
+ *
+ * Scans the table of registered ARP protocols for the first one whose
+ * network-layer protocol number equals @c net_proto.
+ */
+static struct arp_net_protocol * arp_find_protocol ( uint16_t net_proto ) {
+	struct arp_net_protocol *candidate = arp_net_protocols;
+
+	while ( candidate < arp_net_protocols_end ) {
+		if ( candidate->net_protocol->net_proto == net_proto )
+			return candidate;
+		candidate++;
+	}
+	return NULL;
+}
+
+/**
+ * Process incoming ARP packets
+ *
+ * @v iobuf		I/O buffer
+ * @v netdev		Network device
+ * @v ll_source		Link-layer source address
+ * @ret rc		Return status code
+ *
+ * This handles ARP requests and responses as detailed in RFC826.  The
+ * method detailed within the RFC is pretty optimised, handling
+ * requests and responses with basically a single code path and
+ * avoiding the need for extraneous ARP requests; read the RFC for
+ * details.
+ *
+ * Packets too short to contain the fixed header plus both
+ * sender/target address pairs are dropped before any field is read.
+ * The I/O buffer is either reused for the reply or freed.
+ */
+static int arp_rx ( struct io_buffer *iobuf, struct net_device *netdev,
+		    const void *ll_source __unused ) {
+	struct arphdr *arphdr = iobuf->data;
+	struct arp_net_protocol *arp_net_protocol;
+	struct net_protocol *net_protocol;
+	struct ll_protocol *ll_protocol;
+	struct arp_entry *arp;
+	int merge = 0;
+
+	/* Sanity check: packet must contain at least the fixed header */
+	if ( iob_len ( iobuf ) < sizeof ( *arphdr ) )
+		goto done;
+
+	/* Identify network-layer and link-layer protocols */
+	arp_net_protocol = arp_find_protocol ( arphdr->ar_pro );
+	if ( ! arp_net_protocol )
+		goto done;
+	net_protocol = arp_net_protocol->net_protocol;
+	ll_protocol = netdev->ll_protocol;
+
+	/* Sanity checks */
+	if ( ( arphdr->ar_hrd != ll_protocol->ll_proto ) ||
+	     ( arphdr->ar_hln != ll_protocol->ll_addr_len ) ||
+	     ( arphdr->ar_pln != net_protocol->net_addr_len ) )
+		goto done;
+
+	/* Sanity check: packet must contain sender and target
+	 * hardware and protocol addresses after the header.
+	 */
+	if ( iob_len ( iobuf ) <
+	     ( sizeof ( *arphdr ) +
+	       2 * ( arphdr->ar_hln + arphdr->ar_pln ) ) )
+		goto done;
+
+	/* See if we have an entry for this sender, and update it if so */
+	arp = arp_find_entry ( ll_protocol, net_protocol,
+			       arp_sender_pa ( arphdr ) );
+	if ( arp ) {
+		memcpy ( arp->ll_addr, arp_sender_ha ( arphdr ),
+			 arphdr->ar_hln );
+		merge = 1;
+		DBG ( "ARP cache update: %s %s => %s %s\n",
+		      net_protocol->name, net_protocol->ntoa ( arp->net_addr ),
+		      ll_protocol->name, ll_protocol->ntoa ( arp->ll_addr ) );
+	}
+
+	/* See if we own the target protocol address */
+	if ( arp_net_protocol->check ( netdev, arp_target_pa ( arphdr ) ) != 0)
+		goto done;
+
+	/* Create new ARP table entry if necessary, overwriting
+	 * cache slots in round-robin order.
+	 */
+	if ( ! merge ) {
+		arp = &arp_table[next_new_arp_entry++ % NUM_ARP_ENTRIES];
+		arp->ll_protocol = ll_protocol;
+		arp->net_protocol = net_protocol;
+		memcpy ( arp->ll_addr, arp_sender_ha ( arphdr ),
+			 arphdr->ar_hln );
+		memcpy ( arp->net_addr, arp_sender_pa ( arphdr ),
+			 arphdr->ar_pln);
+		DBG ( "ARP cache add: %s %s => %s %s\n",
+		      net_protocol->name, net_protocol->ntoa ( arp->net_addr ),
+		      ll_protocol->name, ll_protocol->ntoa ( arp->ll_addr ) );
+	}
+
+	/* If it's not a request, there's nothing more to do */
+	if ( arphdr->ar_op != htons ( ARPOP_REQUEST ) )
+		goto done;
+
+	/* Change request to a reply: swap the sender and target
+	 * address pairs in place, then fill in our own link-layer
+	 * address as the sender.
+	 */
+	DBG ( "ARP reply: %s %s => %s %s\n", net_protocol->name,
+	      net_protocol->ntoa ( arp_target_pa ( arphdr ) ),
+	      ll_protocol->name, ll_protocol->ntoa ( netdev->ll_addr ) );
+	arphdr->ar_op = htons ( ARPOP_REPLY );
+	memswap ( arp_sender_ha ( arphdr ), arp_target_ha ( arphdr ),
+		  arphdr->ar_hln + arphdr->ar_pln );
+	memcpy ( arp_sender_ha ( arphdr ), netdev->ll_addr, arphdr->ar_hln );
+
+	/* Send reply; the buffer is passed on to net_tx(), so it
+	 * must not be freed below.
+	 */
+	net_tx ( iobuf, netdev, &arp_protocol, arp_target_ha (arphdr ) );
+	iobuf = NULL;
+
+ done:
+	free_iob ( iobuf );
+	return 0;
+}
+
+/**
+ * Transcribe ARP address
+ *
+ * @v net_addr	ARP address (ignored)
+ * @ret string	"<ARP>"
+ *
+ * ARP has no addresses of its own, so a fixed placeholder string is
+ * returned regardless of the argument.
+ */
+static const char * arp_ntoa ( const void *net_addr __unused ) {
+	static const char placeholder[] = "<ARP>";
+
+	return placeholder;
+}
+
+/** ARP protocol
+ *
+ * Network-layer protocol descriptor for ARP: packets carrying the
+ * ETH_P_ARP protocol number are handed to arp_rx().
+ */
+struct net_protocol arp_protocol __net_protocol = {
+ .name = "ARP",
+ .net_proto = htons ( ETH_P_ARP ),
+ .rx = arp_rx,
+ .ntoa = arp_ntoa,
+};
diff --git a/gpxe/src/net/dhcpopts.c b/gpxe/src/net/dhcpopts.c
new file mode 100644
index 00000000..1898011a
--- /dev/null
+++ b/gpxe/src/net/dhcpopts.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (C) 2008 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <gpxe/dhcp.h>
+#include <gpxe/dhcpopts.h>
+
+/** @file
+ *
+ * DHCP options
+ *
+ */
+
+/**
+ * Obtain printable version of a DHCP option tag
+ *
+ * @v tag		DHCP option tag
+ * @ret name		String representation of the tag
+ *
+ * Encapsulated tags are rendered as "<encapsulator>.<encapsulated>",
+ * plain tags as a single decimal number.  The result lives in a
+ * static buffer that is overwritten by the next call.
+ */
+static inline char * dhcp_tag_name ( unsigned int tag ) {
+	static char text[8];
+
+	if ( ! DHCP_IS_ENCAP_OPT ( tag ) ) {
+		snprintf ( text, sizeof ( text ), "%d", tag );
+		return text;
+	}
+
+	snprintf ( text, sizeof ( text ), "%d.%d",
+		   DHCP_ENCAPSULATOR ( tag ),
+		   DHCP_ENCAPSULATED ( tag ) );
+	return text;
+}
+
+/**
+ * Get pointer to DHCP option
+ *
+ * @v options		DHCP options block
+ * @v offset		Offset within options block
+ * @ret option		DHCP option
+ *
+ * Pure pointer arithmetic; the offset is not validated.
+ */
+static inline __attribute__ (( always_inline )) struct dhcp_option *
+dhcp_option ( struct dhcp_options *options, unsigned int offset ) {
+	struct dhcp_option *located;
+
+	located = ( ( struct dhcp_option * ) ( options->data + offset ) );
+	return located;
+}
+
+/**
+ * Get offset of a DHCP option
+ *
+ * @v options		DHCP options block
+ * @v option		DHCP option
+ * @ret offset		Offset within options block
+ *
+ * Inverse of dhcp_option(): the distance in bytes from the start of
+ * the block's data to @c option.
+ */
+static inline __attribute__ (( always_inline )) int
+dhcp_option_offset ( struct dhcp_options *options,
+		     struct dhcp_option *option ) {
+	void *position = option;
+
+	return ( position - options->data );
+}
+
+/**
+ * Calculate length of any DHCP option
+ *
+ * @v option		DHCP option
+ * @ret len		Length (including tag and length field)
+ *
+ * DHCP_END and DHCP_PAD consist of the tag byte alone; every other
+ * option occupies its header plus the number of bytes given by its
+ * length field.
+ */
+static unsigned int dhcp_option_len ( struct dhcp_option *option ) {
+	int tag_only = ( ( option->tag == DHCP_END ) ||
+			 ( option->tag == DHCP_PAD ) );
+
+	if ( ! tag_only )
+		return ( option->len + DHCP_OPTION_HEADER_LEN );
+	return 1;
+}
+
+/**
+ * Find DHCP option within DHCP options block, and its encapsulator (if any)
+ *
+ * @v options		DHCP options block
+ * @v tag		DHCP option tag to search for
+ * @ret encap_offset	Offset of encapsulating DHCP option
+ * @ret offset		Offset of DHCP option, or negative error
+ *
+ * Searches for the DHCP option matching the specified tag within the
+ * DHCP option block.  Encapsulated options may be searched for by
+ * using DHCP_ENCAP_OPT() to construct the tag value.
+ *
+ * If the encapsulating option is found, and @c encap_offset is
+ * non-NULL, it will be filled in with the offset of the encapsulating
+ * option (even if the encapsulated option itself is then not found).
+ *
+ * This routine is designed to be paranoid.  It does not assume that
+ * the option data is well-formatted, and so must guard against flaws
+ * such as options missing a @c DHCP_END terminator, or options whose
+ * length would take them beyond the end of the data block.
+ */
+static int find_dhcp_option_with_encap ( struct dhcp_options *options,
+					 unsigned int tag,
+					 int *encap_offset ) {
+	unsigned int original_tag __attribute__ (( unused )) = tag;
+	struct dhcp_option *option;
+	int offset = 0;
+	ssize_t remaining = options->len;
+	unsigned int option_len;
+
+	/* Sanity check */
+	if ( tag == DHCP_PAD )
+		return -ENOENT;
+
+	/* Search for option */
+	while ( remaining ) {
+		/* Calculate length of this option.  Abort processing
+		 * if the length is malformed (i.e. takes us beyond
+		 * the end of the data block).
+		 */
+		option = dhcp_option ( options, offset );
+		option_len = dhcp_option_len ( option );
+		remaining -= option_len;
+		if ( remaining < 0 )
+			break;
+		/* Check for explicit end marker */
+		if ( option->tag == DHCP_END )
+			break;
+		/* Check for matching tag */
+		if ( option->tag == tag ) {
+			DBGC ( options, "DHCPOPT %p found %s (length %d)\n",
+			       options, dhcp_tag_name ( original_tag ),
+			       option_len );
+			return offset;
+		}
+		/* Check for start of matching encapsulation block */
+		if ( DHCP_IS_ENCAP_OPT ( tag ) &&
+		     ( option->tag == DHCP_ENCAPSULATOR ( tag ) ) ) {
+			if ( encap_offset )
+				*encap_offset = offset;
+			/* Continue search within encapsulated option
+			 * block.  The offset now points at the
+			 * encapsulator's payload, so only the payload
+			 * length (excluding the header) remains to be
+			 * searched; otherwise the search could run
+			 * into whatever follows the encapsulator.
+			 */
+			tag = DHCP_ENCAPSULATED ( tag );
+			remaining = option->len;
+			offset += DHCP_OPTION_HEADER_LEN;
+			continue;
+		}
+		offset += option_len;
+	}
+
+	return -ENOENT;
+}
+
+/**
+ * Resize a DHCP option
+ *
+ * @v options		DHCP option block
+ * @v offset		Offset of option to resize
+ * @v encap_offset	Offset of encapsulating option (or -ve for none)
+ * @v old_len		Old length (including header)
+ * @v new_len		New length (including header)
+ * @v can_realloc	Can reallocate options data if necessary
+ * @ret rc		Return status code
+ *
+ * Opens up or closes the gap occupied by the option at @c offset,
+ * adjusting the block's used length and the encapsulator's length
+ * field (if any) to match.  The option's own tag, length and data
+ * are not written here.  When the option shrinks, the bytes vacated
+ * at the end of the block are zeroed (i.e. become padding).
+ */
+static int resize_dhcp_option ( struct dhcp_options *options,
+				int offset, int encap_offset,
+				size_t old_len, size_t new_len,
+				int can_realloc ) {
+	struct dhcp_option *encapsulator;
+	struct dhcp_option *option;
+	ssize_t delta = ( new_len - old_len );
+	size_t new_options_len;
+	size_t new_encapsulator_len;
+	void *new_data;
+	void *source;
+	void *dest;
+	void *end;
+
+	/* Check for sufficient space, and update length fields */
+	if ( new_len > DHCP_MAX_LEN ) {
+		DBGC ( options, "DHCPOPT %p overlength option\n", options );
+		return -ENOSPC;
+	}
+	new_options_len = ( options->len + delta );
+	if ( new_options_len > options->max_len ) {
+		/* Reallocate options block if allowed to do so. */
+		if ( can_realloc ) {
+			new_data = realloc ( options->data, new_options_len );
+			if ( ! new_data ) {
+				DBGC ( options, "DHCPOPT %p could not "
+				       "reallocate to %zd bytes\n", options,
+				       new_options_len );
+				return -ENOMEM;
+			}
+			options->data = new_data;
+			options->max_len = new_options_len;
+		} else {
+			DBGC ( options, "DHCPOPT %p out of space\n", options );
+			return -ENOMEM;
+		}
+	}
+	if ( encap_offset >= 0 ) {
+		encapsulator = dhcp_option ( options, encap_offset );
+		new_encapsulator_len = ( encapsulator->len + delta );
+		if ( new_encapsulator_len > DHCP_MAX_LEN ) {
+			DBGC ( options, "DHCPOPT %p overlength encapsulator\n",
+			       options );
+			return -ENOSPC;
+		}
+		encapsulator->len = new_encapsulator_len;
+	}
+	options->len = new_options_len;
+
+	/* Move remainder of option data, never touching memory
+	 * outside the options block.
+	 */
+	option = dhcp_option ( options, offset );
+	source = ( ( ( void * ) option ) + old_len );
+	dest = ( ( ( void * ) option ) + new_len );
+	end = ( options->data + options->max_len );
+	if ( dest >= source ) {
+		/* Growing (or unchanged): the tail of the block is
+		 * pushed towards the end; limit by the destination.
+		 */
+		memmove ( dest, source, ( end - dest ) );
+	} else {
+		/* Shrinking: only ( end - source ) bytes exist to be
+		 * moved.  Zero the vacated bytes at the end so that
+		 * they read as padding rather than stale data.
+		 */
+		memmove ( dest, source, ( end - source ) );
+		memset ( ( end - ( source - dest ) ), 0, ( source - dest ) );
+	}
+
+	return 0;
+}
+
+/**
+ * Set value of DHCP option
+ *
+ * @v options DHCP option block
+ * @v tag DHCP option tag
+ * @v data New value for DHCP option
+ * @v len Length of value, in bytes
+ * @v can_realloc Can reallocate options data if necessary
+ * @ret offset Offset of DHCP option, or negative error
+ *
+ * Sets the value of a DHCP option within the options block. The
+ * option may or may not already exist. Encapsulators will be created
+ * (and deleted) as necessary. A @c len of zero removes the option:
+ * it is resized to zero bytes and no new data is written.
+ *
+ * The function calls itself recursively, once to create a missing
+ * encapsulator (as an option holding a single DHCP_END byte) and
+ * once to delete an encapsulator that has been left empty.
+ *
+ * This call may fail due to insufficient space in the options block.
+ * If it does fail, and the option existed previously, the option will
+ * be left with its original value.
+ */
+static int set_dhcp_option ( struct dhcp_options *options, unsigned int tag,
+ const void *data, size_t len,
+ int can_realloc ) {
+ static const uint8_t empty_encapsulator[] = { DHCP_END };
+ int offset;
+ int encap_offset = -1;
+ int creation_offset = 0;
+ struct dhcp_option *option;
+ unsigned int encap_tag = DHCP_ENCAPSULATOR ( tag );
+ size_t old_len = 0;
+ size_t new_len = ( len ? ( len + DHCP_OPTION_HEADER_LEN ) : 0 );
+ int rc;
+
+ /* Sanity check: DHCP_PAD cannot be set as an option */
+ if ( tag == DHCP_PAD )
+ return -ENOTTY;
+
+ /* Find old instance of this option, if any. This also records
+ * the encapsulator's offset in encap_offset when one is found.
+ */
+ offset = find_dhcp_option_with_encap ( options, tag, &encap_offset );
+ if ( offset >= 0 ) {
+ old_len = dhcp_option_len ( dhcp_option ( options, offset ) );
+ DBGC ( options, "DHCPOPT %p resizing %s from %zd to %zd\n",
+ options, dhcp_tag_name ( tag ), old_len, new_len );
+ } else {
+ DBGC ( options, "DHCPOPT %p creating %s (length %zd)\n",
+ options, dhcp_tag_name ( tag ), new_len );
+ }
+
+ /* Ensure that encapsulator exists, if required. New
+ * encapsulated options are created at the very start of the
+ * encapsulator's payload.
+ */
+ if ( encap_tag ) {
+ if ( encap_offset < 0 )
+ encap_offset = set_dhcp_option ( options, encap_tag,
+ empty_encapsulator, 1,
+ can_realloc );
+ if ( encap_offset < 0 )
+ return encap_offset;
+ creation_offset = ( encap_offset + DHCP_OPTION_HEADER_LEN );
+ }
+
+ /* Create new option if necessary */
+ if ( offset < 0 )
+ offset = creation_offset;
+
+ /* Resize option to fit new data */
+ if ( ( rc = resize_dhcp_option ( options, offset, encap_offset,
+ old_len, new_len,
+ can_realloc ) ) != 0 )
+ return rc;
+
+ /* Copy new data into option, if applicable */
+ if ( len ) {
+ option = dhcp_option ( options, offset );
+ option->tag = tag;
+ option->len = len;
+ memcpy ( &option->data, data, len );
+ }
+
+ /* Delete encapsulator if there's nothing else left in it
+ * (i.e. at most the single terminating byte remains).
+ */
+ if ( encap_offset >= 0 ) {
+ option = dhcp_option ( options, encap_offset );
+ if ( option->len <= 1 )
+ set_dhcp_option ( options, encap_tag, NULL, 0, 0 );
+ }
+
+ return offset;
+}
+
+/**
+ * Store value of DHCP option setting
+ *
+ * @v options		DHCP option block
+ * @v tag		Setting tag number
+ * @v data		Setting data, or NULL to clear setting
+ * @v len		Length of setting data
+ * @ret rc		Return status code
+ *
+ * The options block is never reallocated; the call fails if the new
+ * value does not fit in the existing block.
+ */
+int dhcpopt_store ( struct dhcp_options *options, unsigned int tag,
+		    const void *data, size_t len ) {
+	int result = set_dhcp_option ( options, tag, data, len, 0 );
+
+	/* A non-negative result is an offset, which callers do not need */
+	return ( ( result < 0 ) ? result : 0 );
+}
+
+/**
+ * Store value of DHCP option setting, extending options block if necessary
+ *
+ * @v options		DHCP option block
+ * @v tag		Setting tag number
+ * @v data		Setting data, or NULL to clear setting
+ * @v len		Length of setting data
+ * @ret rc		Return status code
+ *
+ * Identical to dhcpopt_store() except that the options block may be
+ * reallocated to make room for the new value.
+ */
+int dhcpopt_extensible_store ( struct dhcp_options *options, unsigned int tag,
+			       const void *data, size_t len ) {
+	int result = set_dhcp_option ( options, tag, data, len, 1 );
+
+	/* A non-negative result is an offset, which callers do not need */
+	return ( ( result < 0 ) ? result : 0 );
+}
+
+/**
+ * Fetch value of DHCP option setting
+ *
+ * @v options		DHCP option block
+ * @v tag		Setting tag number
+ * @v data		Buffer to fill with setting data
+ * @v len		Length of buffer
+ * @ret len		Length of setting data, or negative error
+ *
+ * At most @c len bytes are copied, but the full length of the stored
+ * value is returned, so the caller can detect truncation.
+ */
+int dhcpopt_fetch ( struct dhcp_options *options, unsigned int tag,
+		    void *data, size_t len ) {
+	struct dhcp_option *found;
+	size_t value_len;
+	int offset;
+
+	offset = find_dhcp_option_with_encap ( options, tag, NULL );
+	if ( offset < 0 )
+		return offset;
+
+	found = dhcp_option ( options, offset );
+	value_len = found->len;
+
+	/* Copy no more than the caller's buffer can hold */
+	memcpy ( data, found->data, ( ( len > value_len ) ? value_len : len ) );
+
+	return value_len;
+}
+
+/**
+ * Recalculate length of DHCP options block
+ *
+ * @v options		DHCP option block (data and max_len already set)
+ *
+ * The "used length" field will be updated based on scanning through
+ * the whole block: it ends up as the offset just past the last
+ * option that is not DHCP_PAD and that fits entirely within the
+ * block.
+ */
+static void dhcpopt_update_len ( struct dhcp_options *options ) {
+	struct dhcp_option *option;
+	int offset = 0;
+	ssize_t remaining;
+	unsigned int option_len;
+
+	options->len = 0;
+
+	/* Walk every option in the block */
+	for ( remaining = options->max_len ; remaining ; ) {
+		option = dhcp_option ( options, offset );
+		option_len = dhcp_option_len ( option );
+		remaining -= option_len;
+		/* Stop at an option that overruns the block */
+		if ( remaining < 0 )
+			break;
+		offset += option_len;
+		/* Padding never extends the used length */
+		if ( option->tag == DHCP_PAD )
+			continue;
+		options->len = offset;
+	}
+}
+
+/**
+ * Initialise prepopulated block of DHCP options
+ *
+ * @v options		Uninitialised DHCP option block
+ * @v data		Memory for DHCP option data
+ * @v max_len		Length of memory for DHCP option data
+ *
+ * The memory content must already be filled with valid DHCP options.
+ * A zeroed block counts as a block of valid DHCP options.
+ *
+ * The block's used length is derived by scanning the supplied data.
+ */
+void dhcpopt_init ( struct dhcp_options *options, void *data,
+		    size_t max_len ) {
+
+	/* Record the backing memory */
+	options->max_len = max_len;
+	options->data = data;
+
+	/* Work out how much of it is in use */
+	dhcpopt_update_len ( options );
+
+	DBGC ( options, "DHCPOPT %p created (data %p len %#zx max_len %#zx)\n",
+	       options, options->data, options->len, options->max_len );
+}
diff --git a/gpxe/src/net/dhcppkt.c b/gpxe/src/net/dhcppkt.c
new file mode 100644
index 00000000..1cf99d8d
--- /dev/null
+++ b/gpxe/src/net/dhcppkt.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (C) 2008 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/dhcp.h>
+#include <gpxe/dhcpopts.h>
+#include <gpxe/dhcppkt.h>
+
+/** @file
+ *
+ * DHCP packets
+ *
+ */
+
+/** A dedicated field within a DHCP packet
+ *
+ * Describes one setting that lives at a fixed position inside the
+ * DHCP header instead of in the options block.
+ */
+struct dhcp_packet_field {
+ /** Settings tag number */
+ unsigned int tag;
+ /** Offset within DHCP packet (bytes from the start of the header) */
+ uint16_t offset;
+ /** Length of field (bytes) */
+ uint16_t len;
+};
+
+/** Declare a dedicated field within a DHCP packet
+ *
+ * @v _tag Settings tag number
+ * @v _field Field name
+ *
+ * Expands to an initialiser for a struct dhcp_packet_field.  The
+ * offset and length are taken from the named member of struct
+ * dhcphdr; the null-pointer cast is only an operand of sizeof and is
+ * never dereferenced.
+ */
+#define DHCP_PACKET_FIELD( _tag, _field ) { \
+ .tag = (_tag), \
+ .offset = offsetof ( struct dhcphdr, _field ), \
+ .len = sizeof ( ( ( struct dhcphdr * ) 0 )->_field ), \
+ }
+
+/** Dedicated fields within a DHCP packet
+ *
+ * Settings whose tag appears in this table are read from and written
+ * to the corresponding header field; all other tags go through the
+ * generic options block.
+ */
+static struct dhcp_packet_field dhcp_packet_fields[] = {
+ DHCP_PACKET_FIELD ( DHCP_EB_YIADDR, yiaddr ),
+ DHCP_PACKET_FIELD ( DHCP_EB_SIADDR, siaddr ),
+ DHCP_PACKET_FIELD ( DHCP_TFTP_SERVER_NAME, sname ),
+ DHCP_PACKET_FIELD ( DHCP_BOOTFILE_NAME, file ),
+};
+
+/**
+ * Get address of a DHCP packet field
+ *
+ * @v dhcphdr		DHCP packet header
+ * @v field		DHCP packet field
+ * @ret data		Packet field data
+ *
+ * The result is the header address advanced by the field's offset.
+ */
+static inline void * dhcp_packet_field ( struct dhcphdr *dhcphdr,
+					 struct dhcp_packet_field *field ) {
+	void *base = dhcphdr;
+
+	return ( base + field->offset );
+}
+
+/**
+ * Find DHCP packet field corresponding to settings tag number
+ *
+ * @v tag		Settings tag number
+ * @ret field		DHCP packet field, or NULL
+ *
+ * Performs a linear search of the dedicated-field table.
+ */
+static struct dhcp_packet_field *
+find_dhcp_packet_field ( unsigned int tag ) {
+	struct dhcp_packet_field *entry = dhcp_packet_fields;
+	struct dhcp_packet_field *end =
+		( dhcp_packet_fields + ( sizeof ( dhcp_packet_fields ) /
+					 sizeof ( dhcp_packet_fields[0] ) ) );
+
+	for ( ; entry < end ; entry++ ) {
+		if ( entry->tag == tag )
+			return entry;
+	}
+	return NULL;
+}
+
+/**
+ * Store value of DHCP packet setting
+ *
+ * @v dhcppkt		DHCP packet
+ * @v tag		Setting tag number
+ * @v data		Setting data, or NULL to clear setting
+ * @v len		Length of setting data
+ * @ret rc		Return status code
+ *
+ * Settings that map to a dedicated header field are written directly
+ * into that field; -ENOSPC is returned if the data is longer than the
+ * field.  The field is zeroed before the new value is copied in, so
+ * that a shorter value (or a NULL @c data) does not leave stale bytes
+ * from a previous value behind.  All other settings are stored in the
+ * generic options block.
+ */
+int dhcppkt_store ( struct dhcp_packet *dhcppkt, unsigned int tag,
+		    const void *data, size_t len ) {
+	struct dhcp_packet_field *field;
+	void *field_data;
+	int rc;
+
+	/* If this is a special field, fill it in */
+	if ( ( field = find_dhcp_packet_field ( tag ) ) != NULL ) {
+		/* Clearing a setting stores no data at all */
+		if ( ! data )
+			len = 0;
+		if ( len > field->len )
+			return -ENOSPC;
+		field_data = dhcp_packet_field ( dhcppkt->dhcphdr, field );
+		/* Erase the old value before writing the new one */
+		memset ( field_data, 0, field->len );
+		if ( len )
+			memcpy ( field_data, data, len );
+		return 0;
+	}
+
+	/* Otherwise, use the generic options block */
+	rc = dhcpopt_store ( &dhcppkt->options, tag, data, len );
+
+	/* Update our used-length field; this is done whether or not the
+	 * store succeeded, so that it always mirrors the options block.
+	 */
+	dhcppkt->len = ( offsetof ( struct dhcphdr, options ) +
+			 dhcppkt->options.len );
+
+	return rc;
+}
+
+/**
+ * Fetch value of DHCP packet setting
+ *
+ * @v dhcppkt		DHCP packet
+ * @v tag		Setting tag number
+ * @v data		Buffer to fill with setting data
+ * @v len		Length of buffer
+ * @ret len		Length of setting data, or negative error
+ *
+ * For a dedicated header field the full field length is returned,
+ * even if fewer bytes were copied because the buffer was shorter.
+ */
+int dhcppkt_fetch ( struct dhcp_packet *dhcppkt, unsigned int tag,
+		    void *data, size_t len ) {
+	struct dhcp_packet_field *field = find_dhcp_packet_field ( tag );
+	size_t copy_len;
+
+	/* Anything without a dedicated field lives in the options block */
+	if ( field == NULL )
+		return dhcpopt_fetch ( &dhcppkt->options, tag, data, len );
+
+	/* Dedicated field: copy as much as fits in the caller's buffer */
+	copy_len = ( ( len > field->len ) ? field->len : len );
+	memcpy ( data, dhcp_packet_field ( dhcppkt->dhcphdr, field ),
+		 copy_len );
+	return field->len;
+}
+
+/**
+ * Initialise prepopulated DHCP packet
+ *
+ * @v dhcppkt Uninitialised DHCP packet
+ * @v data Memory for DHCP packet data
+ * @v len Length of memory for DHCP packet data
+ *
+ * The memory content must already be filled with valid DHCP options.
+ * A zeroed block counts as a block of valid DHCP options.
+ */
+void dhcppkt_init ( struct dhcp_packet *dhcppkt, void *data, size_t len ) {
+ /* The caller's buffer is used in place as the DHCP header */
+ dhcppkt->dhcphdr = data;
+ dhcppkt->max_len = len;
+ /* Everything after the fixed header is handed to the options block.
+  * NOTE(review): len is a size_t, so a buffer shorter than the fixed
+  * header would make this subtraction wrap - confirm that callers
+  * never pass one.
+  */
+ dhcpopt_init ( &dhcppkt->options, &dhcppkt->dhcphdr->options,
+ ( len - offsetof ( struct dhcphdr, options ) ) );
+ /* Used length is the fixed header plus the options already present */
+ dhcppkt->len = ( offsetof ( struct dhcphdr, options ) +
+ dhcppkt->options.len );
+}
diff --git a/gpxe/src/net/ethernet.c b/gpxe/src/net/ethernet.c
new file mode 100644
index 00000000..55035de5
--- /dev/null
+++ b/gpxe/src/net/ethernet.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <byteswap.h>
+#include <errno.h>
+#include <assert.h>
+#include <gpxe/if_arp.h>
+#include <gpxe/if_ether.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/ethernet.h>
+
+/** @file
+ *
+ * Ethernet protocol
+ *
+ */
+
+/** Ethernet broadcast MAC address */
+static uint8_t eth_broadcast[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+/**
+ * Transmit Ethernet packet
+ *
+ * @v iobuf		I/O buffer
+ * @v netdev		Network device
+ * @v net_protocol	Network-layer protocol
+ * @v ll_dest		Link-layer destination address
+ * @ret rc		Status returned by netdev_tx()
+ *
+ * Prepends the Ethernet link-layer header and transmits the packet.
+ */
+static int eth_tx ( struct io_buffer *iobuf, struct net_device *netdev,
+		    struct net_protocol *net_protocol, const void *ll_dest ) {
+	struct ethhdr *hdr;
+
+	/* Claim space for the header in front of the payload */
+	hdr = iob_push ( iobuf, sizeof ( *hdr ) );
+
+	/* Destination, source (the device's own address), protocol */
+	memcpy ( hdr->h_dest, ll_dest, ETH_ALEN );
+	memcpy ( hdr->h_source, netdev->ll_addr, ETH_ALEN );
+	hdr->h_protocol = net_protocol->net_proto;
+
+	/* The network device takes it from here */
+	return netdev_tx ( netdev, iobuf );
+}
+
+/**
+ * Process received Ethernet packet
+ *
+ * @v iobuf		I/O buffer
+ * @v netdev		Network device
+ * @ret rc		-EINVAL for a truncated packet, otherwise the
+ *			status returned by net_rx()
+ *
+ * Strips off the Ethernet link-layer header and passes up to the
+ * network-layer protocol.  A packet too short to hold the header is
+ * freed here.
+ */
+static int eth_rx ( struct io_buffer *iobuf, struct net_device *netdev ) {
+	struct ethhdr *hdr = iobuf->data;
+
+	/* Reject anything that cannot contain a complete header */
+	if ( iob_len ( iobuf ) < sizeof ( *hdr ) ) {
+		DBG ( "Ethernet packet too short (%zd bytes)\n",
+		      iob_len ( iobuf ) );
+		free_iob ( iobuf );
+		return -EINVAL;
+	}
+
+	/* Drop the header from the buffer; hdr still points at the
+	 * header bytes, which are read below.
+	 */
+	iob_pull ( iobuf, sizeof ( *hdr ) );
+
+	/* Pass the payload up, tagged with protocol and sender address */
+	return net_rx ( iobuf, netdev, hdr->h_protocol, hdr->h_source );
+}
+
+/**
+ * Transcribe Ethernet address
+ *
+ * @v ll_addr		Link-layer address
+ * @ret string		Link-layer address in human-readable format
+ *
+ * The result is six lower-case hex octets separated by colons, held
+ * in a static buffer that is overwritten by the next call.
+ */
+const char * eth_ntoa ( const void *ll_addr ) {
+	static char text[18]; /* "00:00:00:00:00:00" */
+	const uint8_t *octets = ll_addr;
+	char *out = text;
+	unsigned int i;
+
+	/* Emit one octet at a time, with a colon before all but the first */
+	for ( i = 0 ; i < 6 ; i++ )
+		out += sprintf ( out, ( i ? ":%02x" : "%02x" ), octets[i] );
+
+	return text;
+}
+
+/** Ethernet protocol
+ *
+ * Link-layer protocol descriptor tying the Ethernet transmit, receive
+ * and address-formatting routines in this file together.
+ */
+struct ll_protocol ethernet_protocol __ll_protocol = {
+ .name = "Ethernet",
+ /* Stored in network byte order */
+ .ll_proto = htons ( ARPHRD_ETHER ),
+ .ll_addr_len = ETH_ALEN,
+ .ll_header_len = ETH_HLEN,
+ .ll_broadcast = eth_broadcast,
+ .tx = eth_tx,
+ .rx = eth_rx,
+ .ntoa = eth_ntoa,
+};
diff --git a/gpxe/src/net/fakedhcp.c b/gpxe/src/net/fakedhcp.c
new file mode 100644
index 00000000..c3054db1
--- /dev/null
+++ b/gpxe/src/net/fakedhcp.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (C) 2008 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <gpxe/settings.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/dhcppkt.h>
+#include <gpxe/fakedhcp.h>
+
+/** @file
+ *
+ * Fake DHCP packets
+ *
+ */
+
+/**
+ * Copy settings to DHCP packet
+ *
+ * @v dest		Destination DHCP packet
+ * @v source		Source settings block
+ * @v encapsulator	Encapsulating setting tag number, or zero
+ * @ret rc		Return status code
+ *
+ * Tries every option number from DHCP_MIN_OPTION to DHCP_MAX_OPTION
+ * within the given encapsulator.  Encapsulating options are handled by
+ * recursing into them; any other setting that exists in @c source is
+ * fetched into a temporary buffer and stored into @c dest.  The first
+ * store failure aborts the copy.
+ */
+static int copy_encap_settings ( struct dhcp_packet *dest,
+				 struct settings *source,
+				 unsigned int encapsulator ) {
+	struct setting setting = { .name = "" };
+	unsigned int subtag;
+	unsigned int tag;
+	int len;
+	int check_len;
+	int rc;
+
+	for ( subtag = DHCP_MIN_OPTION; subtag <= DHCP_MAX_OPTION; subtag++ ) {
+		tag = DHCP_ENCAP_OPT ( encapsulator, subtag );
+		switch ( tag ) {
+		case DHCP_EB_ENCAP:
+		case DHCP_VENDOR_ENCAP:
+			/* Process encapsulated settings */
+			if ( ( rc = copy_encap_settings ( dest, source,
+							  tag ) ) != 0 )
+				return rc;
+			break;
+		default:
+			/* Copy setting, if present */
+			setting.tag = tag;
+			len = fetch_setting_len ( source, &setting );
+			if ( len < 0 )
+				break;
+			{
+				/* A variable-length array must have a
+				 * non-zero size, so reserve at least one
+				 * byte for an empty setting; the lengths
+				 * passed on below are still the true
+				 * setting length.
+				 */
+				char buf[ len ? len : 1 ];
+
+				check_len = fetch_setting ( source, &setting,
+							    buf, len );
+				assert ( check_len == len );
+				/* Keep check_len "used" when assertions
+				 * are compiled out.
+				 */
+				( void ) check_len;
+				if ( ( rc = dhcppkt_store ( dest, tag, buf,
+							    len ) ) != 0 )
+					return rc;
+			}
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Copy settings to DHCP packet
+ *
+ * @v dest Destination DHCP packet
+ * @v source Source settings block
+ * @ret rc Return status code
+ *
+ * Convenience wrapper: copies starting from the top level, i.e. with
+ * no encapsulating option.
+ */
+static int copy_settings ( struct dhcp_packet *dest,
+ struct settings *source ) {
+ /* An encapsulator of zero selects the unencapsulated options */
+ return copy_encap_settings ( dest, source, 0 );
+}
+
+/**
+ * Create fake DHCPDISCOVER packet
+ *
+ * @v netdev		Network device
+ * @v data		Buffer for DHCP packet
+ * @v max_len		Size of DHCP packet buffer
+ * @ret rc		Return status code
+ *
+ * Builds a DHCP request for @c netdev directly in the caller's
+ * buffer.  Used by external code.
+ */
+int create_fakedhcpdiscover ( struct net_device *netdev,
+			      void *data, size_t max_len ) {
+	struct dhcp_packet dhcppkt;
+	int rc;
+
+	rc = create_dhcp_request ( &dhcppkt, netdev, NULL, data, max_len );
+	if ( rc == 0 )
+		return 0;
+
+	DBG ( "Could not create DHCPDISCOVER: %s\n",
+	      strerror ( rc ) );
+	return rc;
+}
+
+/**
+ * Create fake DHCPACK packet
+ *
+ * @v netdev		Network device
+ * @v data		Buffer for DHCP packet
+ * @v max_len		Size of DHCP packet buffer
+ * @ret rc		Return status code
+ *
+ * Builds a DHCPACK in the caller's buffer and fills it with the
+ * current settings.  Used by external code.
+ */
+int create_fakedhcpack ( struct net_device *netdev,
+			 void *data, size_t max_len ) {
+	struct dhcp_packet dhcppkt;
+	int rc;
+
+	/* Start from a bare DHCPACK */
+	rc = create_dhcp_packet ( &dhcppkt, netdev, DHCPACK, NULL,
+				  data, max_len );
+	if ( rc != 0 ) {
+		DBG ( "Could not create DHCPACK: %s\n", strerror ( rc ) );
+		return rc;
+	}
+
+	/* Global settings go in first and netdev-specific settings
+	 * second, so that the netdev-specific values win regardless of
+	 * stated priorities.
+	 */
+	rc = copy_settings ( &dhcppkt, NULL );
+	if ( rc != 0 ) {
+		DBG ( "Could not set DHCPACK global settings: %s\n",
+		      strerror ( rc ) );
+		return rc;
+	}
+
+	rc = copy_settings ( &dhcppkt, netdev_settings ( netdev ) );
+	if ( rc != 0 ) {
+		DBG ( "Could not set DHCPACK netdev settings: %s\n",
+		      strerror ( rc ) );
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * Create ProxyDHCPACK packet
+ *
+ * @v netdev		Network device
+ * @v data		Buffer for DHCP packet
+ * @v max_len		Size of DHCP packet buffer
+ * @ret rc		Return status code
+ *
+ * If no ProxyDHCP settings block exists, the buffer is simply zeroed
+ * and success is returned.  Used by external code.
+ */
+int create_fakeproxydhcpack ( struct net_device *netdev,
+			      void *data, size_t max_len ) {
+	struct settings *proxy_settings;
+	struct dhcp_packet dhcppkt;
+	int rc;
+
+	/* Without ProxyDHCP settings the result is an empty block */
+	proxy_settings = find_settings ( PROXYDHCP_SETTINGS_NAME );
+	if ( proxy_settings == NULL ) {
+		memset ( data, 0, max_len );
+		return 0;
+	}
+
+	/* Start from a bare DHCPACK */
+	rc = create_dhcp_packet ( &dhcppkt, netdev, DHCPACK, NULL,
+				  data, max_len );
+	if ( rc != 0 ) {
+		DBG ( "Could not create ProxyDHCPACK: %s\n",
+		      strerror ( rc ) );
+		return rc;
+	}
+
+	/* Add the ProxyDHCP options on top */
+	rc = copy_settings ( &dhcppkt, proxy_settings );
+	if ( rc != 0 ) {
+		DBG ( "Could not set ProxyDHCPACK settings: %s\n",
+		      strerror ( rc ) );
+		return rc;
+	}
+
+	return 0;
+}
diff --git a/gpxe/src/net/icmpv6.c b/gpxe/src/net/icmpv6.c
new file mode 100644
index 00000000..7b7146c2
--- /dev/null
+++ b/gpxe/src/net/icmpv6.c
@@ -0,0 +1,128 @@
+#include <stdint.h>
+#include <string.h>
+#include <byteswap.h>
+#include <errno.h>
+#include <gpxe/in.h>
+#include <gpxe/ip6.h>
+#include <gpxe/if_ether.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/ndp.h>
+#include <gpxe/icmp6.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/netdevice.h>
+
+struct tcpip_protocol icmp6_protocol;
+
+/**
+ * Send neighbour solicitation packet
+ *
+ * @v netdev		Network device
+ * @v src		Source address (unused)
+ * @v dest		Destination address
+ * @ret rc		Return status code; -ENOMEM if no I/O buffer
+ *			could be allocated
+ *
+ * This function prepares a neighbour solicitation packet and sends it to the
+ * network layer.  The packet is addressed to the solicited-node
+ * multicast address of @c dest, i.e. ff02::1:ffXX:XXXX where XX:XXXX
+ * are the low-order 24 bits of @c dest.
+ */
+int icmp6_send_solicit ( struct net_device *netdev, struct in6_addr *src __unused,
+			 struct in6_addr *dest ) {
+	union {
+		struct sockaddr_in6 sin6;
+		struct sockaddr_tcpip st;
+	} st_dest;
+	struct ll_protocol *ll_protocol = netdev->ll_protocol;
+	struct neighbour_solicit *nsolicit;
+	struct io_buffer *iobuf;
+
+	/* Allocate the packet buffer; allocation can fail */
+	iobuf = alloc_iob ( sizeof ( *nsolicit ) + MIN_IOB_LEN );
+	if ( ! iobuf )
+		return -ENOMEM;
+	iob_reserve ( iobuf, MAX_HDR_LEN );
+	nsolicit = iob_put ( iobuf, sizeof ( *nsolicit ) );
+
+	/* Fill up the headers */
+	memset ( nsolicit, 0, sizeof ( *nsolicit ) );
+	nsolicit->type = ICMP6_NSOLICIT;
+	nsolicit->code = 0;
+	nsolicit->target = *dest;
+	nsolicit->opt_type = 1;
+	nsolicit->opt_len = ( 2 + ll_protocol->ll_addr_len ) / 8;
+	/* NOTE(review): copies ll_addr_len bytes; confirm opt_ll_addr is
+	 * large enough for every link layer this can be used with.
+	 */
+	memcpy ( nsolicit->opt_ll_addr, netdev->ll_addr,
+		 ll_protocol->ll_addr_len );
+	/* Partial checksum */
+	nsolicit->csum = 0;
+	nsolicit->csum = tcpip_chksum ( nsolicit, sizeof ( *nsolicit ) );
+
+	/* Solicited-node multicast address.  Start from an all-zero
+	 * socket address so that no field is left uninitialised, then
+	 * set the individual address bytes explicitly (byte-wise, so
+	 * the result does not depend on host byte order).
+	 */
+	memset ( &st_dest, 0, sizeof ( st_dest ) );
+	st_dest.sin6.sin_family = AF_INET6;
+	st_dest.sin6.sin6_addr.in6_u.u6_addr8[0] = 0xff;
+	st_dest.sin6.sin6_addr.in6_u.u6_addr8[1] = 0x02;
+	st_dest.sin6.sin6_addr.in6_u.u6_addr8[11] = 0x01;
+	st_dest.sin6.sin6_addr.in6_u.u6_addr8[12] = 0xff;
+	st_dest.sin6.sin6_addr.in6_u.u6_addr8[13] = dest->in6_u.u6_addr8[13];
+	st_dest.sin6.sin6_addr.in6_u.u6_addr8[14] = dest->in6_u.u6_addr8[14];
+	st_dest.sin6.sin6_addr.in6_u.u6_addr8[15] = dest->in6_u.u6_addr8[15];
+
+	/* Send packet over IP6 */
+	return tcpip_tx ( iobuf, &icmp6_protocol, &st_dest.st,
+			  NULL, &nsolicit->csum );
+}
+
+/**
+ * Process ICMP6 headers
+ *
+ * @v iobuf		I/O buffer
+ * @v st_src		Source address
+ * @v st_dest		Destination address
+ * @v pshdr_csum	Pseudo-header checksum (unused)
+ * @ret rc		Return status code
+ *
+ * Neighbour advertisements are passed to ndp_process_advert().  A
+ * packet that is too short (-EINVAL) or of an unhandled type
+ * (-ENOSYS) is freed here.
+ */
+static int icmp6_rx ( struct io_buffer *iobuf, struct sockaddr_tcpip *st_src,
+		      struct sockaddr_tcpip *st_dest, __unused uint16_t pshdr_csum ) {
+	struct icmp6_header *icmp6hdr = iobuf->data;
+
+	/* Sanity check */
+	if ( iob_len ( iobuf ) < sizeof ( *icmp6hdr ) ) {
+		DBG ( "Packet too short (%zd bytes)\n", iob_len ( iobuf ) );
+		free_iob ( iobuf );
+		return -EINVAL;
+	}
+
+	/* TODO: Verify checksum */
+
+	/* Process the ICMP header */
+	switch ( icmp6hdr->type ) {
+	case ICMP6_NADVERT:
+		/* NOTE(review): the buffer is presumably owned by
+		 * ndp_process_advert() from here on - confirm.
+		 */
+		return ndp_process_advert ( iobuf, st_src, st_dest );
+	}
+
+	/* Nobody consumed the packet: release it, as is done for the
+	 * short-packet case above, rather than leaking the buffer.
+	 */
+	free_iob ( iobuf );
+	return -ENOSYS;
+}
+
+#if 0
+/* Disabled test helper (compiled out by the surrounding #if 0).
+ *
+ * Builds a neighbour advertisement for the given server address and
+ * link-layer address, wraps it in an IPv6 header, dumps it and feeds
+ * it to net_rx() as if it had been received on netdev.
+ */
+void icmp6_test_nadvert (struct net_device *netdev, struct sockaddr_in6 *server_p, char *ll_addr) {
+
+ struct sockaddr_in6 server;
+ memcpy ( &server, server_p, sizeof ( server ) );
+ struct io_buffer *rxiobuf = alloc_iob ( 500 );
+ iob_reserve ( rxiobuf, MAX_HDR_LEN );
+ struct neighbour_advert *nadvert = iob_put ( rxiobuf, sizeof ( *nadvert ) );
+ nadvert->type = 136;
+ nadvert->code = 0;
+ nadvert->flags = ICMP6_FLAGS_SOLICITED;
+ nadvert->csum = 0xffff;
+ nadvert->target = server.sin6_addr;
+ nadvert->opt_type = 2;
+ nadvert->opt_len = 1;
+ memcpy ( nadvert->opt_ll_addr, ll_addr, 6 );
+ /* Prepend the IPv6 header; source and destination are both the server */
+ struct ip6_header *ip6hdr = iob_push ( rxiobuf, sizeof ( *ip6hdr ) );
+ ip6hdr->ver_traffic_class_flow_label = htonl ( 0x60000000 );
+ ip6hdr->hop_limit = 255;
+ ip6hdr->nxt_hdr = 58;
+ ip6hdr->payload_len = htons ( sizeof ( *nadvert ) );
+ ip6hdr->src = server.sin6_addr;
+ ip6hdr->dest = server.sin6_addr;
+ hex_dump ( rxiobuf->data, iob_len ( rxiobuf ) );
+ net_rx ( rxiobuf, netdev, htons ( ETH_P_IPV6 ), ll_addr );
+}
+#endif
+
+/** ICMP6 protocol
+ *
+ * Names icmp6_rx() as the receive handler for IP protocol number
+ * IP_ICMP6.
+ */
+struct tcpip_protocol icmp6_protocol __tcpip_protocol = {
+ .name = "ICMP6",
+ .rx = icmp6_rx,
+ .tcpip_proto = IP_ICMP6, // 58
+};
diff --git a/gpxe/src/net/infiniband.c b/gpxe/src/net/infiniband.c
new file mode 100644
index 00000000..39d11285
--- /dev/null
+++ b/gpxe/src/net/infiniband.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <byteswap.h>
+#include <errno.h>
+#include <assert.h>
+#include <gpxe/list.h>
+#include <gpxe/if_arp.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/ipoib.h>
+#include <gpxe/infiniband.h>
+
+/** @file
+ *
+ * Infiniband protocol
+ *
+ */
+
+/**
+ * Create completion queue
+ *
+ * @v ibdev		Infiniband device
+ * @v num_cqes		Number of completion queue entries
+ * @ret cq		New completion queue, or NULL on failure
+ *
+ * Allocates the generic structure and then asks the device driver to
+ * perform its own initialisation.
+ */
+struct ib_completion_queue * ib_create_cq ( struct ib_device *ibdev,
+					    unsigned int num_cqes ) {
+	struct ib_completion_queue *new_cq;
+	int rc;
+
+	DBGC ( ibdev, "IBDEV %p creating completion queue\n", ibdev );
+
+	/* Generic part: zeroed structure with an empty work queue list */
+	new_cq = zalloc ( sizeof ( *new_cq ) );
+	if ( new_cq == NULL )
+		return NULL;
+	INIT_LIST_HEAD ( &new_cq->work_queues );
+	new_cq->num_cqes = num_cqes;
+
+	/* Device-specific part; undo the allocation if it fails */
+	rc = ibdev->op->create_cq ( ibdev, new_cq );
+	if ( rc != 0 ) {
+		DBGC ( ibdev, "IBDEV %p could not initialise completion "
+		       "queue: %s\n", ibdev, strerror ( rc ) );
+		free ( new_cq );
+		return NULL;
+	}
+
+	DBGC ( ibdev, "IBDEV %p created %d-entry completion queue %p (%p) "
+	       "with CQN %#lx\n", ibdev, num_cqes, new_cq,
+	       ib_cq_get_drvdata ( new_cq ), new_cq->cqn );
+	return new_cq;
+}
+
+/**
+ * Destroy completion queue
+ *
+ * @v ibdev Infiniband device
+ * @v cq Completion queue
+ *
+ * The completion queue must no longer have any work queues attached.
+ */
+void ib_destroy_cq ( struct ib_device *ibdev,
+ struct ib_completion_queue *cq ) {
+ DBGC ( ibdev, "IBDEV %p destroying completion queue %#lx\n",
+ ibdev, cq->cqn );
+ /* All queue pairs using this completion queue must be gone */
+ assert ( list_empty ( &cq->work_queues ) );
+ /* Let the driver tear down its state before the memory is freed */
+ ibdev->op->destroy_cq ( ibdev, cq );
+ free ( cq );
+}
+
+/**
+ * Create queue pair
+ *
+ * @v ibdev		Infiniband device
+ * @v num_send_wqes	Number of send work queue entries
+ * @v send_cq		Send completion queue
+ * @v num_recv_wqes	Number of receive work queue entries
+ * @v recv_cq		Receive completion queue
+ * @v qkey		Queue key
+ * @ret qp		Queue pair, or NULL on failure
+ *
+ * The queue pair structure and both I/O buffer pointer arrays come
+ * from a single allocation: the send array directly follows the
+ * structure and the receive array follows the send array.
+ */
+struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev,
+				      unsigned int num_send_wqes,
+				      struct ib_completion_queue *send_cq,
+				      unsigned int num_recv_wqes,
+				      struct ib_completion_queue *recv_cq,
+				      unsigned long qkey ) {
+	struct ib_queue_pair *qp;
+	size_t send_bytes;
+	size_t recv_bytes;
+	size_t total_size;
+	void *arrays;
+	int rc;
+
+	DBGC ( ibdev, "IBDEV %p creating queue pair\n", ibdev );
+
+	/* One zeroed allocation holds the structure and both arrays */
+	send_bytes = ( num_send_wqes * sizeof ( qp->send.iobufs[0] ) );
+	recv_bytes = ( num_recv_wqes * sizeof ( qp->recv.iobufs[0] ) );
+	total_size = ( sizeof ( *qp ) + send_bytes + recv_bytes );
+	qp = zalloc ( total_size );
+	if ( ! qp )
+		return NULL;
+	arrays = ( ( ( void * ) qp ) + sizeof ( *qp ) );
+	qp->qkey = qkey;
+
+	/* Send work queue */
+	qp->send.qp = qp;
+	qp->send.is_send = 1;
+	qp->send.cq = send_cq;
+	qp->send.num_wqes = num_send_wqes;
+	qp->send.iobufs = arrays;
+	list_add ( &qp->send.list, &send_cq->work_queues );
+
+	/* Receive work queue */
+	qp->recv.qp = qp;
+	qp->recv.cq = recv_cq;
+	qp->recv.num_wqes = num_recv_wqes;
+	qp->recv.iobufs = ( arrays + send_bytes );
+	list_add ( &qp->recv.list, &recv_cq->work_queues );
+
+	/* Device-specific part; detach and free everything on failure */
+	rc = ibdev->op->create_qp ( ibdev, qp );
+	if ( rc != 0 ) {
+		DBGC ( ibdev, "IBDEV %p could not initialise queue pair: "
+		       "%s\n", ibdev, strerror ( rc ) );
+		list_del ( &qp->send.list );
+		list_del ( &qp->recv.list );
+		free ( qp );
+		return NULL;
+	}
+
+	DBGC ( ibdev, "IBDEV %p created queue pair %p (%p) with QPN %#lx\n",
+	       ibdev, qp, ib_qp_get_drvdata ( qp ), qp->qpn );
+	DBGC ( ibdev, "IBDEV %p QPN %#lx has %d send entries at [%p,%p)\n",
+	       ibdev, qp->qpn, num_send_wqes, qp->send.iobufs,
+	       qp->recv.iobufs );
+	DBGC ( ibdev, "IBDEV %p QPN %#lx has %d receive entries at [%p,%p)\n",
+	       ibdev, qp->qpn, num_recv_wqes, qp->recv.iobufs,
+	       ( ( ( void * ) qp ) + total_size ) );
+	return qp;
+}
+
+/**
+ * Destroy queue pair
+ *
+ * @v ibdev Infiniband device
+ * @v qp Queue pair
+ *
+ * Undoes ib_create_qp(): the driver tears down its state, both work
+ * queues are unlinked from their completion queues, and the memory is
+ * freed.
+ */
+void ib_destroy_qp ( struct ib_device *ibdev,
+ struct ib_queue_pair *qp ) {
+ DBGC ( ibdev, "IBDEV %p destroying queue pair %#lx\n",
+ ibdev, qp->qpn );
+ /* Driver teardown happens while the queue pair is still linked */
+ ibdev->op->destroy_qp ( ibdev, qp );
+ list_del ( &qp->send.list );
+ list_del ( &qp->recv.list );
+ free ( qp );
+}
+
+/**
+ * Find work queue belonging to completion queue
+ *
+ * @v cq		Completion queue
+ * @v qpn		Queue pair number
+ * @v is_send		Find send work queue (rather than receive)
+ * @ret wq		Work queue, or NULL if not found
+ *
+ * Scans the work queues attached to @c cq for the one that belongs to
+ * queue pair @c qpn and has the requested direction.
+ */
+struct ib_work_queue * ib_find_wq ( struct ib_completion_queue *cq,
+				    unsigned long qpn, int is_send ) {
+	struct ib_work_queue *wq;
+
+	list_for_each_entry ( wq, &cq->work_queues, list ) {
+		/* Wrong direction */
+		if ( wq->is_send != is_send )
+			continue;
+		/* Wrong queue pair */
+		if ( wq->qp->qpn != qpn )
+			continue;
+		return wq;
+	}
+	return NULL;
+}
+
+/***************************************************************************
+ *
+ * Management datagram operations
+ *
+ ***************************************************************************
+ */
+
+/**
+ * Get port information
+ *
+ * @v ibdev		Infiniband device
+ * @v port_info		Port information datagram to fill in
+ * @ret rc		Return status code
+ *
+ * Builds a "get port info" management datagram for the device's port
+ * in the caller's buffer and issues it via ib_mad().
+ */
+static int ib_get_port_info ( struct ib_device *ibdev,
+			      struct ib_mad_port_info *port_info ) {
+	struct ib_mad_hdr *mad = &port_info->mad_hdr;
+	int rc;
+
+	/* Start from a zeroed datagram and fill in the request header */
+	memset ( port_info, 0, sizeof ( *port_info ) );
+	mad->base_version = IB_MGMT_BASE_VERSION;
+	mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->class_version = 1;
+	mad->method = IB_MGMT_METHOD_GET;
+	mad->attr_id = htons ( IB_SMP_ATTR_PORT_INFO );
+	mad->attr_mod = htonl ( ibdev->port );
+
+	/* Issue the request */
+	rc = ib_mad ( ibdev, mad, sizeof ( *port_info ) );
+	if ( rc == 0 )
+		return 0;
+
+	DBGC ( ibdev, "IBDEV %p could not get port info: %s\n",
+	       ibdev, strerror ( rc ) );
+	return rc;
+}
+
+/**
+ * Get GUID information
+ *
+ * @v ibdev		Infiniband device
+ * @v guid_info		GUID information datagram to fill in
+ * @ret rc		Return status code
+ *
+ * Builds a "get GUID info" management datagram in the caller's buffer
+ * and issues it via ib_mad().
+ */
+static int ib_get_guid_info ( struct ib_device *ibdev,
+			      struct ib_mad_guid_info *guid_info ) {
+	struct ib_mad_hdr *mad = &guid_info->mad_hdr;
+	int rc;
+
+	/* Start from a zeroed datagram and fill in the request header */
+	memset ( guid_info, 0, sizeof ( *guid_info ) );
+	mad->base_version = IB_MGMT_BASE_VERSION;
+	mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->class_version = 1;
+	mad->method = IB_MGMT_METHOD_GET;
+	mad->attr_id = htons ( IB_SMP_ATTR_GUID_INFO );
+
+	/* Issue the request */
+	rc = ib_mad ( ibdev, mad, sizeof ( *guid_info ) );
+	if ( rc == 0 )
+		return 0;
+
+	DBGC ( ibdev, "IBDEV %p could not get GUID info: %s\n",
+	       ibdev, strerror ( rc ) );
+	return rc;
+}
+
+/**
+ * Get partition key table
+ *
+ * @v ibdev		Infiniband device
+ * @v pkey_table	Partition key table datagram to fill in
+ * @ret rc		Return status code
+ *
+ * Builds a "get partition key table" management datagram in the
+ * caller's buffer and issues it via ib_mad().
+ */
+static int ib_get_pkey_table ( struct ib_device *ibdev,
+			       struct ib_mad_pkey_table *pkey_table ) {
+	struct ib_mad_hdr *mad = &pkey_table->mad_hdr;
+	int rc;
+
+	/* Start from a zeroed datagram and fill in the request header */
+	memset ( pkey_table, 0, sizeof ( *pkey_table ) );
+	mad->base_version = IB_MGMT_BASE_VERSION;
+	mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->class_version = 1;
+	mad->method = IB_MGMT_METHOD_GET;
+	mad->attr_id = htons ( IB_SMP_ATTR_PKEY_TABLE );
+
+	/* Issue the request */
+	rc = ib_mad ( ibdev, mad, sizeof ( *pkey_table ) );
+	if ( rc == 0 )
+		return 0;
+
+	DBGC ( ibdev, "IBDEV %p could not get pkey table: %s\n",
+	       ibdev, strerror ( rc ) );
+	return rc;
+}
+
+/**
+ * Wait for link up
+ *
+ * @v ibdev		Infiniband device
+ * @ret rc		Return status code; -ENODEV if the link did not
+ *			come up within the retry limit
+ *
+ * This function shouldn't really exist.  Unfortunately, IB links take
+ * a long time to come up, and we can't get various key parameters
+ * (e.g. our own IPoIB MAC address) without information from the subnet
+ * manager.  We should eventually make link-up an asynchronous event.
+ *
+ * The port is queried up to 20 times, with a one-second pause after
+ * each query that succeeds but does not yet report the link as up.
+ */
+static int ib_wait_for_link ( struct ib_device *ibdev ) {
+	struct ib_mad_port_info port_info;
+	unsigned int retries;
+	int rc;
+
+	printf ( "Waiting for Infiniband link-up..." );
+	for ( retries = 20 ; retries ; retries-- ) {
+		/* NOTE(review): a failed query is retried immediately,
+		 * without the one-second pause below - confirm that this
+		 * is intended.
+		 */
+		if ( ( rc = ib_get_port_info ( ibdev, &port_info ) ) != 0 )
+			continue;
+		/* The port state is the low nibble of this field; 4 is
+		 * presumably the "active" state - confirm against the
+		 * Infiniband specification.
+		 */
+		if ( ( ( port_info.port_state__link_speed_supported ) & 0xf )
+		     == 4 ) {
+			printf ( "ok\n" );
+			return 0;
+		}
+		printf ( "." );
+		sleep ( 1 );
+	}
+	printf ( "failed\n" );
+	return -ENODEV;
+}
+
+/**
+ * Get MAD parameters
+ *
+ * @v ibdev		Infiniband device
+ * @ret rc		Return status code
+ *
+ * Issues three management queries in turn and records the results in
+ * @c ibdev: the port GID (assembled from two halves), the subnet
+ * manager LID and the partition key.  The first failing query aborts
+ * the sequence.
+ */
+static int ib_get_mad_params ( struct ib_device *ibdev ) {
+	union {
+		/* The three datagrams are never needed at the same
+		 * time, so they share storage to save stack space.
+		 */
+		struct ib_mad_port_info port_info;
+		struct ib_mad_guid_info guid_info;
+		struct ib_mad_pkey_table pkey_table;
+	} u;
+	int rc;
+
+	/* Port info: first half of the port GID, plus the SM LID */
+	rc = ib_get_port_info ( ibdev, &u.port_info );
+	if ( rc != 0 )
+		return rc;
+	memcpy ( &ibdev->port_gid.u.bytes[0], u.port_info.gid_prefix, 8 );
+	ibdev->sm_lid = ntohs ( u.port_info.mastersm_lid );
+
+	/* GUID info: second half of the port GID */
+	rc = ib_get_guid_info ( ibdev, &u.guid_info );
+	if ( rc != 0 )
+		return rc;
+	memcpy ( &ibdev->port_gid.u.bytes[8], u.guid_info.gid_local, 8 );
+
+	/* Partition key table: take the first entry */
+	rc = ib_get_pkey_table ( ibdev, &u.pkey_table );
+	if ( rc != 0 )
+		return rc;
+	ibdev->pkey = ntohs ( u.pkey_table.pkey[0][0] );
+
+	DBGC ( ibdev, "IBDEV %p port GID is %08lx:%08lx:%08lx:%08lx\n",
+	       ibdev, htonl ( ibdev->port_gid.u.dwords[0] ),
+	       htonl ( ibdev->port_gid.u.dwords[1] ),
+	       htonl ( ibdev->port_gid.u.dwords[2] ),
+	       htonl ( ibdev->port_gid.u.dwords[3] ) );
+
+	return 0;
+}
+
+/***************************************************************************
+ *
+ * Infiniband device creation/destruction
+ *
+ ***************************************************************************
+ */
+
+/**
+ * Allocate Infiniband device
+ *
+ * @v priv_size		Size of driver private data area
+ * @ret ibdev		Infiniband device, or NULL
+ *
+ * The device structure and the driver's private area come from one
+ * zeroed allocation; the private area directly follows the structure
+ * and is recorded as the device's driver data.
+ */
+struct ib_device * alloc_ibdev ( size_t priv_size ) {
+	struct ib_device *ibdev;
+
+	ibdev = zalloc ( sizeof ( *ibdev ) + priv_size );
+	if ( ! ibdev )
+		return NULL;
+
+	/* Driver private data starts right after the structure */
+	ib_set_drvdata ( ibdev, ( ( ( void * ) ibdev ) + sizeof ( *ibdev ) ) );
+	return ibdev;
+}
+
+/**
+ * Register Infiniband device
+ *
+ * @v ibdev		Infiniband device
+ * @ret rc		Return status code
+ *
+ * Opens the device, waits for the link, reads the management
+ * parameters and adds the IPoIB device.  If any step after the open
+ * fails, the device is closed again before the error is returned.
+ */
+int register_ibdev ( struct ib_device *ibdev ) {
+	int rc;
+
+	/* Open link; nothing to undo if this fails */
+	rc = ib_open ( ibdev );
+	if ( rc != 0 )
+		return rc;
+
+	/* Wait for link */
+	rc = ib_wait_for_link ( ibdev );
+	if ( rc != 0 )
+		goto fail_close;
+
+	/* Get MAD parameters */
+	rc = ib_get_mad_params ( ibdev );
+	if ( rc != 0 )
+		goto fail_close;
+
+	/* Add IPoIB device */
+	rc = ipoib_probe ( ibdev );
+	if ( rc != 0 ) {
+		DBGC ( ibdev, "IBDEV %p could not add IPoIB device: %s\n",
+		       ibdev, strerror ( rc ) );
+		goto fail_close;
+	}
+
+	return 0;
+
+ fail_close:
+	ib_close ( ibdev );
+	return rc;
+}
+
+/**
+ * Unregister Infiniband device
+ *
+ * @v ibdev Infiniband device
+ *
+ * Reverses register_ibdev(): removes the IPoIB device and then closes
+ * the Infiniband device.
+ */
+void unregister_ibdev ( struct ib_device *ibdev ) {
+ ipoib_remove ( ibdev );
+ ib_close ( ibdev );
+}
+
+/**
+ * Free Infiniband device
+ *
+ * @v ibdev Infiniband device
+ *
+ * Releases the memory obtained by alloc_ibdev(), which holds both the
+ * device structure and the driver private data area.
+ */
+void free_ibdev ( struct ib_device *ibdev ) {
+ free ( ibdev );
+}
+
diff --git a/gpxe/src/net/iobpad.c b/gpxe/src/net/iobpad.c
new file mode 100644
index 00000000..9961edca
--- /dev/null
+++ b/gpxe/src/net/iobpad.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/**
+ * @file
+ *
+ * I/O buffer padding
+ *
+ */
+
+#include <string.h>
+#include <gpxe/iobuf.h>
+
+/**
+ * Pad I/O buffer
+ *
+ * @v iobuf		I/O buffer
+ * @v min_len		Minimum length
+ *
+ * Pads and aligns an I/O buffer, for devices that cannot pad in
+ * hardware or that need specific alignment of TX buffers.  The packet
+ * data is moved to the very start of the buffer, so it ends up
+ * aligned to a multiple of @c IOB_ALIGN, and is then zero-padded up
+ * to @c min_len if it is shorter.
+ *
+ * @c min_len must not exceed @c IOB_ZLEN.
+ */
+void iob_pad ( struct io_buffer *iobuf, size_t min_len ) {
+	void *old_data;
+	size_t used;
+	size_t front_gap;
+	signed int shortfall;
+
+	assert ( min_len <= IOB_ZLEN );
+
+	/* Slide the packet down to the start of the buffer: extend the
+	 * data area over the whole headroom, move the payload to the
+	 * new start, then trim the same amount off the end.  This both
+	 * aligns the data (I/O buffers are aligned to IOB_ALIGN) and
+	 * leaves room at the end for the padding.
+	 */
+	old_data = iobuf->data;
+	used = iob_len ( iobuf );
+	front_gap = iob_headroom ( iobuf );
+	iob_push ( iobuf, front_gap );
+	memmove ( iobuf->data, old_data, used );
+	iob_unput ( iobuf, front_gap );
+
+	/* Zero-fill up to the minimum length, if the packet is short */
+	shortfall = ( min_len - iob_len ( iobuf ) );
+	if ( shortfall <= 0 )
+		return;
+	memset ( iob_put ( iobuf, shortfall ), 0, shortfall );
+}
diff --git a/gpxe/src/net/ipv4.c b/gpxe/src/net/ipv4.c
new file mode 100644
index 00000000..591293b7
--- /dev/null
+++ b/gpxe/src/net/ipv4.c
@@ -0,0 +1,633 @@
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <byteswap.h>
+#include <gpxe/list.h>
+#include <gpxe/in.h>
+#include <gpxe/arp.h>
+#include <gpxe/if_ether.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/ip.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/dhcp.h>
+#include <gpxe/settings.h>
+
+/** @file
+ *
+ * IPv4 protocol
+ *
+ */
+
+/* Unique IP datagram identification number
+ *
+ * Pre-incremented by ipv4_tx() for every datagram it builds.
+ */
+static uint16_t next_ident = 0;
+
+/* Tentative definition; the initialised IPv4 protocol descriptor
+ * appears further down in this file.
+ */
+struct net_protocol ipv4_protocol;
+
+/** List of IPv4 miniroutes
+ *
+ * add_ipv4_miniroute() places routes without a gateway at the head
+ * of the list and routes with a gateway at the tail.
+ */
+struct list_head ipv4_miniroutes = LIST_HEAD_INIT ( ipv4_miniroutes );
+
+/** List of fragment reassembly buffers */
+static LIST_HEAD ( frag_buffers );
+
+/**
+ * Add IPv4 minirouting table entry
+ *
+ * @v netdev		Network device
+ * @v address		IPv4 address
+ * @v netmask		Subnet mask
+ * @v gateway		Gateway address (or @c INADDR_NONE for no gateway)
+ * @ret miniroute	Routing table entry, or NULL
+ *
+ * The new entry holds a reference to @c netdev.  Entries with a
+ * gateway are appended to the routing table; entries without one are
+ * inserted at the front.
+ */
+static struct ipv4_miniroute * __malloc
+add_ipv4_miniroute ( struct net_device *netdev, struct in_addr address,
+		     struct in_addr netmask, struct in_addr gateway ) {
+	const int routed = ( gateway.s_addr != INADDR_NONE );
+	struct ipv4_miniroute *entry;
+
+	DBG ( "IPv4 add %s", inet_ntoa ( address ) );
+	DBG ( "/%s ", inet_ntoa ( netmask ) );
+	if ( routed )
+		DBG ( "gw %s ", inet_ntoa ( gateway ) );
+	DBG ( "via %s\n", netdev->name );
+
+	/* Allocate the entry */
+	entry = malloc ( sizeof ( *entry ) );
+	if ( entry == NULL ) {
+		DBG ( "IPv4 could not add miniroute\n" );
+		return NULL;
+	}
+
+	/* Fill in the routing information */
+	entry->netdev = netdev_get ( netdev );
+	entry->address = address;
+	entry->netmask = netmask;
+	entry->gateway = gateway;
+
+	/* Gatewayed routes go last, on-link routes go first */
+	if ( routed ) {
+		list_add_tail ( &entry->list, &ipv4_miniroutes );
+	} else {
+		list_add ( &entry->list, &ipv4_miniroutes );
+	}
+
+	return entry;
+}
+
+/**
+ * Delete IPv4 minirouting table entry
+ *
+ * @v miniroute		Routing table entry
+ *
+ * Unlinks the entry from the routing table, drops its reference to
+ * the network device and frees it.
+ */
+static void del_ipv4_miniroute ( struct ipv4_miniroute *miniroute ) {
+	struct net_device *netdev = miniroute->netdev;
+	const int routed = ( miniroute->gateway.s_addr != INADDR_NONE );
+
+	DBG ( "IPv4 del %s", inet_ntoa ( miniroute->address ) );
+	DBG ( "/%s ", inet_ntoa ( miniroute->netmask ) );
+	if ( routed )
+		DBG ( "gw %s ", inet_ntoa ( miniroute->gateway ) );
+	DBG ( "via %s\n", netdev->name );
+
+	list_del ( &miniroute->list );
+	netdev_put ( netdev );
+	free ( miniroute );
+}
+
+/**
+ * Perform IPv4 routing
+ *
+ * @v dest		Final destination address
+ * @ret dest		Next hop destination address
+ * @ret miniroute	Routing table entry to use, or NULL if no route
+ *
+ * The first table entry that either covers the destination directly
+ * or has a gateway is selected.  When the selected entry does not
+ * cover the destination, @c dest is overwritten with the gateway
+ * address.
+ */
+static struct ipv4_miniroute * ipv4_route ( struct in_addr *dest ) {
+	struct ipv4_miniroute *route;
+	uint32_t off_subnet_bits;
+
+	/* The broadcast address is never routed */
+	if ( dest->s_addr == INADDR_BROADCAST )
+		return NULL;
+
+	list_for_each_entry ( route, &ipv4_miniroutes, list ) {
+		/* Destination on this route's own subnet: use as-is */
+		off_subnet_bits = ( ( dest->s_addr ^ route->address.s_addr )
+				    & route->netmask.s_addr );
+		if ( off_subnet_bits == 0 )
+			return route;
+
+		/* Otherwise the route is usable only via its gateway */
+		if ( route->gateway.s_addr != INADDR_NONE ) {
+			*dest = route->gateway;
+			return route;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * Fragment reassembly counter timeout
+ *
+ * @v timer		Retry timer
+ * @v over		If asserted, the timer is greater than @c MAX_TIMEOUT
+ *
+ * Currently only logs the timeout.
+ *
+ * NOTE(review): the fragment buffer owning this timer is not released
+ * here, so an incomplete fragment series is never reclaimed on
+ * timeout.
+ */
+static void ipv4_frag_expired ( struct retry_timer *timer __unused,
+				int over ) {
+	if ( ! over )
+		return;
+
+	DBG ( "Fragment reassembly timeout" );
+	/* Free the fragment buffer */
+}
+
+/**
+ * Free fragment buffer
+ *
+ * @v fragbuf		Fragment buffer
+ *
+ * Stops the reassembly timer, unlinks the buffer from the list of
+ * fragment buffers and frees the descriptor, so that neither the
+ * list nor the timer is left pointing at freed memory.  The
+ * reassembly I/O buffer is not freed here: the caller either hands
+ * it on as the completed datagram or frees it explicitly.
+ *
+ * Must only be used on a buffer that is already on the list with its
+ * timer initialised.
+ */
+static void free_fragbuf ( struct frag_buffer *fragbuf ) {
+	stop_timer ( &fragbuf->frag_timer );
+	list_del ( &fragbuf->list );
+	free ( fragbuf );
+}
+
+/**
+ * Fragment reassembler
+ *
+ * @v iobuf		I/O buffer, fragment of the datagram
+ * @ret frag_iob	Reassembled packet, or NULL
+ *
+ * Takes ownership of @c iobuf in all cases.  Fragments must arrive in
+ * order: a fragment whose offset does not equal the amount of data
+ * accumulated so far, or which would overflow the reassembly buffer,
+ * causes the whole series to be discarded.
+ *
+ * NOTE(review): this function reads the IPv4 header at the start of
+ * @c iobuf and strips it itself, whereas ipv4_rx() strips the header
+ * before calling.  Confirm which side is meant to own the header.
+ */
+static struct io_buffer * ipv4_reassemble ( struct io_buffer * iobuf ) {
+	struct iphdr *iphdr = iobuf->data;
+	struct frag_buffer *fragbuf;
+	struct io_buffer *frag_iob;
+	/* Decode the flags/offset word once, up front.  The field is
+	 * in network byte order and the offset is expressed in units
+	 * of 8 bytes; iphdr also becomes invalid as soon as iobuf is
+	 * freed below.
+	 */
+	uint16_t frags = ntohs ( iphdr->frags );
+	size_t offset = ( ( frags & IP_MASK_OFFSET ) * 8 );
+	int more_frags = ( ( frags & IP_MASK_MOREFRAGS ) != 0 );
+	size_t len;
+
+	/* Check if the fragment belongs to any fragment series */
+	list_for_each_entry ( fragbuf, &frag_buffers, list ) {
+		if ( ( fragbuf->ident != iphdr->ident ) ||
+		     ( fragbuf->src.s_addr != iphdr->src.s_addr ) )
+			continue;
+
+		frag_iob = fragbuf->frag_iob;
+		iob_pull ( iobuf, sizeof ( *iphdr ) );
+		len = iob_len ( iobuf );
+
+		/* The fragment must continue exactly where the
+		 * accumulated data ends and must fit in the
+		 * reassembly buffer; otherwise discard the series.
+		 */
+		if ( ( iob_len ( frag_iob ) != offset ) ||
+		     ( len > iob_tailroom ( frag_iob ) ) ) {
+			free_fragbuf ( fragbuf );
+			free_iob ( frag_iob );
+			free_iob ( iobuf );
+			return NULL;
+		}
+
+		/* Append the fragment to the reassembled data */
+		memcpy ( iob_put ( frag_iob, len ), iobuf->data, len );
+		free_iob ( iobuf );
+
+		/* Series not finished yet */
+		if ( more_frags )
+			return NULL;
+
+		/* Last fragment: hand over the reassembled datagram */
+		free_fragbuf ( fragbuf );
+		return frag_iob;
+	}
+
+	/* Only the first fragment of a series may open a new buffer */
+	if ( ( ! more_frags ) || ( offset != 0 ) )
+		goto drop;
+
+	/* Create a new, zeroed fragment buffer */
+	fragbuf = malloc ( sizeof ( *fragbuf ) );
+	if ( ! fragbuf )
+		goto drop;
+	memset ( fragbuf, 0, sizeof ( *fragbuf ) );
+	fragbuf->ident = iphdr->ident;
+	fragbuf->src = iphdr->src;
+
+	/* Set up the reassembly I/O buffer */
+	frag_iob = alloc_iob ( IP_FRAG_IOB_SIZE );
+	if ( ! frag_iob )
+		goto drop_fragbuf;
+	iob_pull ( iobuf, sizeof ( *iphdr ) );
+	len = iob_len ( iobuf );
+	if ( len > iob_tailroom ( frag_iob ) )
+		goto drop_frag_iob;
+	memcpy ( iob_put ( frag_iob, len ), iobuf->data, len );
+	fragbuf->frag_iob = frag_iob;
+	free_iob ( iobuf );
+
+	/* Set the reassembly timer */
+	fragbuf->frag_timer.timeout = IP_FRAG_TIMEOUT;
+	fragbuf->frag_timer.expired = ipv4_frag_expired;
+	start_timer ( &fragbuf->frag_timer );
+
+	/* Add the fragment buffer to the list of fragment buffers */
+	list_add ( &fragbuf->list, &frag_buffers );
+
+	return NULL;
+
+ drop_frag_iob:
+	free_iob ( frag_iob );
+ drop_fragbuf:
+	/* Not yet listed and timer not started: plain free() suffices */
+	free ( fragbuf );
+ drop:
+	free_iob ( iobuf );
+	return NULL;
+}
+
+/**
+ * Add IPv4 pseudo-header checksum to existing checksum
+ *
+ * @v iobuf		I/O buffer
+ * @v csum		Existing checksum
+ * @ret csum		Updated checksum
+ *
+ * @c iobuf must start with the IPv4 header.  The pseudo-header length
+ * field is the buffer length minus the header length declared in that
+ * header.
+ */
+static uint16_t ipv4_pshdr_chksum ( struct io_buffer *iobuf, uint16_t csum ) {
+	struct iphdr *iphdr = iobuf->data;
+	size_t payload_len;
+	struct ipv4_pseudo_header pshdr;
+
+	/* Everything after the (variable-length) IPv4 header */
+	payload_len = ( iob_len ( iobuf ) -
+			( ( iphdr->verhdrlen & IP_MASK_HLEN ) * 4 ) );
+
+	/* Assemble the pseudo-header */
+	pshdr.src = iphdr->src;
+	pshdr.dest = iphdr->dest;
+	pshdr.zero_padding = 0x00;
+	pshdr.protocol = iphdr->protocol;
+	pshdr.len = htons ( payload_len );
+
+	/* Fold it into the running checksum */
+	return tcpip_continue_chksum ( csum, &pshdr, sizeof ( pshdr ) );
+}
+
+/**
+ * Determine link-layer address
+ *
+ * @v dest		IPv4 destination address
+ * @v src		IPv4 source address
+ * @v netdev		Network device
+ * @v ll_dest		Link-layer destination address buffer
+ * @ret rc		Return status code
+ *
+ * Broadcast destinations map to the link-layer broadcast address,
+ * multicast destinations are mapped directly to an Ethernet
+ * multicast address, and everything else is resolved via ARP.
+ */
+static int ipv4_ll_addr ( struct in_addr dest, struct in_addr src,
+			  struct net_device *netdev, uint8_t *ll_dest ) {
+	struct ll_protocol *ll_protocol = netdev->ll_protocol;
+	uint8_t *dest_bytes = ( ( uint8_t * ) &dest );
+
+	if ( dest.s_addr == INADDR_BROADCAST ) {
+		/* Broadcast address */
+		memcpy ( ll_dest, ll_protocol->ll_broadcast,
+			 ll_protocol->ll_addr_len );
+		return 0;
+	} else if ( IN_MULTICAST ( ntohl ( dest.s_addr ) ) ) {
+		/* The IN_xxx() class tests operate on host-order
+		 * values (as used for IN_CLASSA() etc. elsewhere in
+		 * this file), so convert before testing.
+		 *
+		 * Special case: IPv4 multicast over Ethernet.  This
+		 * code may need to be generalised once we find out
+		 * what happens for other link layers.
+		 */
+		ll_dest[0] = 0x01;
+		ll_dest[1] = 0x00;
+		ll_dest[2] = 0x5e;
+		ll_dest[3] = dest_bytes[1] & 0x7f;
+		ll_dest[4] = dest_bytes[2];
+		ll_dest[5] = dest_bytes[3];
+		return 0;
+	} else {
+		/* Unicast address: resolve via ARP */
+		return arp_resolve ( netdev, &ipv4_protocol, &dest,
+				     &src, ll_dest );
+	}
+}
+
+/**
+ * Transmit IP packet
+ *
+ * @v iobuf		I/O buffer
+ * @v tcpip_protocol	Transport-layer protocol
+ * @v st_dest		Destination network-layer address
+ * @v netdev		Network device to use if no route found, or NULL
+ * @v trans_csum	Transport-layer checksum to complete, or NULL
+ * @ret rc		Status
+ *
+ * Prepends an IPv4 header to the transport-layer segment in @c iobuf,
+ * picks the outgoing device and next hop from the routing table,
+ * resolves the link-layer destination and passes the packet to the
+ * link layer.  The buffer is freed here if no route or no link-layer
+ * address is available.
+ */
+static int ipv4_tx ( struct io_buffer *iobuf,
+		     struct tcpip_protocol *tcpip_protocol,
+		     struct sockaddr_tcpip *st_dest,
+		     struct net_device *netdev,
+		     uint16_t *trans_csum ) {
+	struct iphdr *iphdr = iob_push ( iobuf, sizeof ( *iphdr ) );
+	struct sockaddr_in *sin_dest = ( ( struct sockaddr_in * ) st_dest );
+	struct ipv4_miniroute *route;
+	struct in_addr next_hop;
+	uint8_t ll_dest[MAX_LL_ADDR_LEN];
+	int rc;
+
+	/* Build the header; the source address is filled in once a
+	 * route has been chosen.
+	 */
+	memset ( iphdr, 0, sizeof ( *iphdr ) );
+	iphdr->verhdrlen = ( IP_VER | ( sizeof ( *iphdr ) / 4 ) );
+	iphdr->service = IP_TOS;
+	iphdr->len = htons ( iob_len ( iobuf ) );
+	iphdr->ident = htons ( ++next_ident );
+	iphdr->ttl = IP_TTL;
+	iphdr->protocol = tcpip_protocol->tcpip_proto;
+	iphdr->dest = sin_dest->sin_addr;
+
+	/* Consult the routing table for next hop and device */
+	next_hop = iphdr->dest;
+	route = ipv4_route ( &next_hop );
+	if ( route != NULL ) {
+		iphdr->src = route->address;
+		netdev = route->netdev;
+	}
+	if ( netdev == NULL ) {
+		DBG ( "IPv4 has no route to %s\n", inet_ntoa ( iphdr->dest ) );
+		free_iob ( iobuf );
+		return -ENETUNREACH;
+	}
+
+	/* Work out where to send it at the link layer */
+	rc = ipv4_ll_addr ( next_hop, iphdr->src, netdev, ll_dest );
+	if ( rc != 0 ) {
+		DBG ( "IPv4 has no link-layer address for %s: %s\n",
+		      inet_ntoa ( next_hop ), strerror ( rc ) );
+		free_iob ( iobuf );
+		return rc;
+	}
+
+	/* Complete the transport checksum (which depends on the
+	 * source address chosen above), then the header checksum.
+	 */
+	if ( trans_csum != NULL )
+		*trans_csum = ipv4_pshdr_chksum ( iobuf, *trans_csum );
+	iphdr->chksum = tcpip_chksum ( iphdr, sizeof ( *iphdr ) );
+
+	/* Print IP4 header for debugging */
+	DBG ( "IPv4 TX %s->", inet_ntoa ( iphdr->src ) );
+	DBG ( "%s len %d proto %d id %04x csum %04x\n",
+	      inet_ntoa ( iphdr->dest ), ntohs ( iphdr->len ), iphdr->protocol,
+	      ntohs ( iphdr->ident ), ntohs ( iphdr->chksum ) );
+
+	/* Hand off to link layer */
+	rc = net_tx ( iobuf, netdev, &ipv4_protocol, ll_dest );
+	if ( rc != 0 ) {
+		DBG ( "IPv4 could not transmit packet via %s: %s\n",
+		      netdev->name, strerror ( rc ) );
+	}
+	return rc;
+}
+
+/**
+ * Process incoming packets
+ *
+ * @v iobuf		I/O buffer
+ * @v netdev		Network device
+ * @v ll_source		Link-layer source address
+ * @ret rc		Return status code
+ *
+ * This function expects an IP4 network datagram.  It validates the
+ * header, trims the buffer to the length declared in the header,
+ * strips the header and sends the payload to the transport layer.
+ * Fragments are diverted to ipv4_reassemble().
+ *
+ * NOTE(review): the pseudo-header checksum is computed from the
+ * buffer received here, before reassembly, so for a reassembled
+ * datagram it reflects only the final fragment.  The header is also
+ * stripped before ipv4_reassemble() is called, although that function
+ * reads the header from the start of the buffer.
+ */
+static int ipv4_rx ( struct io_buffer *iobuf, struct net_device *netdev __unused,
+		     const void *ll_source __unused ) {
+	struct iphdr *iphdr = iobuf->data;
+	size_t hdrlen;
+	size_t len;
+	union {
+		struct sockaddr_in sin;
+		struct sockaddr_tcpip st;
+	} src, dest;
+	uint16_t csum;
+	uint16_t pshdr_csum;
+	uint8_t protocol;
+	int rc;
+
+	/* Sanity check the IPv4 header */
+	if ( iob_len ( iobuf ) < sizeof ( *iphdr ) ) {
+		DBG ( "IPv4 packet too short at %zd bytes (min %zd bytes)\n",
+		      iob_len ( iobuf ), sizeof ( *iphdr ) );
+		goto err;
+	}
+	if ( ( iphdr->verhdrlen & IP_MASK_VER ) != IP_VER ) {
+		DBG ( "IPv4 version %#02x not supported\n", iphdr->verhdrlen );
+		goto err;
+	}
+	hdrlen = ( ( iphdr->verhdrlen & IP_MASK_HLEN ) * 4 );
+	if ( hdrlen < sizeof ( *iphdr ) ) {
+		DBG ( "IPv4 header too short at %zd bytes (min %zd bytes)\n",
+		      hdrlen, sizeof ( *iphdr ) );
+		goto err;
+	}
+	if ( hdrlen > iob_len ( iobuf ) ) {
+		DBG ( "IPv4 header too long at %zd bytes "
+		      "(packet is %zd bytes)\n", hdrlen, iob_len ( iobuf ) );
+		goto err;
+	}
+	if ( ( csum = tcpip_chksum ( iphdr, hdrlen ) ) != 0 ) {
+		DBG ( "IPv4 checksum incorrect (is %04x including checksum "
+		      "field, should be 0000)\n", csum );
+		goto err;
+	}
+	len = ntohs ( iphdr->len );
+	if ( len < hdrlen ) {
+		DBG ( "IPv4 length too short at %zd bytes "
+		      "(header is %zd bytes)\n", len, hdrlen );
+		goto err;
+	}
+	if ( len > iob_len ( iobuf ) ) {
+		DBG ( "IPv4 length too long at %zd bytes "
+		      "(packet is %zd bytes)\n", len, iob_len ( iobuf ) );
+		goto err;
+	}
+
+	/* Print IPv4 header for debugging */
+	DBG ( "IPv4 RX %s<-", inet_ntoa ( iphdr->dest ) );
+	DBG ( "%s len %d proto %d id %04x csum %04x\n",
+	      inet_ntoa ( iphdr->src ), ntohs ( iphdr->len ), iphdr->protocol,
+	      ntohs ( iphdr->ident ), ntohs ( iphdr->chksum ) );
+
+	/* Capture everything that is needed from the header now.
+	 * iphdr points into iobuf, and ipv4_reassemble() below may
+	 * free that buffer and return a different one.
+	 */
+	memset ( &src, 0, sizeof ( src ) );
+	src.sin.sin_family = AF_INET;
+	src.sin.sin_addr = iphdr->src;
+	memset ( &dest, 0, sizeof ( dest ) );
+	dest.sin.sin_family = AF_INET;
+	dest.sin.sin_addr = iphdr->dest;
+	protocol = iphdr->protocol;
+
+	/* Truncate packet to correct length, calculate pseudo-header
+	 * checksum and then strip off the IPv4 header.
+	 */
+	iob_unput ( iobuf, ( iob_len ( iobuf ) - len ) );
+	pshdr_csum = ipv4_pshdr_chksum ( iobuf, TCPIP_EMPTY_CSUM );
+	iob_pull ( iobuf, hdrlen );
+
+	/* Fragment reassembly */
+	if ( ( iphdr->frags & htons ( IP_MASK_MOREFRAGS ) ) ||
+	     ( ( iphdr->frags & htons ( IP_MASK_OFFSET ) ) != 0 ) ) {
+		/* Pass the fragment to ipv4_reassemble() which either
+		 * returns a fully reassembled I/O buffer or NULL.
+		 * iphdr must not be dereferenced after this call.
+		 */
+		iobuf = ipv4_reassemble ( iobuf );
+		if ( ! iobuf )
+			return 0;
+	}
+
+	/* Hand off to transport layer */
+	if ( ( rc = tcpip_rx ( iobuf, protocol, &src.st,
+			       &dest.st, pshdr_csum ) ) != 0 ) {
+		DBG ( "IPv4 received packet rejected by stack: %s\n",
+		      strerror ( rc ) );
+		return rc;
+	}
+
+	return 0;
+
+ err:
+	free_iob ( iobuf );
+	return -EINVAL;
+}
+
+/**
+ * Check existence of IPv4 address for ARP
+ *
+ * @v netdev		Network device
+ * @v net_addr		Network-layer address
+ * @ret rc		Return status code
+ *
+ * Succeeds if some routing table entry assigns @c net_addr to
+ * @c netdev; otherwise returns -ENOENT.
+ */
+static int ipv4_arp_check ( struct net_device *netdev, const void *net_addr ) {
+	const struct in_addr *wanted = net_addr;
+	struct ipv4_miniroute *route;
+
+	list_for_each_entry ( route, &ipv4_miniroutes, list ) {
+		if ( route->netdev != netdev )
+			continue;
+		if ( route->address.s_addr != wanted->s_addr )
+			continue;
+		/* This device owns the address */
+		return 0;
+	}
+	return -ENOENT;
+}
+
+/**
+ * Convert IPv4 address to dotted-quad notation
+ *
+ * @v in	IP address
+ * @ret string	IP address in dotted-quad notation
+ *
+ * The result lives in a single static buffer, so each call overwrites
+ * the string returned by the previous one.
+ */
+char * inet_ntoa ( struct in_addr in ) {
+	static char buf[16]; /* "xxx.xxx.xxx.xxx" */
+	const uint8_t *octet = ( uint8_t * ) &in;
+
+	sprintf ( buf, "%d.%d.%d.%d",
+		  octet[0], octet[1], octet[2], octet[3] );
+	return buf;
+}
+
+/**
+ * Transcribe IP address
+ *
+ * @v net_addr	IP address
+ * @ret string	IP address in dotted-quad notation
+ *
+ * Thin adapter around inet_ntoa() for use as the protocol's address
+ * transcription method.
+ */
+static const char * ipv4_ntoa ( const void *net_addr ) {
+	const struct in_addr *in = net_addr;
+
+	return inet_ntoa ( *in );
+}
+
+/** IPv4 protocol
+ *
+ * Network-layer descriptor: packets of type ETH_P_IP are delivered to
+ * ipv4_rx(), and addresses are transcribed by ipv4_ntoa().
+ */
+struct net_protocol ipv4_protocol __net_protocol = {
+ .name = "IP",
+ .net_proto = htons ( ETH_P_IP ),
+ .net_addr_len = sizeof ( struct in_addr ),
+ .rx = ipv4_rx,
+ .ntoa = ipv4_ntoa,
+};
+
+/** IPv4 TCPIP net protocol
+ *
+ * Binds the AF_INET address family to ipv4_tx() for transmission.
+ */
+struct tcpip_net_protocol ipv4_tcpip_protocol __tcpip_net_protocol = {
+ .name = "IPv4",
+ .sa_family = AF_INET,
+ .tx = ipv4_tx,
+};
+
+/** IPv4 ARP protocol
+ *
+ * Lets ARP ask, via ipv4_arp_check(), whether an IPv4 address is
+ * assigned to a given network device.
+ */
+struct arp_net_protocol ipv4_arp_protocol __arp_net_protocol = {
+ .net_protocol = &ipv4_protocol,
+ .check = ipv4_arp_check,
+};
+
+/******************************************************************************
+ *
+ * Settings
+ *
+ ******************************************************************************
+ */
+
+/** IPv4 address setting
+ *
+ * Read per network device by ipv4_create_routes(); a device with no
+ * (or a zero) address gets no route.
+ */
+struct setting ip_setting __setting = {
+ .name = "ip",
+ .description = "IPv4 address",
+ .tag = DHCP_EB_YIADDR,
+ .type = &setting_type_ipv4,
+};
+
+/** IPv4 subnet mask setting
+ *
+ * When present, overrides the class-based default netmask computed
+ * by ipv4_create_routes().
+ */
+struct setting netmask_setting __setting = {
+ .name = "netmask",
+ .description = "IPv4 subnet mask",
+ .tag = DHCP_SUBNET_MASK,
+ .type = &setting_type_ipv4,
+};
+
+/** Default gateway setting
+ *
+ * When absent, ipv4_create_routes() uses INADDR_NONE, i.e. a route
+ * without a gateway.
+ */
+struct setting gateway_setting __setting = {
+ .name = "gateway",
+ .description = "Default gateway",
+ .tag = DHCP_ROUTERS,
+ .type = &setting_type_ipv4,
+};
+
+/**
+ * Create IPv4 routing table based on configured settings
+ *
+ * @ret rc		Return status code
+ *
+ * Discards the whole routing table and rebuilds it with one entry
+ * for every network device that has a non-zero "ip" setting.
+ * Returns -ENOMEM as soon as an entry cannot be allocated.
+ */
+static int ipv4_create_routes ( void ) {
+	struct ipv4_miniroute *route;
+	struct ipv4_miniroute *next;
+	struct net_device *netdev;
+	struct settings *settings;
+	struct in_addr address = { 0 };
+	struct in_addr netmask = { 0 };
+	struct in_addr gateway = { INADDR_NONE };
+	uint32_t host_addr;
+
+	/* Start from an empty table */
+	list_for_each_entry_safe ( route, next, &ipv4_miniroutes, list )
+		del_ipv4_miniroute ( route );
+
+	/* One route per configured network device */
+	for_each_netdev ( netdev ) {
+		settings = netdev_settings ( netdev );
+
+		/* Devices without an address are skipped */
+		address.s_addr = 0;
+		fetch_ipv4_setting ( settings, &ip_setting, &address );
+		if ( address.s_addr == 0 )
+			continue;
+
+		/* Class-based netmask as the default ... */
+		host_addr = ntohl ( address.s_addr );
+		if ( IN_CLASSA ( host_addr ) ) {
+			netmask.s_addr = htonl ( IN_CLASSA_NET );
+		} else if ( IN_CLASSB ( host_addr ) ) {
+			netmask.s_addr = htonl ( IN_CLASSB_NET );
+		} else if ( IN_CLASSC ( host_addr ) ) {
+			netmask.s_addr = htonl ( IN_CLASSC_NET );
+		} else {
+			netmask.s_addr = 0;
+		}
+		/* ... unless a subnet mask is configured explicitly */
+		fetch_ipv4_setting ( settings, &netmask_setting, &netmask );
+
+		/* Default gateway is optional */
+		gateway.s_addr = INADDR_NONE;
+		fetch_ipv4_setting ( settings, &gateway_setting, &gateway );
+
+		/* Install the route */
+		route = add_ipv4_miniroute ( netdev, address,
+					     netmask, gateway );
+		if ( route == NULL )
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/** IPv4 settings applicator
+ *
+ * Registers ipv4_create_routes() as the "apply" method, which
+ * rebuilds the IPv4 routing table from the per-device settings.
+ */
+struct settings_applicator ipv4_settings_applicator __settings_applicator = {
+ .apply = ipv4_create_routes,
+};
diff --git a/gpxe/src/net/ipv6.c b/gpxe/src/net/ipv6.c
new file mode 100644
index 00000000..3407d538
--- /dev/null
+++ b/gpxe/src/net/ipv6.c
@@ -0,0 +1,380 @@
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <byteswap.h>
+#include <gpxe/in.h>
+#include <gpxe/ip6.h>
+#include <gpxe/ndp.h>
+#include <gpxe/list.h>
+#include <gpxe/icmp6.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/socket.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/if_ether.h>
+
+/* Tentative definition; the initialised IPv6 protocol descriptor
+ * appears further down in this file.
+ */
+struct net_protocol ipv6_protocol;
+
+/* Unspecified IP6 address (all zeroes)
+ *
+ * Used in this file as the "no gateway" marker for miniroutes.
+ */
+static struct in6_addr ip6_none = {
+ .in6_u.u6_addr32 = { 0,0,0,0 }
+};
+
+/** An IPv6 routing table entry */
+struct ipv6_miniroute {
+ /* Links this entry into the list of miniroutes */
+ struct list_head list;
+
+ /* Network device; a reference is held while the entry exists */
+ struct net_device *netdev;
+
+ /* Destination prefix */
+ struct in6_addr prefix;
+ /* Prefix length
+  *
+  * NOTE(review): ipv6_tx() passes this to memcmp() as a byte
+  * count; confirm whether callers supply bits or bytes.
+  */
+ int prefix_len;
+ /* IPv6 address of interface; used as the source address on TX */
+ struct in6_addr address;
+ /* Gateway address (the unspecified address means no gateway) */
+ struct in6_addr gateway;
+};
+
+/** List of IPv6 miniroutes
+ *
+ * add_ipv6_miniroute() puts entries without a gateway at the head and
+ * entries with a gateway at the tail.
+ */
+static LIST_HEAD ( miniroutes );
+
+/**
+ * Add IPv6 minirouting table entry
+ *
+ * @v netdev		Network device
+ * @v prefix		Destination prefix
+ * @v prefix_len	Length of the destination prefix
+ * @v address		Address of the interface
+ * @v gateway		Gateway address (or ::0 for no gateway)
+ * @ret miniroute	Routing table entry, or NULL
+ *
+ * The new entry holds a reference to @c netdev.  Entries with a
+ * gateway are appended to the table; entries without one are
+ * inserted at the front.
+ */
+static struct ipv6_miniroute * __malloc
+add_ipv6_miniroute ( struct net_device *netdev, struct in6_addr prefix,
+		     int prefix_len, struct in6_addr address,
+		     struct in6_addr gateway ) {
+	struct ipv6_miniroute *entry;
+
+	entry = malloc ( sizeof ( *entry ) );
+	if ( entry == NULL )
+		return NULL;
+
+	/* Fill in the routing information */
+	entry->netdev = netdev_get ( netdev );
+	entry->prefix = prefix;
+	entry->prefix_len = prefix_len;
+	entry->address = address;
+	entry->gateway = gateway;
+
+	/* On-link routes go first, gatewayed routes go last */
+	if ( IP6_EQUAL ( gateway, ip6_none ) ) {
+		list_add ( &entry->list, &miniroutes );
+	} else {
+		list_add_tail ( &entry->list, &miniroutes );
+	}
+
+	return entry;
+}
+
+/**
+ * Delete IPv6 minirouting table entry
+ *
+ * @v miniroute		Routing table entry
+ *
+ * Unlinks the entry, drops its network device reference and frees it.
+ */
+static void del_ipv6_miniroute ( struct ipv6_miniroute *miniroute ) {
+	struct net_device *netdev = miniroute->netdev;
+
+	list_del ( &miniroute->list );
+	netdev_put ( netdev );
+	free ( miniroute );
+}
+
+/**
+ * Add IPv6 interface
+ *
+ * @v netdev		Network device
+ * @v prefix		Destination prefix
+ * @v prefix_len	Length of the destination prefix
+ * @v address		Address of the interface
+ * @v gateway		Gateway address (or ::0 for no gateway)
+ * @ret rc		0 on success, -ENOMEM if no entry could be allocated
+ *
+ * Any address previously configured for @c netdev is removed first.
+ */
+int add_ipv6_address ( struct net_device *netdev, struct in6_addr prefix,
+		       int prefix_len, struct in6_addr address,
+		       struct in6_addr gateway ) {
+	/* Replace, rather than add to, the existing configuration */
+	del_ipv6_address ( netdev );
+
+	if ( add_ipv6_miniroute ( netdev, prefix, prefix_len, address,
+				  gateway ) == NULL )
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * Remove IPv6 interface
+ *
+ * @v netdev		Network device
+ *
+ * Deletes the first routing table entry bound to @c netdev, if any.
+ */
+void del_ipv6_address ( struct net_device *netdev ) {
+	struct ipv6_miniroute *entry;
+
+	list_for_each_entry ( entry, &miniroutes, list ) {
+		if ( entry->netdev != netdev )
+			continue;
+		/* Stop iterating immediately: the entry is gone */
+		del_ipv6_miniroute ( entry );
+		return;
+	}
+}
+
+/**
+ * Add IPv6 pseudo-header checksum to existing checksum
+ *
+ * @v iobuf		I/O buffer, starting with the IPv6 header
+ * @v csum		Existing checksum
+ * @ret csum		Updated checksum
+ *
+ * Builds the pseudo-header from the addresses and next-header value
+ * in the IPv6 header, with the length taken as everything in the
+ * buffer beyond that header.
+ */
+static uint16_t ipv6_tx_csum ( struct io_buffer *iobuf, uint16_t csum ) {
+	struct ip6_header *ip6hdr = iobuf->data;
+	size_t payload_len = ( iob_len ( iobuf ) - sizeof ( *ip6hdr ) );
+	struct ipv6_pseudo_header pshdr;
+
+	/* Assemble the pseudo-header; unused fields stay zero */
+	memset ( &pshdr, 0, sizeof ( pshdr ) );
+	pshdr.src = ip6hdr->src;
+	pshdr.dest = ip6hdr->dest;
+	pshdr.len = htons ( payload_len );
+	pshdr.nxt_hdr = ip6hdr->nxt_hdr;
+
+	/* Fold it into the running checksum */
+	return tcpip_continue_chksum ( csum, &pshdr, sizeof ( pshdr ) );
+}
+
+/**
+ * Dump IP6 header for debugging
+ *
+ * @v ip6hdr		IPv6 header
+ *
+ * inet6_ntoa() formats into a single static buffer, so the source
+ * and destination addresses are printed by separate DBG() calls;
+ * formatting both in one call would print the same address twice.
+ */
+void ipv6_dump ( struct ip6_header *ip6hdr ) {
+	DBG ( "IP6 %p src %s ", ip6hdr, inet6_ntoa ( ip6hdr->src ) );
+	DBG ( "dest %s nxt_hdr %d len %d\n", inet6_ntoa ( ip6hdr->dest ),
+	      ip6hdr->nxt_hdr, ntohs ( ip6hdr->payload_len ) );
+}
+
+/**
+ * Transmit IP6 packet
+ *
+ * @v iobuf		I/O buffer
+ * @v tcpip		TCP/IP protocol
+ * @v st_dest		Destination socket address
+ * @v netdev		Network device to use if no route found, or NULL
+ * @v trans_csum	Transport-layer checksum to complete, or NULL
+ * @ret rc		Return status code
+ *
+ * This function prepends the IPv6 header to the payload and transmits
+ * it.  The buffer is freed here if no device or no link-layer address
+ * is available.
+ */
+static int ipv6_tx ( struct io_buffer *iobuf,
+		     struct tcpip_protocol *tcpip,
+		     struct sockaddr_tcpip *st_dest,
+		     struct net_device *netdev,
+		     uint16_t *trans_csum ) {
+	struct sockaddr_in6 *dest = ( struct sockaddr_in6* ) st_dest;
+	struct in6_addr next_hop;
+	struct ipv6_miniroute *miniroute;
+	uint8_t ll_dest_buf[MAX_LL_ADDR_LEN];
+	const uint8_t *ll_dest = ll_dest_buf;
+	int rc;
+
+	/* Construct the IPv6 packet */
+	struct ip6_header *ip6hdr = iob_push ( iobuf, sizeof ( *ip6hdr ) );
+	memset ( ip6hdr, 0, sizeof ( *ip6hdr) );
+	ip6hdr->ver_traffic_class_flow_label = htonl ( 0x60000000 );//IP6_VERSION;
+	ip6hdr->payload_len = htons ( iob_len ( iobuf ) - sizeof ( *ip6hdr ) );
+	ip6hdr->nxt_hdr = tcpip->tcpip_proto;
+	ip6hdr->hop_limit = IP6_HOP_LIMIT; // 255
+	/* The destination must be in place before routing, checksum
+	 * calculation and transmission, all of which read it from
+	 * the header.
+	 */
+	ip6hdr->dest = dest->sin6_addr;
+
+	/* Determine the next hop address and interface
+	 *
+	 * TODO: Implement the routing table.
+	 *
+	 * NOTE(review): prefix_len is used as a memcmp() byte count
+	 * here; confirm the units callers supply.
+	 */
+	next_hop = dest->sin6_addr;
+	list_for_each_entry ( miniroute, &miniroutes, list ) {
+		if ( ( memcmp ( &ip6hdr->dest, &miniroute->prefix,
+				miniroute->prefix_len ) == 0 ) ||
+		     ( IP6_EQUAL ( miniroute->gateway, ip6_none ) ) ) {
+			netdev = miniroute->netdev;
+			ip6hdr->src = miniroute->address;
+			if ( ! ( IS_UNSPECIFIED ( miniroute->gateway ) ) ) {
+				next_hop = miniroute->gateway;
+			}
+			break;
+		}
+	}
+	/* No network interface identified */
+	if ( !netdev ) {
+		DBG ( "No route to host %s\n", inet6_ntoa ( ip6hdr->dest ) );
+		rc = -ENETUNREACH;
+		goto err;
+	}
+
+	/* Complete the transport layer checksum */
+	if ( trans_csum )
+		*trans_csum = ipv6_tx_csum ( iobuf, *trans_csum );
+
+	/* Print IPv6 header */
+	ipv6_dump ( ip6hdr );
+
+	/* Resolve link layer address */
+	if ( next_hop.in6_u.u6_addr8[0] == 0xff ) {
+		/* Multicast: derive the link-layer address directly
+		 * from the low four bytes of the IPv6 address.
+		 */
+		ll_dest_buf[0] = 0x33;
+		ll_dest_buf[1] = 0x33;
+		ll_dest_buf[2] = next_hop.in6_u.u6_addr8[12];
+		ll_dest_buf[3] = next_hop.in6_u.u6_addr8[13];
+		ll_dest_buf[4] = next_hop.in6_u.u6_addr8[14];
+		ll_dest_buf[5] = next_hop.in6_u.u6_addr8[15];
+	} else {
+		/* Unicast address needs to be resolved by NDP */
+		if ( ( rc = ndp_resolve ( netdev, &next_hop, &ip6hdr->src,
+					  ll_dest_buf ) ) != 0 ) {
+			DBG ( "No entry for %s\n", inet6_ntoa ( next_hop ) );
+			goto err;
+		}
+	}
+
+	/* Transmit packet */
+	return net_tx ( iobuf, netdev, &ipv6_protocol, ll_dest );
+
+ err:
+	free_iob ( iobuf );
+	return rc;
+}
+
+/**
+ * Process next IP6 header
+ *
+ * @v iobuf	I/O buffer
+ * @v nxt_hdr	Next header number
+ * @v src	Source socket address
+ * @v dest	Destination socket address
+ * @ret rc	Return status code
+ *
+ * Takes ownership of @c iobuf: it is either passed on to the
+ * transport layer or freed here.  Extension headers are not
+ * supported and are rejected with -ENOSYS.
+ *
+ * Refer http://www.iana.org/assignments/ipv6-parameters for the numbers
+ */
+static int ipv6_process_nxt_hdr ( struct io_buffer *iobuf, uint8_t nxt_hdr,
+		struct sockaddr_tcpip *src, struct sockaddr_tcpip *dest ) {
+	switch ( nxt_hdr ) {
+	case IP6_HOPBYHOP:
+	case IP6_ROUTING:
+	case IP6_FRAGMENT:
+	case IP6_AUTHENTICATION:
+	case IP6_DEST_OPTS:
+	case IP6_ESP:
+		DBG ( "Function not implemented for header %d\n", nxt_hdr );
+		/* Nobody else will consume the buffer */
+		free_iob ( iobuf );
+		return -ENOSYS;
+	case IP6_ICMP6:
+		break;
+	case IP6_NO_HEADER:
+		DBG ( "No next header\n" );
+		/* Nothing follows, so nothing to hand on */
+		free_iob ( iobuf );
+		return 0;
+	default:
+		/* Any other value is treated as a transport protocol */
+		break;
+	}
+	/* Next header is not a IPv6 extension header */
+	return tcpip_rx ( iobuf, nxt_hdr, src, dest, 0 /* fixme */ );
+}
+
+/**
+ * Process incoming IP6 packets
+ *
+ * @v iobuf		I/O buffer
+ * @v netdev		Network device
+ * @v ll_source		Link-layer source address
+ * @ret rc		Return status code
+ *
+ * Validates the fixed IPv6 header, trims the buffer to the declared
+ * payload length, strips the header and passes the payload on for
+ * next-header processing.  Invalid packets are freed and rejected.
+ */
+static int ipv6_rx ( struct io_buffer *iobuf,
+		     __unused struct net_device *netdev,
+		     __unused const void *ll_source ) {
+
+	struct ip6_header *ip6hdr = iobuf->data;
+	union {
+		struct sockaddr_in6 sin6;
+		struct sockaddr_tcpip st;
+	} src, dest;
+	uint16_t payload_len;
+
+	/* Sanity check */
+	if ( iob_len ( iobuf ) < sizeof ( *ip6hdr ) ) {
+		DBG ( "Packet too short (%zd bytes)\n", iob_len ( iobuf ) );
+		goto drop;
+	}
+
+	/* TODO: Verify checksum */
+
+	/* Print IP6 header for debugging */
+	ipv6_dump ( ip6hdr );
+
+	/* Check header version.  The field is in network byte order
+	 * (ipv6_tx() stores it with htonl()), so convert before
+	 * masking out the version nibble.
+	 */
+	if ( ( ntohl ( ip6hdr->ver_traffic_class_flow_label ) & 0xf0000000 )
+	     != 0x60000000 ) {
+		DBG ( "Invalid protocol version\n" );
+		goto drop;
+	}
+
+	/* Check the payload length against what actually follows the
+	 * header; the buffer is known to hold at least a full header,
+	 * so the subtraction cannot wrap.
+	 */
+	payload_len = ntohs ( ip6hdr->payload_len );
+	if ( payload_len > ( iob_len ( iobuf ) - sizeof ( *ip6hdr ) ) ) {
+		DBG ( "Inconsistent packet length (%d bytes)\n",
+		      payload_len );
+		goto drop;
+	}
+
+	/* Ignore the traffic class and flow control values */
+
+	/* Construct socket address */
+	memset ( &src, 0, sizeof ( src ) );
+	src.sin6.sin_family = AF_INET6;
+	src.sin6.sin6_addr = ip6hdr->src;
+	memset ( &dest, 0, sizeof ( dest ) );
+	dest.sin6.sin_family = AF_INET6;
+	dest.sin6.sin6_addr = ip6hdr->dest;
+
+	/* Trim trailing padding, then strip header */
+	iob_unput ( iobuf, iob_len ( iobuf ) - payload_len -
+		    sizeof ( *ip6hdr ) );
+	iob_pull ( iobuf, sizeof ( *ip6hdr ) );
+
+	/* Send it to the transport layer */
+	return ipv6_process_nxt_hdr ( iobuf, ip6hdr->nxt_hdr, &src.st, &dest.st );
+
+  drop:
+	DBG ( "Packet dropped\n" );
+	free_iob ( iobuf );
+	return -1;
+}
+
+/**
+ * Print a IP6 address as xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx
+ *
+ * @v in6	IPv6 address
+ * @ret string	Address as eight colon-separated hexadecimal groups
+ *
+ * Each group is assembled from two consecutive address bytes, most
+ * significant byte first, so the text does not depend on host byte
+ * order.  The result lives in a single static buffer and is
+ * overwritten by the next call.
+ */
+char * inet6_ntoa ( struct in6_addr in6 ) {
+	static char buf[40]; /* 8 groups of 4 digits, 7 colons, NUL */
+	const uint8_t *bytes = in6.in6_u.u6_addr8;
+	unsigned int group[8];
+	unsigned int i;
+
+	for ( i = 0 ; i < 8 ; i++ )
+		group[i] = ( ( bytes[ 2 * i ] << 8 ) | bytes[ ( 2 * i ) + 1 ] );
+
+	sprintf ( buf, "%x:%x:%x:%x:%x:%x:%x:%x", group[0], group[1],
+		  group[2], group[3], group[4], group[5], group[6],
+		  group[7] );
+	return buf;
+}
+
+/**
+ * Transcribe IPv6 address
+ *
+ * @v net_addr	IPv6 address
+ * @ret string	Address as formatted by inet6_ntoa()
+ */
+static const char * ipv6_ntoa ( const void *net_addr ) {
+	const struct in6_addr *in6 = net_addr;
+
+	return inet6_ntoa ( *in6 );
+}
+
+/** IPv6 protocol
+ *
+ * Network-layer descriptor: packets of type ETH_P_IPV6 are delivered
+ * to ipv6_rx(), and addresses are transcribed by ipv6_ntoa().
+ */
+struct net_protocol ipv6_protocol __net_protocol = {
+ .name = "IPv6",
+ .net_proto = htons ( ETH_P_IPV6 ),
+ .net_addr_len = sizeof ( struct in6_addr ),
+ .rx = ipv6_rx,
+ .ntoa = ipv6_ntoa,
+};
+
+/** IPv6 TCPIP net protocol
+ *
+ * Binds the AF_INET6 address family to ipv6_tx() for transmission.
+ */
+struct tcpip_net_protocol ipv6_tcpip_protocol __tcpip_net_protocol = {
+ .name = "IPv6",
+ .sa_family = AF_INET6,
+ .tx = ipv6_tx,
+};
diff --git a/gpxe/src/net/ndp.c b/gpxe/src/net/ndp.c
new file mode 100644
index 00000000..8bea8b32
--- /dev/null
+++ b/gpxe/src/net/ndp.c
@@ -0,0 +1,180 @@
+#include <stdint.h>
+#include <string.h>
+#include <byteswap.h>
+#include <errno.h>
+#include <gpxe/if_ether.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/ndp.h>
+#include <gpxe/icmp6.h>
+#include <gpxe/ip6.h>
+#include <gpxe/netdevice.h>
+
+/** @file
+ *
+ * Neighbour Discovery Protocol
+ *
+ * This file implements address resolution as specified by the neighbour
+ * discovery protocol in RFC2461. This protocol is part of the IPv6 protocol
+ * family.
+ */
+
+/* A neighbour entry
+ *
+ * One slot of the fixed-size neighbour cache table.
+ */
+struct ndp_entry {
+ /** Target IP6 address */
+ struct in6_addr in6;
+ /** Link layer protocol of the device the entry was created for */
+ struct ll_protocol *ll_protocol;
+ /** Link-layer address (zeroed until an advertisement arrives) */
+ uint8_t ll_addr[MAX_LL_ADDR_LEN];
+ /** State of the neighbour entry - one of the NDP_STATE_XXX values */
+ int state;
+};
+
+/** Number of entries in the neighbour cache table */
+#define NUM_NDP_ENTRIES 4
+
+/** The neighbour cache table */
+static struct ndp_entry ndp_table[NUM_NDP_ENTRIES];
+
+/** One-past-the-end pointer for the neighbour cache table
+ *
+ * Parenthesised so that the expansion stays a single operand
+ * wherever it is used.
+ */
+#define ndp_table_end ( &ndp_table[NUM_NDP_ENTRIES] )
+
+/** Running count of entries added; selects the next slot to reuse */
+static unsigned int next_new_ndp_entry = 0;
+
+/**
+ * Find entry in the neighbour cache
+ *
+ * @v in6	IP6 address
+ * @ret ndp	Matching entry, or NULL if none
+ *
+ * Slots in state @c NDP_STATE_INVALID never match.
+ */
+static struct ndp_entry *
+ndp_find_entry ( struct in6_addr *in6 ) {
+	unsigned int slot;
+
+	for ( slot = 0 ; slot < NUM_NDP_ENTRIES ; slot++ ) {
+		struct ndp_entry *candidate = &ndp_table[slot];
+
+		if ( ! IP6_EQUAL ( ( *in6 ), candidate->in6 ) )
+			continue;
+		if ( candidate->state == NDP_STATE_INVALID )
+			continue;
+		return candidate;
+	}
+	return NULL;
+}
+
+/**
+ * Add NDP entry
+ *
+ * @v netdev	Network device
+ * @v in6	IP6 address
+ * @v ll_addr	Link-layer address, or NULL if not yet known
+ * @v state	State of the entry - one of the NDP_STATE_XXX values
+ *
+ * Slots are used round-robin, so adding an entry overwrites whatever
+ * previously occupied the chosen slot.
+ */
+static void
+add_ndp_entry ( struct net_device *netdev, struct in6_addr *in6,
+		void *ll_addr, int state ) {
+	struct ll_protocol *ll_protocol = netdev->ll_protocol;
+	unsigned int slot = ( next_new_ndp_entry % NUM_NDP_ENTRIES );
+	struct ndp_entry *entry = &ndp_table[slot];
+
+	next_new_ndp_entry++;
+
+	/* Populate the slot */
+	entry->ll_protocol = ll_protocol;
+	entry->in6 = *in6;
+	if ( ll_addr != NULL ) {
+		memcpy ( entry->ll_addr, ll_addr, ll_protocol->ll_addr_len );
+	} else {
+		memset ( entry->ll_addr, 0, ll_protocol->ll_addr_len );
+	}
+	entry->state = state;
+
+	DBG ( "New neighbour cache entry: IP6 %s => %s %s\n",
+	      inet6_ntoa ( entry->in6 ), ll_protocol->name,
+	      ll_protocol->ntoa ( entry->ll_addr ) );
+}
+
+/**
+ * Resolve the link-layer address
+ *
+ * @v netdev		Network device
+ * @v dest		Destination address
+ * @v src		Source address
+ * @ret dest_ll_addr	Destination link-layer address or NULL
+ * @ret rc		Status
+ *
+ * A reachable cache entry for @c dest is copied into
+ * @c dest_ll_addr and 0 is returned.  If an entry exists but is not
+ * yet reachable, -ENOENT is returned while the advertisement is
+ * awaited.  If there is no entry at all, an incomplete one is
+ * created, a neighbour solicitation is sent, and the result is the
+ * solicitation's error code or -ENOENT.
+ */
+int ndp_resolve ( struct net_device *netdev, struct in6_addr *dest,
+		  struct in6_addr *src, void *dest_ll_addr ) {
+	struct ll_protocol *ll_protocol = netdev->ll_protocol;
+	struct ndp_entry *entry;
+	int rc;
+
+	entry = ndp_find_entry ( dest );
+
+	/* Nothing cached: start resolution */
+	if ( entry == NULL ) {
+		DBG ( "Neighbour cache miss: IP6 %s\n", inet6_ntoa ( *dest ) );
+		add_ndp_entry ( netdev, dest, NULL, NDP_STATE_INCOMPLETE );
+		rc = icmp6_send_solicit ( netdev, src, dest );
+		if ( rc != 0 )
+			return rc;
+		return -ENOENT;
+	}
+
+	/* Resolution already under way */
+	if ( entry->state != NDP_STATE_REACHABLE ) {
+		DBG ( "Awaiting neighbour advertisement\n" );
+		return -ENOENT;
+	}
+
+	/* Usable entry */
+	DBG ( "Neighbour cache hit: IP6 %s => %s %s\n",
+	      inet6_ntoa ( *dest ), ll_protocol->name,
+	      ll_protocol->ntoa ( entry->ll_addr ) );
+	memcpy ( dest_ll_addr, entry->ll_addr, ll_protocol->ll_addr_len );
+	return 0;
+}
+
+/**
+ * Process neighbour advertisement
+ *
+ * @v iobuf	I/O buffer
+ * @v st_src	Source address
+ * @v st_dest	Destination address
+ * @ret rc	-EINVAL if the packet is too short, otherwise 0
+ *
+ * If the advertised target has a cache entry, the entry's link-layer
+ * address is updated and it is marked reachable.  Advertisements for
+ * unknown targets are logged and ignored.
+ */
+int ndp_process_advert ( struct io_buffer *iobuf, struct sockaddr_tcpip *st_src __unused,
+			 struct sockaddr_tcpip *st_dest __unused ) {
+	struct neighbour_advert *advert = iobuf->data;
+	struct ndp_entry *entry;
+
+	/* Sanity check */
+	if ( iob_len ( iobuf ) < sizeof ( *advert ) ) {
+		DBG ( "Packet too short (%zd bytes)\n", iob_len ( iobuf ) );
+		return -EINVAL;
+	}
+
+	assert ( advert->code == 0 );
+	assert ( advert->flags & ICMP6_FLAGS_SOLICITED );
+	assert ( advert->opt_type == 2 );
+
+	/* Only targets we already know about are of interest */
+	entry = ndp_find_entry ( &advert->target );
+	if ( entry == NULL )
+		goto unsolicited;
+
+	assert ( advert->opt_len ==
+		 ( ( 2 + entry->ll_protocol->ll_addr_len ) / 8 ) );
+
+	if ( ! IP6_EQUAL ( entry->in6, advert->target ) )
+		goto unsolicited;
+
+	/* Record the advertised link-layer address */
+	memcpy ( entry->ll_addr, advert->opt_ll_addr,
+		 entry->ll_protocol->ll_addr_len );
+	entry->state = NDP_STATE_REACHABLE;
+	return 0;
+
+ unsolicited:
+	DBG ( "Unsolicited advertisement (dropping packet)\n" );
+	return 0;
+}
diff --git a/gpxe/src/net/netdev_settings.c b/gpxe/src/net/netdev_settings.c
new file mode 100644
index 00000000..44aca7d8
--- /dev/null
+++ b/gpxe/src/net/netdev_settings.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2008 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <string.h>
+#include <errno.h>
+#include <gpxe/dhcp.h>
+#include <gpxe/settings.h>
+#include <gpxe/netdevice.h>
+
+/** @file
+ *
+ * Network device configuration settings
+ *
+ */
+
+/** Network device named settings
+ *
+ * The "mac" setting is stored as hex data. netdev_store() and
+ * netdev_fetch() map it directly onto the device's link-layer
+ * address rather than onto the generic settings store.
+ */
+struct setting mac_setting __setting = {
+ .name = "mac",
+ .description = "MAC address",
+ .type = &setting_type_hex,
+};
+
+/**
+ * Store value of network device setting
+ *
+ * @v settings		Settings block
+ * @v setting		Setting to store
+ * @v data		Setting data, or NULL to clear setting
+ * @v len		Length of setting data
+ * @ret rc		Return status code
+ *
+ * The "mac" setting is written straight into the device's link-layer
+ * address and must be exactly one link-layer address long; every
+ * other setting is passed on to simple_settings_store().
+ */
+static int netdev_store ( struct settings *settings, struct setting *setting,
+			  const void *data, size_t len ) {
+	struct net_device *netdev;
+
+	/* Anything that is not the MAC address uses the generic store */
+	if ( setting_cmp ( setting, &mac_setting ) != 0 )
+		return simple_settings_store ( settings, setting, data, len );
+
+	/* Recover the owning network device from its settings block */
+	netdev = container_of ( settings, struct net_device,
+				settings.settings );
+
+	/* Only a complete link-layer address is acceptable */
+	if ( len != netdev->ll_protocol->ll_addr_len )
+		return -EINVAL;
+
+	memcpy ( netdev->ll_addr, data, len );
+	return 0;
+}
+
+/**
+ * Fetch value of network device setting
+ *
+ * @v settings		Settings block
+ * @v setting		Setting to fetch
+ * @v data		Buffer to fill with setting data
+ * @v len		Length of buffer
+ * @ret len		Length of setting data, or the status returned by
+ *			simple_settings_fetch() for non-"mac" settings
+ *
+ * For the "mac" setting, at most @c len bytes of the link-layer
+ * address are copied into the buffer, and the full link-layer
+ * address length is returned regardless of how much was copied.
+ */
+static int netdev_fetch ( struct settings *settings, struct setting *setting,
+			  void *data, size_t len ) {
+	struct net_device *netdev;
+
+	/* Anything that is not the MAC address uses the generic store */
+	if ( setting_cmp ( setting, &mac_setting ) != 0 )
+		return simple_settings_fetch ( settings, setting, data, len );
+
+	/* Recover the owning network device from its settings block */
+	netdev = container_of ( settings, struct net_device,
+				settings.settings );
+
+	/* Never copy more than one link-layer address */
+	if ( len > netdev->ll_protocol->ll_addr_len )
+		len = netdev->ll_protocol->ll_addr_len;
+	memcpy ( data, netdev->ll_addr, len );
+
+	return netdev->ll_protocol->ll_addr_len;
+}
+
+/** Network device configuration settings operations
+ *
+ * Routes stores and fetches through netdev_store() and
+ * netdev_fetch().
+ */
+struct settings_operations netdev_settings_operations = {
+ .store = netdev_store,
+ .fetch = netdev_fetch,
+};
diff --git a/gpxe/src/net/netdevice.c b/gpxe/src/net/netdevice.c
new file mode 100644
index 00000000..6875b3ba
--- /dev/null
+++ b/gpxe/src/net/netdevice.c
@@ -0,0 +1,513 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <byteswap.h>
+#include <string.h>
+#include <errno.h>
+#include <gpxe/if_ether.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/tables.h>
+#include <gpxe/process.h>
+#include <gpxe/init.h>
+#include <gpxe/device.h>
+#include <gpxe/netdevice.h>
+
+/** @file
+ *
+ * Network device management
+ *
+ */
+
+/** Registered network-layer protocols
+ *
+ * Zero-length start and end markers for the net_protocols table;
+ * net_rx() walks every entry between the two markers.
+ */
+static struct net_protocol net_protocols[0]
+ __table_start ( struct net_protocol, net_protocols );
+static struct net_protocol net_protocols_end[0]
+ __table_end ( struct net_protocol, net_protocols );
+
+/** List of network devices
+ *
+ * Devices are added by register_netdev() and removed by
+ * unregister_netdev().
+ */
+struct list_head net_devices = LIST_HEAD_INIT ( net_devices );
+
+/**
+ * Transmit raw packet via network device
+ *
+ * @v netdev		Network device
+ * @v iobuf		I/O buffer
+ * @ret rc		Return status code
+ *
+ * The buffer is placed on the device's TX queue before anything
+ * else happens, so that a failed transmission can be completed (and
+ * the buffer freed) via netdev_tx_complete_err().  This function
+ * takes ownership of the I/O buffer.
+ */
+int netdev_tx ( struct net_device *netdev, struct io_buffer *iobuf ) {
+	int rc;
+
+	DBGC ( netdev, "NETDEV %p transmitting %p (%p+%zx)\n",
+	       netdev, iobuf, iobuf->data, iob_len ( iobuf ) );
+
+	/* Queue first: the completion handler dequeues unconditionally */
+	list_add_tail ( &iobuf->list, &netdev->tx_queue );
+
+	/* Only an open device may be asked to transmit */
+	if ( netdev->state & NETDEV_OPEN ) {
+		rc = netdev->op->transmit ( netdev, iobuf );
+	} else {
+		rc = -ENETUNREACH;
+	}
+
+	/* Any failure completes the packet with the error status */
+	if ( rc != 0 )
+		netdev_tx_complete_err ( netdev, iobuf, rc );
+
+	return rc;
+}
+
+/**
+ * Complete network transmission
+ *
+ * @v netdev		Network device
+ * @v iobuf		I/O buffer
+ * @v rc		Packet status code
+ *
+ * Records the outcome in the device statistics, then removes the
+ * packet from the TX queue and frees it.  The packet must currently
+ * be in the network device's TX queue.
+ */
+void netdev_tx_complete_err ( struct net_device *netdev,
+			      struct io_buffer *iobuf, int rc ) {
+
+	/* Account for the packet as either a failure or a success */
+	if ( rc != 0 ) {
+		netdev->stats.tx_err++;
+		DBGC ( netdev, "NETDEV %p transmission %p failed: %s\n",
+		       netdev, iobuf, strerror ( rc ) );
+	} else {
+		netdev->stats.tx_ok++;
+		DBGC ( netdev, "NETDEV %p transmission %p complete\n",
+		       netdev, iobuf );
+	}
+
+	/* A queued buffer always has both list links set */
+	assert ( iobuf->list.next != NULL );
+	assert ( iobuf->list.prev != NULL );
+
+	/* Unlink from the TX queue and release the buffer */
+	list_del ( &iobuf->list );
+	free_iob ( iobuf );
+}
+
+/**
+ * Complete network transmission
+ *
+ * @v netdev		Network device
+ * @v rc		Packet status code
+ *
+ * Completes the packet at the head of the TX queue (i.e. the oldest
+ * outstanding packet).  Does nothing if the queue is empty.
+ */
+void netdev_tx_complete_next_err ( struct net_device *netdev, int rc ) {
+	struct io_buffer *oldest;
+
+	/* The loop body runs at most once, for the first queue entry */
+	list_for_each_entry ( oldest, &netdev->tx_queue, list ) {
+		netdev_tx_complete_err ( netdev, oldest, rc );
+		break;
+	}
+}
+
+/**
+ * Flush device's transmit queue
+ *
+ * @v netdev		Network device
+ *
+ * Completes every queued packet with status -ECANCELED.
+ */
+static void netdev_tx_flush ( struct net_device *netdev ) {
+
+	/* Cancel packets one at a time until the queue drains */
+	for ( ; ; ) {
+		if ( list_empty ( &netdev->tx_queue ) )
+			break;
+		netdev_tx_complete_next_err ( netdev, -ECANCELED );
+	}
+}
+
+/**
+ * Add packet to receive queue
+ *
+ * @v netdev		Network device
+ * @v iobuf		I/O buffer (dereferenced here, so must not be NULL)
+ *
+ * The packet is appended to the network device's RX queue and
+ * counted as successfully received.  This function takes ownership
+ * of the I/O buffer.
+ */
+void netdev_rx ( struct net_device *netdev, struct io_buffer *iobuf ) {
+
+	DBGC ( netdev, "NETDEV %p received %p (%p+%zx)\n",
+	       netdev, iobuf, iobuf->data, iob_len ( iobuf ) );
+
+	/* Count the packet as received */
+	netdev->stats.rx_ok++;
+
+	/* Append to the tail of the RX queue */
+	list_add_tail ( &iobuf->list, &netdev->rx_queue );
+}
+
+/**
+ * Discard received packet
+ *
+ * @v netdev		Network device
+ * @v iobuf		I/O buffer, or NULL
+ * @v rc		Packet status code
+ *
+ * Counts an RX error and hands the buffer to free_iob().  This
+ * function takes ownership of the I/O buffer.  @c iobuf may be NULL
+ * if, for example, the net device wishes to report an error due to
+ * being unable to allocate an I/O buffer.
+ */
+void netdev_rx_err ( struct net_device *netdev,
+		     struct io_buffer *iobuf, int rc ) {
+
+	DBGC ( netdev, "NETDEV %p failed to receive %p: %s\n",
+	       netdev, iobuf, strerror ( rc ) );
+
+	/* Count the failure */
+	netdev->stats.rx_err++;
+
+	/* Release the buffer */
+	free_iob ( iobuf );
+}
+
+/**
+ * Poll for completed and received packets on network device
+ *
+ * @v netdev		Network device
+ *
+ * Invokes the driver's poll operation, but only while the device is
+ * open.  Any received packets will be added to the RX packet queue
+ * via netdev_rx().
+ */
+void netdev_poll ( struct net_device *netdev ) {
+
+	/* A closed device is never polled */
+	if ( ! ( netdev->state & NETDEV_OPEN ) )
+		return;
+
+	netdev->op->poll ( netdev );
+}
+
+/**
+ * Remove packet from device's receive queue
+ *
+ * @v netdev		Network device
+ * @ret iobuf		I/O buffer, or NULL
+ *
+ * Unlinks and returns the packet at the head of the RX queue, or
+ * returns NULL if the queue is empty.  Ownership of the packet is
+ * transferred to the caller.
+ */
+struct io_buffer * netdev_rx_dequeue ( struct net_device *netdev ) {
+	struct io_buffer *iobuf;
+	struct io_buffer *head = NULL;
+
+	/* The loop body runs at most once, for the first queue entry */
+	list_for_each_entry ( iobuf, &netdev->rx_queue, list ) {
+		list_del ( &iobuf->list );
+		head = iobuf;
+		break;
+	}
+
+	return head;
+}
+
+/**
+ * Flush device's receive queue
+ *
+ * @v netdev		Network device
+ *
+ * Every queued packet is discarded via netdev_rx_err() with status
+ * -ECANCELED.
+ */
+static void netdev_rx_flush ( struct net_device *netdev ) {
+	struct io_buffer *pending;
+
+	/* Dequeue and discard until nothing is left */
+	for ( ; ; ) {
+		pending = netdev_rx_dequeue ( netdev );
+		if ( ! pending )
+			break;
+		netdev_rx_err ( netdev, pending, -ECANCELED );
+	}
+}
+
+/**
+ * Free network device
+ *
+ * @v refcnt Network device reference counter
+ *
+ * Installed as the reference counter's free method by
+ * alloc_netdev(). Any packets still sitting in the TX and RX queues
+ * are discarded before the device memory is released.
+ */
+static void free_netdev ( struct refcnt *refcnt ) {
+ struct net_device *netdev =
+ container_of ( refcnt, struct net_device, refcnt );
+
+ /* Drain both queues so that no queued buffer is left behind */
+ netdev_tx_flush ( netdev );
+ netdev_rx_flush ( netdev );
+ free ( netdev );
+}
+
+/**
+ * Allocate network device
+ *
+ * @v priv_size		Size of private data area (net_device::priv)
+ * @ret netdev		Network device, or NULL
+ *
+ * Allocates one zeroed region holding the network device followed
+ * immediately by its private data area, and initialises the
+ * reference counter, packet queues and settings block.
+ */
+struct net_device * alloc_netdev ( size_t priv_size ) {
+	struct net_device *netdev;
+
+	/* Device structure and private area share one allocation */
+	netdev = zalloc ( sizeof ( *netdev ) + priv_size );
+	if ( ! netdev )
+		return NULL;
+
+	netdev->refcnt.free = free_netdev;
+	INIT_LIST_HEAD ( &netdev->tx_queue );
+	INIT_LIST_HEAD ( &netdev->rx_queue );
+	settings_init ( netdev_settings ( netdev ),
+			&netdev_settings_operations, &netdev->refcnt,
+			netdev->name );
+
+	/* Private data starts directly after the device structure */
+	netdev->priv = ( ( void * ) ( netdev + 1 ) );
+
+	return netdev;
+}
+
+/**
+ * Register network device
+ *
+ * @v netdev		Network device
+ * @ret rc		Return status code
+ *
+ * Names the device "net<N>" using a counter that increases on every
+ * call, registers its settings block, and appends it to the list of
+ * network devices while taking a reference on it.
+ */
+int register_netdev ( struct net_device *netdev ) {
+	static unsigned int ifindex = 0;
+	int rc;
+
+	/* Assign the next free device name */
+	snprintf ( netdev->name, sizeof ( netdev->name ), "net%d",
+		   ifindex++ );
+
+	/* Register per-netdev configuration settings */
+	rc = register_settings ( netdev_settings ( netdev ), NULL );
+	if ( rc != 0 ) {
+		DBGC ( netdev, "NETDEV %p could not register settings: %s\n",
+		       netdev, strerror ( rc ) );
+		return rc;
+	}
+
+	/* The device list holds its own reference to the device */
+	netdev_get ( netdev );
+	list_add_tail ( &netdev->list, &net_devices );
+	DBGC ( netdev, "NETDEV %p registered as %s (phys %s hwaddr %s)\n",
+	       netdev, netdev->name, netdev->dev->name,
+	       netdev_hwaddr ( netdev ) );
+
+	return 0;
+}
+
+/**
+ * Open network device
+ *
+ * @v netdev		Network device
+ * @ret rc		Return status code
+ *
+ * Opening a device that is already open succeeds without calling
+ * into the driver.
+ */
+int netdev_open ( struct net_device *netdev ) {
+	int rc;
+
+	/* Only a closed device needs any work */
+	if ( ! ( netdev->state & NETDEV_OPEN ) ) {
+
+		DBGC ( netdev, "NETDEV %p opening\n", netdev );
+
+		/* Ask the driver to open the device */
+		rc = netdev->op->open ( netdev );
+		if ( rc != 0 )
+			return rc;
+
+		/* Record that the device is now open */
+		netdev->state |= NETDEV_OPEN;
+	}
+
+	return 0;
+}
+
+/**
+ * Close network device
+ *
+ * @v netdev		Network device
+ *
+ * Closing a device that is not open does nothing.  Otherwise the
+ * driver's close operation is called, both packet queues are
+ * flushed, and the open flag is cleared.
+ */
+void netdev_close ( struct net_device *netdev ) {
+
+	/* Only an open device needs any work */
+	if ( netdev->state & NETDEV_OPEN ) {
+
+		DBGC ( netdev, "NETDEV %p closing\n", netdev );
+
+		/* Ask the driver to close the device */
+		netdev->op->close ( netdev );
+
+		/* Discard everything still queued */
+		netdev_tx_flush ( netdev );
+		netdev_rx_flush ( netdev );
+
+		/* Record that the device is now closed */
+		netdev->state &= ~NETDEV_OPEN;
+	}
+}
+
+/**
+ * Unregister network device
+ *
+ * @v netdev Network device
+ *
+ * Closes the device if necessary, unregisters its settings block,
+ * removes it from the list of network devices and drops one
+ * reference (the counterpart of the netdev_get() performed by
+ * register_netdev()).
+ */
+void unregister_netdev ( struct net_device *netdev ) {
+
+ /* Ensure device is closed */
+ netdev_close ( netdev );
+
+ /* Unregister per-netdev configuration settings */
+ unregister_settings ( netdev_settings ( netdev ) );
+
+ /* Remove from device list */
+ list_del ( &netdev->list );
+ netdev_put ( netdev );
+ /* The arguments below use only the pointer value of netdev */
+ DBGC ( netdev, "NETDEV %p unregistered\n", netdev );
+}
+
+/** Enable or disable interrupts
+ *
+ * @v netdev Network device
+ * @v enable Interrupts should be enabled
+ *
+ * Passes the request straight to the driver's irq operation; no
+ * check of the NETDEV_OPEN flag is made here.
+ */
+void netdev_irq ( struct net_device *netdev, int enable ) {
+ netdev->op->irq ( netdev, enable );
+}
+
+/**
+ * Get network device by name
+ *
+ * @v name		Network device name
+ * @ret netdev		Network device, or NULL
+ *
+ * Returns the first registered device whose name is an exact match.
+ */
+struct net_device * find_netdev ( const char *name ) {
+	struct net_device *candidate;
+
+	list_for_each_entry ( candidate, &net_devices, list ) {
+		/* Skip devices with a different name */
+		if ( strcmp ( candidate->name, name ) != 0 )
+			continue;
+		return candidate;
+	}
+
+	/* No registered device has this name */
+	return NULL;
+}
+
+/**
+ * Get network device by bus type and bus location
+ *
+ * @v bus_type		Bus type
+ * @v location		Bus location
+ * @ret netdev		Network device, or NULL
+ *
+ * Returns the first registered device whose underlying device
+ * description matches both the bus type and the location.
+ */
+struct net_device * find_netdev_by_location ( unsigned int bus_type,
+					      unsigned int location ) {
+	struct net_device *candidate;
+
+	list_for_each_entry ( candidate, &net_devices, list ) {
+		/* Both fields of the description must match */
+		if ( candidate->dev->desc.bus_type != bus_type )
+			continue;
+		if ( candidate->dev->desc.location != location )
+			continue;
+		return candidate;
+	}
+
+	/* No registered device sits at this location */
+	return NULL;
+}
+
+/**
+ * Transmit network-layer packet
+ *
+ * @v iobuf I/O buffer
+ * @v netdev Network device
+ * @v net_protocol Network-layer protocol
+ * @v ll_dest Destination link-layer address
+ * @ret rc Return status code
+ *
+ * Polls the device and then hands the packet to the device's
+ * link-layer protocol, which prepends link-layer headers to the I/O
+ * buffer and transmits the packet via the specified network device.
+ * This function takes ownership of the I/O buffer. The return value
+ * is whatever the link-layer transmit method returns.
+ */
+int net_tx ( struct io_buffer *iobuf, struct net_device *netdev,
+ struct net_protocol *net_protocol, const void *ll_dest ) {
+
+ /* Force a poll on the netdevice to (potentially) clear any
+ * backed-up TX completions. This is needed on some network
+ * devices to avoid excessive losses due to small TX ring
+ * sizes.
+ */
+ netdev_poll ( netdev );
+
+ return netdev->ll_protocol->tx ( iobuf, netdev, net_protocol, ll_dest );
+}
+
+/**
+ * Process received network-layer packet
+ *
+ * @v iobuf		I/O buffer
+ * @v netdev		Network device
+ * @v net_proto		Network-layer protocol, in network-byte order
+ * @v ll_source		Source link-layer address
+ * @ret rc		Return status code
+ *
+ * The packet is passed to the first registered network-layer
+ * protocol whose protocol number matches.  If none matches, the
+ * buffer is freed and zero is returned.
+ */
+int net_rx ( struct io_buffer *iobuf, struct net_device *netdev,
+	     uint16_t net_proto, const void *ll_source ) {
+	struct net_protocol *candidate;
+
+	/* Scan the protocol table for a matching handler */
+	for ( candidate = net_protocols ; candidate < net_protocols_end ;
+	      candidate++ ) {
+		if ( candidate->net_proto != net_proto )
+			continue;
+		return candidate->rx ( iobuf, netdev, ll_source );
+	}
+
+	/* Nobody claimed the packet: discard it */
+	free_iob ( iobuf );
+	return 0;
+}
+
+/**
+ * Single-step the network stack
+ *
+ * @v process		Network stack process
+ *
+ * Every registered interface is polled, and at most one packet per
+ * interface is taken from its RX queue and handed to the link-layer
+ * protocol.
+ */
+static void net_step ( struct process *process __unused ) {
+	struct net_device *netdev;
+	struct io_buffer *iobuf;
+
+	list_for_each_entry ( netdev, &net_devices, list ) {
+
+		/* Let the driver collect new packets first */
+		netdev_poll ( netdev );
+
+		/* Process at most one received packet.  Give priority
+		 * to getting packets out of the NIC over processing
+		 * the received packets, because we advertise a window
+		 * that assumes that we can receive packets from the
+		 * NIC faster than they arrive.
+		 */
+		iobuf = netdev_rx_dequeue ( netdev );
+		if ( ! iobuf )
+			continue;
+
+		DBGC ( netdev, "NETDEV %p processing %p (%p+%zx)\n",
+		       netdev, iobuf, iobuf->data,
+		       iob_len ( iobuf ) );
+		netdev->ll_protocol->rx ( iobuf, netdev );
+	}
+}
+
+/** Networking stack process
+ *
+ * Each step of this process runs net_step().
+ */
+struct process net_process __permanent_process = {
+ .step = net_step,
+};
diff --git a/gpxe/src/net/nullnet.c b/gpxe/src/net/nullnet.c
new file mode 100644
index 00000000..7e199ce3
--- /dev/null
+++ b/gpxe/src/net/nullnet.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <errno.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/netdevice.h>
+
+/** @file
+ *
+ * Null network device
+ *
+ */
+
+/** Open null network device: always fails with -ENODEV */
+static int null_open ( struct net_device *netdev __unused ) {
+	return -ENODEV;
+}
+
+/** Close null network device: nothing to do */
+static void null_close ( struct net_device *netdev __unused ) {
+	/* Do nothing */
+}
+
+/** Transmit via null network device: always fails with -ENODEV */
+static int null_transmit ( struct net_device *netdev __unused,
+			   struct io_buffer *iobuf __unused ) {
+	return -ENODEV;
+}
+
+/** Poll null network device: there is never anything to report */
+static void null_poll ( struct net_device *netdev __unused ) {
+ /* Do nothing */
+}
+
+/** Enable or disable interrupts on null network device: ignored */
+static void null_irq ( struct net_device *netdev __unused,
+ int enable __unused ) {
+ /* Do nothing */
+}
+
+/** Null network device operations
+ *
+ * Opening and transmitting fail with -ENODEV; closing, polling and
+ * interrupt control do nothing.
+ */
+struct net_device_operations null_netdev_operations = {
+ .open = null_open,
+ .close = null_close,
+ .transmit = null_transmit,
+ .poll = null_poll,
+ .irq = null_irq,
+};
diff --git a/gpxe/src/net/rarp.c b/gpxe/src/net/rarp.c
new file mode 100644
index 00000000..bb5e6ad7
--- /dev/null
+++ b/gpxe/src/net/rarp.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <byteswap.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/if_ether.h>
+#include <gpxe/rarp.h>
+
+/** @file
+ *
+ * Reverse Address Resolution Protocol
+ *
+ */
+
+/**
+ * Process incoming RARP packets
+ *
+ * @v iobuf I/O buffer
+ * @v netdev Network device
+ * @v ll_source Link-layer source address
+ * @ret rc Return status code
+ *
+ * This is a dummy method which simply discards RARP packets: the
+ * buffer is freed and zero is always returned.
+ */
+static int rarp_rx ( struct io_buffer *iobuf,
+ struct net_device *netdev __unused,
+ const void *ll_source __unused ) {
+ free_iob ( iobuf );
+ return 0;
+}
+
+
+/**
+ * Transcribe RARP address
+ *
+ * @v net_addr RARP address
+ * @ret string "<RARP>"
+ *
+ * This operation is meaningless for the RARP protocol, so the
+ * address is ignored and a fixed placeholder string is returned.
+ */
+static const char * rarp_ntoa ( const void *net_addr __unused ) {
+ return "<RARP>";
+}
+
+/** RARP protocol
+ *
+ * Registers the discard-only receive handler rarp_rx() for
+ * ETH_P_RARP.
+ */
+struct net_protocol rarp_protocol __net_protocol = {
+ .name = "RARP",
+ .net_proto = htons ( ETH_P_RARP ),
+ .rx = rarp_rx,
+ .ntoa = rarp_ntoa,
+};
diff --git a/gpxe/src/net/retry.c b/gpxe/src/net/retry.c
new file mode 100644
index 00000000..90b89711
--- /dev/null
+++ b/gpxe/src/net/retry.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stddef.h>
+#include <gpxe/timer.h>
+#include <gpxe/list.h>
+#include <gpxe/process.h>
+#include <gpxe/init.h>
+#include <gpxe/retry.h>
+
+/** @file
+ *
+ * Retry timers
+ *
+ * A retry timer is a binary exponential backoff timer. It can be
+ * used to build automatic retransmission into network protocols.
+ *
+ * This implementation of the timer is designed to satisfy RFC 2988
+ * and therefore be usable as a TCP retransmission timer.
+ *
+ *
+ */
+
+/** Minimum timeout value
+ *
+ * start_timer() raises any smaller timeout to this value.
+ */
+#define MIN_TIMEOUT ( TICKS_PER_SEC / 4 )
+
+/** Limit after which the timeout will be deemed permanent
+ *
+ * timer_expired() caps the backed-off timeout at this value and
+ * reports failure to the expiry callback once the cap is exceeded.
+ */
+#define MAX_TIMEOUT ( 10 * TICKS_PER_SEC )
+
+/* The theoretical minimum that the algorithm in stop_timer() can
+ * adjust the timeout back down to is seven ticks, so set the minimum
+ * timeout to at least that value for the sake of consistency.
+ */
+#if MIN_TIMEOUT < 7
+#undef MIN_TIMEOUT
+#define MIN_TIMEOUT 7
+#endif
+
+/** List of running timers
+ *
+ * Timers are added by start_timer() and removed when stopped or
+ * expired.
+ */
+static LIST_HEAD ( timers );
+
+/**
+ * Start timer
+ *
+ * @v timer		Retry timer
+ *
+ * Starts (or restarts) the timer using its current timeout value,
+ * raised to MIN_TIMEOUT if it is smaller.  If stop_timer() is not
+ * called before the timer expires, the timer will be stopped and
+ * the timer's callback function will be called.
+ */
+void start_timer ( struct retry_timer *timer ) {
+
+	/* Enforce the lower bound on the timeout */
+	if ( timer->timeout < MIN_TIMEOUT )
+		timer->timeout = MIN_TIMEOUT;
+
+	/* A timer that is already running is already on the list */
+	if ( ! timer_running ( timer ) )
+		list_add ( &timer->list, &timers );
+
+	/* Measure the timeout from now */
+	timer->start = currticks();
+
+	DBG2 ( "Timer %p started at time %ld (expires at %ld)\n",
+	       timer, timer->start, ( timer->start + timer->timeout ) );
+}
+
+/**
+ * Start timer with no delay
+ *
+ * @v timer Retry timer
+ *
+ * This starts the timer running with a zero timeout value. The
+ * timeout is zeroed only after start_timer() has run, because
+ * start_timer() would otherwise raise it to the minimum timeout.
+ */
+void start_timer_nodelay ( struct retry_timer *timer ) {
+ start_timer ( timer );
+ timer->timeout = 0;
+}
+
+/**
+ * Stop timer
+ *
+ * @v timer Retry timer
+ *
+ * This stops the timer and updates the timer's timeout value.
+ * Stopping a timer that is not running has no effect. While the
+ * timer's expiry count is non-zero, each stop only decrements that
+ * count and leaves the timeout value untouched.
+ */
+void stop_timer ( struct retry_timer *timer ) {
+ unsigned long old_timeout = timer->timeout;
+ unsigned long now = currticks();
+ unsigned long runtime;
+
+ /* If timer was already stopped, do nothing */
+ if ( ! timer_running ( timer ) )
+ return;
+
+ /* Remove from the running list and work out how long it ran */
+ list_del ( &timer->list );
+ runtime = ( now - timer->start );
+ timer->start = 0;
+ DBG2 ( "Timer %p stopped at time %ld (ran for %ld)\n",
+ timer, now, runtime );
+
+ /* Update timer. Variables are:
+ *
+ * r = round-trip time estimate (i.e. runtime)
+ * t = timeout value (i.e. timer->timeout)
+ * s = smoothed round-trip time
+ *
+ * By choice, we set t = 4s, i.e. allow for four times the
+ * normal round-trip time to pass before retransmitting.
+ *
+ * We want to smooth according to s := ( 7 s + r ) / 8
+ *
+ * Since we don't actually store s, this reduces to
+ * t := ( 7 t / 8 ) + ( r / 2 )
+ *
+ */
+ if ( timer->count ) {
+ /* Recent expiries are still outstanding: count one off
+ * instead of feeding this run time into the estimate
+ */
+ timer->count--;
+ } else {
+ /* t := t - t/8 + r/2, using shifts for the divisions */
+ timer->timeout -= ( timer->timeout >> 3 );
+ timer->timeout += ( runtime >> 1 );
+ if ( timer->timeout != old_timeout ) {
+ DBG ( "Timer %p timeout updated to %ld\n",
+ timer, timer->timeout );
+ }
+ }
+}
+
+/**
+ * Handle expired timer
+ *
+ * @v timer		Retry timer
+ *
+ * Stops the timer, counts the expiry, doubles the timeout (capped at
+ * MAX_TIMEOUT) and then invokes the expiry callback.  The callback
+ * is told whether the doubled timeout exceeded the cap.
+ */
+static void timer_expired ( struct retry_timer *timer ) {
+	int fail;
+
+	/* Stop timer without performing RTT calculations */
+	DBG2 ( "Timer %p stopped at time %ld on expiry\n",
+	       timer, currticks() );
+	list_del ( &timer->list );
+	timer->start = 0;
+	timer->count++;
+
+	/* Double the timeout, noting whether the cap was exceeded */
+	timer->timeout <<= 1;
+	fail = ( timer->timeout > MAX_TIMEOUT );
+	if ( fail )
+		timer->timeout = MAX_TIMEOUT;
+	DBG ( "Timer %p timeout backed off to %ld\n",
+	      timer, timer->timeout );
+
+	/* Notify the owner last, after all timer state is updated */
+	timer->expired ( timer, fail );
+}
+
+/**
+ * Single-step the retry timer list
+ *
+ * @v process		Retry timer process
+ *
+ * Expires every running timer whose elapsed time has reached its
+ * timeout.  The current time is sampled once, before the scan.
+ */
+static void retry_step ( struct process *process __unused ) {
+	struct retry_timer *timer;
+	struct retry_timer *next;
+	unsigned long now = currticks();
+	unsigned long elapsed;
+
+	/* Safe iteration: expiry removes the timer from the list */
+	list_for_each_entry_safe ( timer, next, &timers, list ) {
+		elapsed = ( now - timer->start );
+		if ( elapsed < timer->timeout )
+			continue;
+		timer_expired ( timer );
+	}
+}
+
+/** Retry timer process
+ *
+ * Each step of this process runs retry_step().
+ */
+struct process retry_process __permanent_process = {
+ .step = retry_step,
+};
diff --git a/gpxe/src/net/tcp.c b/gpxe/src/net/tcp.c
new file mode 100644
index 00000000..da8e87b4
--- /dev/null
+++ b/gpxe/src/net/tcp.c
@@ -0,0 +1,1073 @@
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+#include <byteswap.h>
+#include <gpxe/timer.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/malloc.h>
+#include <gpxe/retry.h>
+#include <gpxe/refcnt.h>
+#include <gpxe/xfer.h>
+#include <gpxe/open.h>
+#include <gpxe/uri.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/tcp.h>
+
+/** @file
+ *
+ * TCP protocol
+ *
+ */
+
+/** A TCP connection
+ *
+ * Allocated by tcp_open() and linked into the tcp_conns list.
+ */
+struct tcp_connection {
+ /** Reference counter */
+ struct refcnt refcnt;
+ /** List of TCP connections */
+ struct list_head list;
+
+ /** Data transfer interface */
+ struct xfer_interface xfer;
+ /** Data transfer interface closed flag
+ *
+ * Set to 1 by tcp_close().
+ */
+ int xfer_closed;
+
+ /** Remote socket address */
+ struct sockaddr_tcpip peer;
+ /** Local port, in network byte order */
+ unsigned int local_port;
+
+ /** Current TCP state */
+ unsigned int tcp_state;
+ /** Previous TCP state
+ *
+ * Maintained only for debug messages
+ */
+ unsigned int prev_tcp_state;
+ /** Current sequence number
+ *
+ * Equivalent to SND.UNA in RFC 793 terminology.
+ */
+ uint32_t snd_seq;
+ /** Unacknowledged sequence count
+ *
+ * Equivalent to (SND.NXT-SND.UNA) in RFC 793 terminology.
+ */
+ uint32_t snd_sent;
+ /** Send window
+ *
+ * Equivalent to SND.WND in RFC 793 terminology
+ */
+ uint32_t snd_win;
+ /** Current acknowledgement number
+ *
+ * Equivalent to RCV.NXT in RFC 793 terminology.
+ */
+ uint32_t rcv_ack;
+ /** Most recent received timestamp
+ *
+ * Equivalent to TS.Recent in RFC 1323 terminology.
+ */
+ uint32_t ts_recent;
+ /** Timestamps enabled */
+ int timestamps;
+
+ /** Transmit queue
+ *
+ * Holds I/O buffers; tcp_close() frees any that remain when
+ * the connection is deleted.
+ */
+ struct list_head queue;
+ /** Retransmission timer
+ *
+ * Its expiry callback is set to tcp_expired() by tcp_open().
+ */
+ struct retry_timer timer;
+};
+
+/**
+ * List of registered TCP connections
+ *
+ * tcp_open() adds connections to this list, tcp_close() removes
+ * them, and tcp_bind() scans it for local ports already in use.
+ */
+static LIST_HEAD ( tcp_conns );
+
+/* Forward declarations (these names are used before they are defined) */
+static struct xfer_interface_operations tcp_xfer_operations;
+static void tcp_expired ( struct retry_timer *timer, int over );
+static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
+ uint32_t win );
+
+/**
+ * Name TCP state
+ *
+ * @v state		TCP state
+ * @ret name		Name of TCP state
+ *
+ * Any value that is not one of the known states is named "INVALID".
+ */
+static inline __attribute__ (( always_inline )) const char *
+tcp_state ( int state ) {
+	const char *name;
+
+	switch ( state ) {
+	case TCP_CLOSED:
+		name = "CLOSED";
+		break;
+	case TCP_LISTEN:
+		name = "LISTEN";
+		break;
+	case TCP_SYN_SENT:
+		name = "SYN_SENT";
+		break;
+	case TCP_SYN_RCVD:
+		name = "SYN_RCVD";
+		break;
+	case TCP_ESTABLISHED:
+		name = "ESTABLISHED";
+		break;
+	case TCP_FIN_WAIT_1:
+		name = "FIN_WAIT_1";
+		break;
+	case TCP_FIN_WAIT_2:
+		name = "FIN_WAIT_2";
+		break;
+	case TCP_CLOSING_OR_LAST_ACK:
+		name = "CLOSING/LAST_ACK";
+		break;
+	case TCP_TIME_WAIT:
+		name = "TIME_WAIT";
+		break;
+	case TCP_CLOSE_WAIT:
+		name = "CLOSE_WAIT";
+		break;
+	default:
+		name = "INVALID";
+		break;
+	}
+
+	return name;
+}
+
+/**
+ * Dump TCP state transition
+ *
+ * @v tcp		TCP connection
+ *
+ * Emits a debug message when the state differs from the state last
+ * seen by this function, and remembers the current state for the
+ * next comparison.
+ */
+static inline __attribute__ (( always_inline )) void
+tcp_dump_state ( struct tcp_connection *tcp ) {
+
+	/* Nothing to report or record unless the state has changed */
+	if ( tcp->prev_tcp_state != tcp->tcp_state ) {
+		DBGC ( tcp, "TCP %p transitioned from %s to %s\n", tcp,
+		       tcp_state ( tcp->prev_tcp_state ),
+		       tcp_state ( tcp->tcp_state ) );
+		tcp->prev_tcp_state = tcp->tcp_state;
+	}
+}
+
+/**
+ * Dump TCP flags
+ *
+ * @v tcp TCP connection
+ * @v flags TCP flags
+ *
+ * Emits one debug token per flag that is set, always in the order
+ * RST, SYN, PSH, FIN, ACK. No trailing newline is emitted.
+ */
+static inline __attribute__ (( always_inline )) void
+tcp_dump_flags ( struct tcp_connection *tcp, unsigned int flags ) {
+ if ( flags & TCP_RST )
+ DBGC ( tcp, " RST" );
+ if ( flags & TCP_SYN )
+ DBGC ( tcp, " SYN" );
+ if ( flags & TCP_PSH )
+ DBGC ( tcp, " PSH" );
+ if ( flags & TCP_FIN )
+ DBGC ( tcp, " FIN" );
+ if ( flags & TCP_ACK )
+ DBGC ( tcp, " ACK" );
+}
+
+/***************************************************************************
+ *
+ * Open and close
+ *
+ ***************************************************************************
+ */
+
+/**
+ * Bind TCP connection to local port
+ *
+ * @v tcp		TCP connection
+ * @v port		Local port number, in network-endian order
+ * @ret rc		Return status code
+ *
+ * If the port is 0, the connection is assigned an available port
+ * between 1024 and 65535.  The search resumes after the most
+ * recently assigned port and wraps around from 65535 to 1024, so
+ * that ports released by closed connections can be reused.  Returns
+ * -EADDRINUSE if the requested port (or every candidate port) is
+ * already bound to another connection.
+ */
+static int tcp_bind ( struct tcp_connection *tcp, unsigned int port ) {
+	struct tcp_connection *existing;
+	static uint16_t try_port = 1023;
+	unsigned int attempts;
+
+	/* If no port specified, find the first available port */
+	if ( ! port ) {
+		/* Try each port in 1024-65535 exactly once.  The
+		 * attempt count bounds the search; a loop conditioned
+		 * on try_port being non-zero would never run again
+		 * once the 16-bit counter had wrapped to zero.
+		 */
+		for ( attempts = ( 65536 - 1024 ) ; attempts ; attempts-- ) {
+			try_port++;
+			/* Skip the reserved range after wrapping */
+			if ( try_port < 1024 )
+				try_port = 1024;
+			if ( tcp_bind ( tcp, htons ( try_port ) ) == 0 )
+				return 0;
+		}
+		DBGC ( tcp, "TCP %p could not bind: no free ports\n", tcp );
+		return -EADDRINUSE;
+	}
+
+	/* Attempt bind to local port */
+	list_for_each_entry ( existing, &tcp_conns, list ) {
+		if ( existing->local_port == port ) {
+			DBGC ( tcp, "TCP %p could not bind: port %d in use\n",
+			       tcp, ntohs ( port ) );
+			return -EADDRINUSE;
+		}
+	}
+	tcp->local_port = port;
+
+	DBGC ( tcp, "TCP %p bound to port %d\n", tcp, ntohs ( port ) );
+	return 0;
+}
+
+/**
+ * Open a TCP connection
+ *
+ * @v xfer Data transfer interface
+ * @v peer Peer socket address
+ * @v local Local socket address, or NULL
+ * @ret rc Return status code
+ *
+ * Allocates a connection, binds it to the local port taken from
+ * @c local (or to an automatically chosen port when @c local is
+ * NULL), and starts the retransmission timer with no delay so that
+ * the initial SYN is sent from the timer's expiry path. Returns
+ * -ENOMEM if the connection cannot be allocated, or the error from
+ * tcp_bind().
+ */
+static int tcp_open ( struct xfer_interface *xfer, struct sockaddr *peer,
+ struct sockaddr *local ) {
+ struct sockaddr_tcpip *st_peer = ( struct sockaddr_tcpip * ) peer;
+ struct sockaddr_tcpip *st_local = ( struct sockaddr_tcpip * ) local;
+ struct tcp_connection *tcp;
+ unsigned int bind_port;
+ int rc;
+
+ /* Allocate and initialise structure */
+ tcp = zalloc ( sizeof ( *tcp ) );
+ if ( ! tcp )
+ return -ENOMEM;
+ DBGC ( tcp, "TCP %p allocated\n", tcp );
+ xfer_init ( &tcp->xfer, &tcp_xfer_operations, &tcp->refcnt );
+ /* Record CLOSED as the previous state so that the first state
+ * dump reports a transition out of CLOSED
+ */
+ tcp->prev_tcp_state = TCP_CLOSED;
+ tcp->tcp_state = TCP_STATE_SENT ( TCP_SYN );
+ tcp_dump_state ( tcp );
+ /* Initial sequence number is taken from random() */
+ tcp->snd_seq = random();
+ INIT_LIST_HEAD ( &tcp->queue );
+ tcp->timer.expired = tcp_expired;
+ memcpy ( &tcp->peer, st_peer, sizeof ( tcp->peer ) );
+
+ /* Bind to local port */
+ bind_port = ( st_local ? st_local->st_port : 0 );
+ if ( ( rc = tcp_bind ( tcp, bind_port ) ) != 0 )
+ goto err;
+
+ /* Start timer to initiate SYN */
+ start_timer_nodelay ( &tcp->timer );
+
+ /* Attach parent interface, transfer reference to connection
+ * list and return
+ */
+ xfer_plug_plug ( &tcp->xfer, xfer );
+ list_add ( &tcp->list, &tcp_conns );
+ return 0;
+
+ err:
+ /* Drop the reference taken at allocation */
+ ref_put ( &tcp->refcnt );
+ return rc;
+}
+
+/**
+ * Close TCP connection
+ *
+ * @v tcp TCP connection
+ * @v rc Reason for close
+ *
+ * Closes the data transfer interface. If the TCP state machine is in
+ * a suitable state, the connection will be deleted. Otherwise the
+ * connection is kept, and a FIN is scheduled once the transmit
+ * queue is empty.
+ */
+static void tcp_close ( struct tcp_connection *tcp, int rc ) {
+ struct io_buffer *iobuf;
+ struct io_buffer *tmp;
+
+ /* Close data transfer interface */
+ xfer_nullify ( &tcp->xfer );
+ xfer_close ( &tcp->xfer, rc );
+ tcp->xfer_closed = 1;
+
+ /* If we are in CLOSED, or have otherwise not yet received a
+ * SYN (i.e. we are in LISTEN or SYN_SENT), just delete the
+ * connection.
+ */
+ if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
+
+ /* Transition to CLOSED for the sake of debugging messages */
+ tcp->tcp_state = TCP_CLOSED;
+ tcp_dump_state ( tcp );
+
+ /* Free any unsent I/O buffers */
+ list_for_each_entry_safe ( iobuf, tmp, &tcp->queue, list ) {
+ list_del ( &iobuf->list );
+ free_iob ( iobuf );
+ }
+
+ /* Remove from list and drop reference */
+ stop_timer ( &tcp->timer );
+ list_del ( &tcp->list );
+ ref_put ( &tcp->refcnt );
+ /* Only the pointer value of tcp is used below */
+ DBGC ( tcp, "TCP %p connection deleted\n", tcp );
+ return;
+ }
+
+ /* If we have not had our SYN acknowledged (i.e. we are in
+ * SYN_RCVD), pretend that it has been acknowledged so that we
+ * can send a FIN without breaking things.
+ */
+ if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
+ tcp_rx_ack ( tcp, ( tcp->snd_seq + 1 ), 0 );
+
+ /* If we have no data remaining to send, start sending FIN */
+ if ( list_empty ( &tcp->queue ) ) {
+ tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
+ tcp_dump_state ( tcp );
+ }
+}
+
+/***************************************************************************
+ *
+ * Transmit data path
+ *
+ ***************************************************************************
+ */
+
+/**
+ * Calculate transmission window
+ *
+ * @v tcp		TCP connection
+ * @ret len		Maximum length that can be sent in a single packet
+ *
+ * Yields zero while the connection state does not permit sending
+ * data; otherwise the peer's advertised window, capped at
+ * TCP_PATH_MTU.
+ */
+static size_t tcp_xmit_win ( struct tcp_connection *tcp ) {
+	size_t window;
+
+	/* Nothing may be sent outside a data-capable state */
+	if ( ! TCP_CAN_SEND_DATA ( tcp->tcp_state ) )
+		return 0;
+
+	/* Smaller of the receiver's window and the path MTU */
+	window = tcp->snd_win;
+	return ( ( window > TCP_PATH_MTU ) ? TCP_PATH_MTU : window );
+}
+
+/**
+ * Process TCP transmit queue
+ *
+ * @v tcp		TCP connection
+ * @v max_len		Maximum length to process
+ * @v dest		I/O buffer to fill with data, or NULL
+ * @v remove		Remove data from queue
+ * @ret len		Length of data processed
+ *
+ * Walks the connection's transmit queue from the head, handling at
+ * most @c max_len bytes in total.  Each handled fragment is appended
+ * to @c dest (when given) and, when @c remove is true, stripped from
+ * the front of its buffer; buffers left empty by this are unlinked
+ * and freed.
+ */
+static size_t tcp_process_queue ( struct tcp_connection *tcp, size_t max_len,
+				  struct io_buffer *dest, int remove ) {
+	struct io_buffer *frag;
+	struct io_buffer *next;
+	size_t total = 0;
+	size_t chunk;
+
+	list_for_each_entry_safe ( frag, next, &tcp->queue, list ) {
+		/* Take as much of this fragment as the budget allows */
+		chunk = iob_len ( frag );
+		if ( chunk > max_len )
+			chunk = max_len;
+
+		/* Copy out, if a destination was supplied */
+		if ( dest )
+			memcpy ( iob_put ( dest, chunk ), frag->data, chunk );
+
+		/* Consume, discarding the buffer once it is drained */
+		if ( remove ) {
+			iob_pull ( frag, chunk );
+			if ( iob_len ( frag ) == 0 ) {
+				list_del ( &frag->list );
+				free_iob ( frag );
+			}
+		}
+
+		max_len -= chunk;
+		total += chunk;
+	}
+	return total;
+}
+
+/**
+ * Transmit any outstanding data
+ *
+ * @v tcp		TCP connection
+ * @v force_send	Force sending of packet
+ * @ret rc		Return status code
+ *
+ * Transmits any outstanding data on the connection.
+ *
+ * Returns 0 without sending if the retransmission timer is already
+ * running, or if there is nothing that consumes sequence space and
+ * @c force_send is false.  Returns -ENOMEM if the packet buffer
+ * cannot be allocated; otherwise returns the result of tcpip_tx().
+ *
+ * Note that even if an error is returned, the retransmission timer
+ * will have been started if necessary, and so the stack will
+ * eventually attempt to retransmit the failed packet.
+ */
+static int tcp_xmit ( struct tcp_connection *tcp, int force_send ) {
+	struct io_buffer *iobuf;
+	struct tcp_header *tcphdr;
+	struct tcp_mss_option *mssopt;
+	struct tcp_timestamp_padded_option *tsopt;
+	void *payload;
+	unsigned int flags;
+	size_t len = 0;
+	size_t seq_len;
+	size_t app_win;
+	size_t rcv_win;
+
+	/* If retransmission timer is already running, do nothing */
+	if ( timer_running ( &tcp->timer ) )
+		return 0;
+
+	/* Calculate both the actual (payload) and sequence space
+	 * lengths that we wish to transmit.
+	 */
+	if ( TCP_CAN_SEND_DATA ( tcp->tcp_state ) ) {
+		/* Dry run: measure only, nothing is copied or removed */
+		len = tcp_process_queue ( tcp, tcp_xmit_win ( tcp ),
+					  NULL, 0 );
+	}
+	seq_len = len;
+	flags = TCP_FLAGS_SENDING ( tcp->tcp_state );
+	if ( flags & ( TCP_SYN | TCP_FIN ) ) {
+		/* SYN or FIN consume one byte, and we can never send both */
+		assert ( ! ( ( flags & TCP_SYN ) && ( flags & TCP_FIN ) ) );
+		seq_len++;
+	}
+	/* Record what is in flight so that tcp_rx_ack() can validate ACKs */
+	tcp->snd_sent = seq_len;
+
+	/* If we have nothing to transmit, stop now */
+	if ( ( seq_len == 0 ) && ! force_send )
+		return 0;
+
+	/* If we are transmitting anything that requires
+	 * acknowledgement (i.e. consumes sequence space), start the
+	 * retransmission timer.  Do this before attempting to
+	 * allocate the I/O buffer, in case allocation itself fails.
+	 */
+	if ( seq_len )
+		start_timer ( &tcp->timer );
+
+	/* Allocate I/O buffer */
+	iobuf = alloc_iob ( len + MAX_HDR_LEN );
+	if ( ! iobuf ) {
+		DBGC ( tcp, "TCP %p could not allocate data buffer\n", tcp );
+		return -ENOMEM;
+	}
+	iob_reserve ( iobuf, MAX_HDR_LEN );
+
+	/* Fill data payload from transmit queue (the data stays queued
+	 * until it is acknowledged)
+	 */
+	tcp_process_queue ( tcp, len, iobuf, 0 );
+
+	/* Estimate window size */
+	rcv_win = ( ( freemem * 3 ) / 4 );
+	if ( rcv_win > TCP_MAX_WINDOW_SIZE )
+		rcv_win = TCP_MAX_WINDOW_SIZE;
+	app_win = xfer_window ( &tcp->xfer );
+	if ( rcv_win > app_win )
+		rcv_win = app_win;
+	rcv_win &= ~0x03;	/* Keep everything dword-aligned */
+
+	/* Fill up the TCP header.  Options and header are pushed in
+	 * front of the payload, so "payload" marks where the header
+	 * ends.
+	 */
+	payload = iobuf->data;
+	if ( flags & TCP_SYN ) {
+		mssopt = iob_push ( iobuf, sizeof ( *mssopt ) );
+		mssopt->kind = TCP_OPTION_MSS;
+		mssopt->length = sizeof ( *mssopt );
+		mssopt->mss = htons ( TCP_MSS );
+	}
+	if ( ( flags & TCP_SYN ) || tcp->timestamps ) {
+		tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
+		memset ( tsopt->nop, TCP_OPTION_NOP, sizeof ( tsopt->nop ) );
+		tsopt->tsopt.kind = TCP_OPTION_TS;
+		tsopt->tsopt.length = sizeof ( tsopt->tsopt );
+		/* Host-to-network conversion, as for seq/ack below */
+		tsopt->tsopt.tsval = htonl ( currticks() );
+		tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
+	}
+	tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
+	memset ( tcphdr, 0, sizeof ( *tcphdr ) );
+	tcphdr->src = tcp->local_port;
+	tcphdr->dest = tcp->peer.st_port;
+	tcphdr->seq = htonl ( tcp->snd_seq );
+	tcphdr->ack = htonl ( tcp->rcv_ack );
+	/* Header length in bytes, shifted left by two; tcp_rx() decodes
+	 * the field as ( hlen / 16 ) * 4 bytes.
+	 */
+	tcphdr->hlen = ( ( payload - iobuf->data ) << 2 );
+	tcphdr->flags = flags;
+	tcphdr->win = htons ( rcv_win );
+	tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );
+
+	/* Dump header */
+	DBGC ( tcp, "TCP %p TX %d->%d %08lx..%08lx           %08lx %4zd",
+	       tcp, ntohs ( tcphdr->src ), ntohs ( tcphdr->dest ),
+	       ntohl ( tcphdr->seq ), ( ntohl ( tcphdr->seq ) + seq_len ),
+	       ntohl ( tcphdr->ack ), len );
+	tcp_dump_flags ( tcp, tcphdr->flags );
+	DBGC ( tcp, "\n" );
+
+	/* Transmit packet */
+	return tcpip_tx ( iobuf, &tcp_protocol, &tcp->peer, NULL,
+			  &tcphdr->csum );
+}
+
+/**
+ * Retransmission timer expired
+ *
+ * @v timer		Retry timer
+ * @v over		Failure indicator
+ *
+ * Either retransmits the outstanding packet, or - when the timer has
+ * given up, or the connection had already been closed gracefully -
+ * forces the connection to CLOSED and closes it with -ETIMEDOUT.
+ */
+static void tcp_expired ( struct retry_timer *timer, int over ) {
+	struct tcp_connection *tcp =
+		container_of ( timer, struct tcp_connection, timer );
+	int graceful_close = TCP_CLOSED_GRACEFULLY ( tcp->tcp_state );
+
+	DBGC ( tcp, "TCP %p timer %s in %s\n", tcp,
+	       ( over ? "expired" : "fired" ), tcp_state ( tcp->tcp_state ) );
+
+	/* The timer should only ever run in these states */
+	assert ( ( tcp->tcp_state == TCP_SYN_SENT ) ||
+		 ( tcp->tcp_state == TCP_SYN_RCVD ) ||
+		 ( tcp->tcp_state == TCP_ESTABLISHED ) ||
+		 ( tcp->tcp_state == TCP_FIN_WAIT_1 ) ||
+		 ( tcp->tcp_state == TCP_TIME_WAIT ) ||
+		 ( tcp->tcp_state == TCP_CLOSE_WAIT ) ||
+		 ( tcp->tcp_state == TCP_CLOSING_OR_LAST_ACK ) );
+
+	/* Ordinary case: just send the packet again */
+	if ( ! ( over || graceful_close ) ) {
+		tcp_xmit ( tcp, 0 );
+		return;
+	}
+
+	/* Final timeout, or end of a graceful close: terminate the
+	 * connection.
+	 */
+	tcp->tcp_state = TCP_CLOSED;
+	tcp_dump_state ( tcp );
+	tcp_close ( tcp, -ETIMEDOUT );
+}
+
+/**
+ * Send RST response to incoming packet
+ *
+ * @v tcp		TCP connection (used for debug output only; tcp_rx()
+ *			passes NULL when no connection matched)
+ * @v st_dest		Address to send the RST to
+ * @v in_tcphdr		TCP header of incoming packet
+ * @ret rc		Return status code
+ *
+ * Builds a dataless RST|ACK segment whose ports and SEQ/ACK fields
+ * are those of the incoming header swapped over, and transmits it.
+ * Returns -ENOMEM if no buffer is available, otherwise the result of
+ * tcpip_tx().
+ */
+static int tcp_xmit_reset ( struct tcp_connection *tcp,
+			    struct sockaddr_tcpip *st_dest,
+			    struct tcp_header *in_tcphdr ) {
+	struct io_buffer *rst_buf;
+	struct tcp_header *rst;
+
+	/* Header-only buffer */
+	rst_buf = alloc_iob ( MAX_HDR_LEN );
+	if ( rst_buf == NULL ) {
+		DBGC ( tcp, "TCP %p could not allocate data buffer\n", tcp );
+		return -ENOMEM;
+	}
+	iob_reserve ( rst_buf, MAX_HDR_LEN );
+
+	/* Mirror the incoming header into a RST */
+	rst = iob_push ( rst_buf, sizeof ( *rst ) );
+	memset ( rst, 0, sizeof ( *rst ) );
+	rst->src = in_tcphdr->dest;
+	rst->dest = in_tcphdr->src;
+	rst->seq = in_tcphdr->ack;
+	rst->ack = in_tcphdr->seq;
+	rst->hlen = ( ( sizeof ( *rst ) / 4 ) << 4 );
+	rst->flags = ( TCP_RST | TCP_ACK );
+	rst->win = htons ( TCP_MAX_WINDOW_SIZE );
+	rst->csum = tcpip_chksum ( rst_buf->data, iob_len ( rst_buf ) );
+
+	/* Dump header */
+	DBGC ( tcp, "TCP %p TX %d->%d %08lx..%08lx           %08lx %4d",
+	       tcp, ntohs ( rst->src ), ntohs ( rst->dest ),
+	       ntohl ( rst->seq ), ( ntohl ( rst->seq ) ),
+	       ntohl ( rst->ack ), 0 );
+	tcp_dump_flags ( tcp, rst->flags );
+	DBGC ( tcp, "\n" );
+
+	/* Hand over to the network layer */
+	return tcpip_tx ( rst_buf, &tcp_protocol, st_dest,
+			  NULL, &rst->csum );
+}
+
+/***************************************************************************
+ *
+ * Receive data path
+ *
+ ***************************************************************************
+ */
+
+/**
+ * Identify TCP connection by local port number
+ *
+ * @v local_port	Local port (in network-endian order)
+ * @ret tcp		TCP connection, or NULL
+ *
+ * Returns the first entry of tcp_conns bound to @c local_port.
+ */
+static struct tcp_connection * tcp_demux ( unsigned int local_port ) {
+	struct tcp_connection *conn;
+
+	list_for_each_entry ( conn, &tcp_conns, list ) {
+		if ( conn->local_port != local_port )
+			continue;
+		return conn;
+	}
+
+	/* No connection is using this port */
+	return NULL;
+}
+
+/**
+ * Parse TCP received options
+ *
+ * @v tcp		TCP connection (used for debug output only; may
+ *			be NULL)
+ * @v data		Raw options data
+ * @v len		Raw options length
+ * @v options		Options structure to fill in
+ *
+ * @c options is zeroed and then pointed at any MSS and timestamp
+ * options found.  Parsing stops at an end-of-options marker, at the
+ * end of the data, or at the first malformed option (one whose
+ * length byte is missing, is smaller than the two-byte kind/length
+ * prefix, or runs past the end of the options area).  The options
+ * come straight off the wire, so without these checks a zero length
+ * byte would stall the loop forever and an oversized one would walk
+ * off the end of the header.
+ */
+static void tcp_rx_opts ( struct tcp_connection *tcp, const void *data,
+			  size_t len, struct tcp_options *options ) {
+	const void *end = ( data + len );
+	const struct tcp_option *option;
+	unsigned int kind;
+	size_t remaining;
+	/* Every option other than END and NOP starts with a kind byte
+	 * and a length byte, and the length counts both.
+	 */
+	const size_t min_len = 2;
+
+	memset ( options, 0, sizeof ( *options ) );
+	while ( data < end ) {
+		option = data;
+		kind = option->kind;
+
+		/* Single-byte options */
+		if ( kind == TCP_OPTION_END )
+			return;
+		if ( kind == TCP_OPTION_NOP ) {
+			data++;
+			continue;
+		}
+
+		/* Validate length before trusting it */
+		remaining = ( end - data );
+		if ( ( remaining < min_len ) ||
+		     ( option->length < min_len ) ||
+		     ( option->length > remaining ) ) {
+			DBGC ( tcp, "TCP %p received malformed option %d\n",
+			       tcp, kind );
+			return;
+		}
+
+		switch ( kind ) {
+		case TCP_OPTION_MSS:
+			options->mssopt = data;
+			break;
+		case TCP_OPTION_TS:
+			options->tsopt = data;
+			break;
+		default:
+			DBGC ( tcp, "TCP %p received unknown option %d\n",
+			       tcp, kind );
+			break;
+		}
+		data += option->length;
+	}
+}
+
+/**
+ * Handle TCP received SYN
+ *
+ * @v tcp TCP connection
+ * @v seq SEQ value (in host-endian order)
+ * @v options TCP options
+ * @ret rc Return status code
+ *
+ * Always returns 0. On the first SYN the receive sequence number is
+ * synchronised to @c seq and timestamps are enabled if the peer sent
+ * a timestamp option.
+ */
+static int tcp_rx_syn ( struct tcp_connection *tcp, uint32_t seq,
+ struct tcp_options *options ) {
+
+ /* Synchronise sequence numbers on first SYN */
+ if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
+ tcp->rcv_ack = seq;
+ if ( options->tsopt )
+ tcp->timestamps = 1;
+ }
+
+ /* Ignore duplicate SYN */
+ /* NOTE(review): seq is unsigned; if rcv_ack is also a 32-bit
+ * unsigned field (its declaration is not visible here), this
+ * test is true for any seq that differs from rcv_ack, not only
+ * for older ones - confirm.
+ */
+ if ( ( tcp->rcv_ack - seq ) > 0 )
+ return 0;
+
+ /* Mark SYN as received and start sending ACKs with each packet */
+ tcp->tcp_state |= ( TCP_STATE_SENT ( TCP_ACK ) |
+ TCP_STATE_RCVD ( TCP_SYN ) );
+
+ /* Acknowledge SYN */
+ tcp->rcv_ack++;
+
+ return 0;
+}
+
+/**
+ * Handle TCP received ACK
+ *
+ * @v tcp		TCP connection
+ * @v ack		ACK value (in host-endian order)
+ * @v win		WIN value (in host-endian order)
+ * @ret rc		Return status code
+ *
+ * Returns -EINVAL (leaving the connection untouched) if the ACK
+ * covers more sequence space than was last transmitted, otherwise 0.
+ * On success the send sequence number and window are updated, the
+ * retransmission timer is stopped, acknowledged data is dropped from
+ * the transmit queue and any acknowledged SYN/FIN is recorded in the
+ * connection state.
+ */
+static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
+			uint32_t win ) {
+	size_t ack_len = ( ack - tcp->snd_seq );
+	size_t len;
+	unsigned int acked_flags;
+
+	/* Ignore duplicate or out-of-range ACK */
+	if ( ack_len > tcp->snd_sent ) {
+		DBGC ( tcp, "TCP %p received ACK for [%08lx,%08lx), "
+		       "sent only [%08lx,%08lx)\n", tcp, tcp->snd_seq,
+		       ( tcp->snd_seq + ack_len ), tcp->snd_seq,
+		       ( tcp->snd_seq + tcp->snd_sent ) );
+		return -EINVAL;
+	}
+
+	/* Acknowledge any flags being sent.
+	 *
+	 * A SYN or FIN occupies one unit of sequence space, so it can
+	 * only have been acknowledged if this ACK advances past
+	 * snd_seq.  An ACK that acknowledges nothing new must not be
+	 * credited with the flag: doing so would make the length
+	 * below wrap around and flush the entire transmit queue.
+	 */
+	len = ack_len;
+	acked_flags = ( TCP_FLAGS_SENDING ( tcp->tcp_state ) &
+			( TCP_SYN | TCP_FIN ) );
+	if ( ack_len == 0 )
+		acked_flags = 0;
+	if ( acked_flags )
+		len--;
+
+	/* Update SEQ and sent counters, and window size */
+	tcp->snd_seq = ack;
+	tcp->snd_sent = 0;
+	tcp->snd_win = win;
+
+	/* Stop the retransmission timer */
+	stop_timer ( &tcp->timer );
+
+	/* Remove any acknowledged data from transmit queue */
+	tcp_process_queue ( tcp, len, NULL, 1 );
+
+	/* Mark SYN/FIN as acknowledged if applicable. */
+	if ( acked_flags )
+		tcp->tcp_state |= TCP_STATE_ACKED ( acked_flags );
+
+	/* Start sending FIN if we've had all possible data ACKed */
+	if ( list_empty ( &tcp->queue ) && tcp->xfer_closed )
+		tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
+
+	return 0;
+}
+
+/**
+ * Handle TCP received data
+ *
+ * @v tcp		TCP connection
+ * @v seq		SEQ value (in host-endian order)
+ * @v iobuf		I/O buffer
+ * @ret rc		Return status code
+ *
+ * This function takes ownership of the I/O buffer: it is either
+ * freed here (when it contains nothing new) or handed on to
+ * xfer_deliver_iob().  The receive sequence number advances only if
+ * delivery succeeds.
+ */
+static int tcp_rx_data ( struct tcp_connection *tcp, uint32_t seq,
+			 struct io_buffer *iobuf ) {
+	size_t overlap = ( tcp->rcv_ack - seq );
+	size_t total = iob_len ( iobuf );
+	size_t fresh;
+	int rc;
+
+	/* Nothing new in this segment (duplicate or out-of-order) */
+	if ( overlap >= total ) {
+		free_iob ( iobuf );
+		return 0;
+	}
+
+	/* Strip the part we have already seen */
+	iob_pull ( iobuf, overlap );
+	fresh = ( total - overlap );
+
+	/* Pass the remainder up to the application */
+	rc = xfer_deliver_iob ( &tcp->xfer, iobuf );
+	if ( rc != 0 )
+		return rc;
+
+	/* Account for the newly accepted data */
+	tcp->rcv_ack += fresh;
+	return 0;
+}
+
+/**
+ * Handle TCP received FIN
+ *
+ * @v tcp		TCP connection
+ * @v seq		SEQ value (in host-endian order)
+ * @ret rc		Return status code
+ *
+ * Always returns 0.  A FIN arriving at the expected sequence number
+ * is recorded and acknowledged, and the connection is then closed
+ * with status 0; any other FIN is ignored.
+ */
+static int tcp_rx_fin ( struct tcp_connection *tcp, uint32_t seq ) {
+
+	if ( ! ( ( tcp->rcv_ack - seq ) > 0 ) ) {
+		/* In-sequence FIN: note it and consume its sequence
+		 * number
+		 */
+		tcp->tcp_state |= TCP_STATE_RCVD ( TCP_FIN );
+		tcp->rcv_ack++;
+
+		/* The peer is done, so shut down our side too */
+		tcp_close ( tcp, 0 );
+	}
+
+	return 0;
+}
+
+/**
+ * Handle TCP received RST
+ *
+ * @v tcp		TCP connection
+ * @v seq		SEQ value (in host-endian order)
+ * @ret rc		Return status code
+ *
+ * Returns 0 if the RST is ignored, or -ECONNRESET once the
+ * connection has been aborted.
+ */
+static int tcp_rx_rst ( struct tcp_connection *tcp, uint32_t seq ) {
+	int have_peer_syn =
+		( ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) != 0 );
+
+	if ( have_peer_syn ) {
+		/* We know the peer's sequence space: the RST must
+		 * match the sequence number we expect.
+		 */
+		if ( ( tcp->rcv_ack - seq ) > 0 )
+			return 0;
+	} else if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) ) {
+		/* No window to test against yet: accept the RST only
+		 * once our own SYN has been ACKed.
+		 */
+		return 0;
+	}
+
+	/* Tear the connection down */
+	tcp->tcp_state = TCP_CLOSED;
+	tcp_dump_state ( tcp );
+	tcp_close ( tcp, -ECONNRESET );
+
+	return -ECONNRESET;
+}
+
+/**
+ * Process received packet
+ *
+ * @v iobuf I/O buffer
+ * @v st_src Partially-filled source address
+ * @v st_dest Partially-filled destination address
+ * @v pshdr_csum Pseudo-header checksum
+ * @ret rc Return status code
+ *
+ * The packet is validated (length, header length, checksum), matched
+ * to a connection by destination port, and then its ACK, SYN, RST,
+ * payload and FIN are processed in that order before any reply is
+ * transmitted.
+ *
+ * On every "discard" path the buffer is freed here. On the success
+ * path the buffer has been handed to tcp_rx_data(), which is
+ * documented as taking ownership of it.
+ */
+static int tcp_rx ( struct io_buffer *iobuf,
+ struct sockaddr_tcpip *st_src,
+ struct sockaddr_tcpip *st_dest __unused,
+ uint16_t pshdr_csum ) {
+ struct tcp_header *tcphdr = iobuf->data;
+ struct tcp_connection *tcp;
+ struct tcp_options options;
+ size_t hlen;
+ uint16_t csum;
+ uint32_t start_seq;
+ uint32_t seq;
+ uint32_t ack;
+ uint32_t win;
+ unsigned int flags;
+ size_t len;
+ int rc;
+
+ /* Sanity check packet */
+ if ( iob_len ( iobuf ) < sizeof ( *tcphdr ) ) {
+ DBG ( "TCP packet too short at %zd bytes (min %zd bytes)\n",
+ iob_len ( iobuf ), sizeof ( *tcphdr ) );
+ rc = -EINVAL;
+ goto discard;
+ }
+ /* Masked field divided by 16 gives the length in 32-bit words */
+ hlen = ( ( tcphdr->hlen & TCP_MASK_HLEN ) / 16 ) * 4;
+ if ( hlen < sizeof ( *tcphdr ) ) {
+ DBG ( "TCP header too short at %zd bytes (min %zd bytes)\n",
+ hlen, sizeof ( *tcphdr ) );
+ rc = -EINVAL;
+ goto discard;
+ }
+ if ( hlen > iob_len ( iobuf ) ) {
+ DBG ( "TCP header too long at %zd bytes (max %zd bytes)\n",
+ hlen, iob_len ( iobuf ) );
+ rc = -EINVAL;
+ goto discard;
+ }
+ /* Checksum over header and payload must come out as zero */
+ csum = tcpip_continue_chksum ( pshdr_csum, iobuf->data, iob_len ( iobuf ));
+ if ( csum != 0 ) {
+ DBG ( "TCP checksum incorrect (is %04x including checksum "
+ "field, should be 0000)\n", csum );
+ rc = -EINVAL;
+ goto discard;
+ }
+
+ /* Parse parameters from header and strip header */
+ /* tcp may be NULL from here until the check further down */
+ tcp = tcp_demux ( tcphdr->dest );
+ start_seq = seq = ntohl ( tcphdr->seq );
+ ack = ntohl ( tcphdr->ack );
+ win = ntohs ( tcphdr->win );
+ flags = tcphdr->flags;
+ tcp_rx_opts ( tcp, ( ( ( void * ) tcphdr ) + sizeof ( *tcphdr ) ),
+ ( hlen - sizeof ( *tcphdr ) ), &options );
+ /* NOTE(review): tcphdr is still read below after the header has
+ * been pulled; presumably iob_pull() only advances the data
+ * pointer and leaves the bytes in place - confirm.
+ */
+ iob_pull ( iobuf, hlen );
+ len = iob_len ( iobuf );
+
+ /* Dump header */
+ DBGC ( tcp, "TCP %p RX %d<-%d %08lx %08lx..%08lx %4zd",
+ tcp, ntohs ( tcphdr->dest ), ntohs ( tcphdr->src ),
+ ntohl ( tcphdr->ack ), ntohl ( tcphdr->seq ),
+ ( ntohl ( tcphdr->seq ) + len +
+ ( ( tcphdr->flags & ( TCP_SYN | TCP_FIN ) ) ? 1 : 0 ) ), len);
+ tcp_dump_flags ( tcp, tcphdr->flags );
+ DBGC ( tcp, "\n" );
+
+ /* If no connection was found, send RST */
+ if ( ! tcp ) {
+ tcp_xmit_reset ( tcp, st_src, tcphdr );
+ rc = -ENOTCONN;
+ goto discard;
+ }
+
+ /* Handle ACK, if present */
+ if ( flags & TCP_ACK ) {
+ /* An unacceptable ACK is answered with a RST */
+ if ( ( rc = tcp_rx_ack ( tcp, ack, win ) ) != 0 ) {
+ tcp_xmit_reset ( tcp, st_src, tcphdr );
+ goto discard;
+ }
+ }
+
+ /* Handle SYN, if present */
+ if ( flags & TCP_SYN ) {
+ tcp_rx_syn ( tcp, seq, &options );
+ /* SYN occupies one sequence number */
+ seq++;
+ }
+
+ /* Handle RST, if present */
+ if ( flags & TCP_RST ) {
+ /* Non-zero means the connection was aborted by tcp_rx_rst() */
+ if ( ( rc = tcp_rx_rst ( tcp, seq ) ) != 0 )
+ goto discard;
+ }
+
+ /* Handle new data, if any */
+ /* Return status is ignored; iobuf must not be used after this */
+ tcp_rx_data ( tcp, seq, iobuf );
+ seq += len;
+
+ /* Handle FIN, if present */
+ if ( flags & TCP_FIN ) {
+ tcp_rx_fin ( tcp, seq );
+ /* FIN occupies one sequence number */
+ seq++;
+ }
+
+ /* Update timestamp, if present and applicable */
+ /* Only when everything in this packet has been accepted in order */
+ if ( ( seq == tcp->rcv_ack ) && options.tsopt )
+ tcp->ts_recent = ntohl ( options.tsopt->tsval );
+
+ /* Dump out any state change as a result of the received packet */
+ tcp_dump_state ( tcp );
+
+ /* Send out any pending data. If peer is expecting an ACK for
+ * this packet then force sending a reply.
+ */
+ /* start_seq != seq means the packet consumed sequence space */
+ tcp_xmit ( tcp, ( start_seq != seq ) );
+
+ /* If this packet was the last we expect to receive, set up
+ * timer to expire and cause the connection to be freed.
+ */
+ if ( TCP_CLOSED_GRACEFULLY ( tcp->tcp_state ) ) {
+ tcp->timer.timeout = ( 2 * TCP_MSL );
+ start_timer ( &tcp->timer );
+ }
+
+ return 0;
+
+ discard:
+ /* Free received packet */
+ free_iob ( iobuf );
+ return rc;
+}
+
+/** TCP protocol
+ *
+ * Names tcp_rx() as the receive handler for IP protocol IP_TCP; the
+ * transmit paths in this file pass this descriptor to tcpip_tx().
+ * NOTE(review): __tcpip_protocol is defined elsewhere; presumably it
+ * registers the descriptor with the TCP/IP layer - confirm.
+ */
+struct tcpip_protocol tcp_protocol __tcpip_protocol = {
+ .name = "TCP",
+ .rx = tcp_rx,
+ .tcpip_proto = IP_TCP,
+};
+
+/***************************************************************************
+ *
+ * Data transfer interface
+ *
+ ***************************************************************************
+ */
+
+/**
+ * Close interface
+ *
+ * @v xfer		Data transfer interface
+ * @v rc		Reason for close
+ *
+ * Invoked when the user of the connection closes its side: the
+ * connection is closed with the same reason, and a transmission is
+ * attempted so that a pending FIN can go out.
+ */
+static void tcp_xfer_close ( struct xfer_interface *xfer, int rc ) {
+	struct tcp_connection *conn =
+		container_of ( xfer, struct tcp_connection, xfer );
+
+	/* Shut down the connection */
+	tcp_close ( conn, rc );
+
+	/* Try to send the FIN straight away */
+	tcp_xmit ( conn, 0 );
+}
+
+/**
+ * Check flow control window
+ *
+ * @v xfer		Data transfer interface
+ * @ret len		Length of window
+ *
+ * The window is reported as closed while anything is still queued
+ * for transmission; this limits the TX queue to a single unACKed
+ * packet at a time, which conserves memory.  Otherwise the window is
+ * whatever tcp_xmit_win() allows.
+ */
+static size_t tcp_xfer_window ( struct xfer_interface *xfer ) {
+	struct tcp_connection *conn =
+		container_of ( xfer, struct tcp_connection, xfer );
+
+	return ( list_empty ( &conn->queue ) ? tcp_xmit_win ( conn ) : 0 );
+}
+
+/**
+ * Deliver datagram as I/O buffer
+ *
+ * @v xfer		Data transfer interface
+ * @v iobuf		Datagram I/O buffer
+ * @v meta		Data transfer metadata, or NULL
+ * @ret rc		Return status code
+ *
+ * Appends the buffer to the connection's transmit queue and attempts
+ * a transmission.  Always returns 0; the result of the transmission
+ * attempt is not reported.
+ */
+static int tcp_xfer_deliver_iob ( struct xfer_interface *xfer,
+				  struct io_buffer *iobuf,
+				  struct xfer_metadata *meta __unused ) {
+	struct tcp_connection *conn =
+		container_of ( xfer, struct tcp_connection, xfer );
+
+	/* Queue behind any data already waiting */
+	list_add_tail ( &iobuf->list, &conn->queue );
+
+	/* Send now if the connection state allows it */
+	tcp_xmit ( conn, 0 );
+
+	return 0;
+}
+
+/** TCP data transfer interface operations
+ *
+ * Installed on each connection's xfer interface by tcp_open(). Close,
+ * window and buffer delivery are handled by this file; redirection is
+ * ignored, and raw deliveries are routed through
+ * xfer_deliver_as_iob().
+ */
+static struct xfer_interface_operations tcp_xfer_operations = {
+ .close = tcp_xfer_close,
+ .vredirect = ignore_xfer_vredirect,
+ .window = tcp_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = tcp_xfer_deliver_iob,
+ .deliver_raw = xfer_deliver_as_iob,
+};
+
+/***************************************************************************
+ *
+ * Openers
+ *
+ ***************************************************************************
+ */
+
+/** TCP socket opener
+ *
+ * Associates SOCK_STREAM sockets in the AF_INET family with
+ * tcp_open().
+ */
+struct socket_opener tcp_socket_opener __socket_opener = {
+ .semantics = SOCK_STREAM,
+ .family = AF_INET,
+ .open = tcp_open,
+};
+
+/* NOTE(review): not referenced anywhere in this file; presumably the
+ * object whose address backs the SOCK_STREAM constant - confirm
+ * against <gpxe/socket.h>.
+ */
+char TCP_SOCK_STREAM[1];
+
+/**
+ * Open TCP URI
+ *
+ * @v xfer		Data transfer interface
+ * @v uri		URI
+ * @ret rc		Return status code
+ *
+ * Opens a stream socket to the host named in the URI, on the URI's
+ * port (uri_port() is given 0 as its default).  Returns -EINVAL if
+ * the URI has no host, otherwise the result of
+ * xfer_open_named_socket().
+ */
+static int tcp_open_uri ( struct xfer_interface *xfer, struct uri *uri ) {
+	struct sockaddr_tcpip remote;
+
+	/* A host name is mandatory */
+	if ( uri->host == NULL )
+		return -EINVAL;
+
+	/* Only the port is filled in; the address comes from the name */
+	memset ( &remote, 0, sizeof ( remote ) );
+	remote.st_port = htons ( uri_port ( uri, 0 ) );
+
+	return xfer_open_named_socket ( xfer, SOCK_STREAM,
+					( struct sockaddr * ) &remote,
+					uri->host, NULL );
+}
+
+/** TCP URI opener
+ *
+ * Associates the "tcp" URI scheme with tcp_open_uri().
+ */
+struct uri_opener tcp_uri_opener __uri_opener = {
+ .scheme = "tcp",
+ .open = tcp_open_uri,
+};
+
diff --git a/gpxe/src/net/tcp/ftp.c b/gpxe/src/net/tcp/ftp.c
new file mode 100644
index 00000000..ffb2fbff
--- /dev/null
+++ b/gpxe/src/net/tcp/ftp.c
@@ -0,0 +1,467 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <byteswap.h>
+#include <gpxe/socket.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/in.h>
+#include <gpxe/xfer.h>
+#include <gpxe/open.h>
+#include <gpxe/uri.h>
+#include <gpxe/features.h>
+#include <gpxe/ftp.h>
+
+/** @file
+ *
+ * File transfer protocol
+ *
+ */
+
+/* Declare the "FTP" protocol feature.
+ * NOTE(review): FEATURE() and DHCP_EB_FEATURE_FTP are defined outside
+ * this file; presumably this advertises FTP support (version 1) in
+ * the feature list - confirm against <gpxe/features.h>.
+ */
+FEATURE ( FEATURE_PROTOCOL, "FTP", DHCP_EB_FEATURE_FTP, 1 );
+
+/**
+ * FTP states
+ *
+ * These @b must be sequential, i.e. a successful FTP session must
+ * pass through each of these states in order.
+ *
+ * Each state doubles as an index into ftp_strings[]: ftp_reply()
+ * increments the state on every accepted reply and then sends the
+ * command string for the new state, so the current state identifies
+ * the command whose reply is awaited.
+ */
+enum ftp_state {
+ FTP_CONNECT = 0, /* Nothing sent yet (empty command string) */
+ FTP_USER, /* "USER" sent */
+ FTP_PASS, /* "PASS" sent */
+ FTP_TYPE, /* "TYPE I" sent */
+ FTP_PASV, /* "PASV" sent; its reply triggers the data connection */
+ FTP_RETR, /* "RETR <path>" sent */
+ FTP_QUIT, /* "QUIT" sent */
+ FTP_DONE, /* Final state; no further commands are sent */
+};
+
+/**
+ * An FTP request
+ *
+ * One instance is allocated per fetch by ftp_open() and released by
+ * ftp_free() through the reference counter.
+ */
+struct ftp_request {
+ /** Reference counter */
+ struct refcnt refcnt;
+ /** Data transfer interface */
+ struct xfer_interface xfer;
+
+ /** URI being fetched */
+ struct uri *uri;
+ /** FTP control channel interface */
+ struct xfer_interface control;
+ /** FTP data channel interface */
+ struct xfer_interface data;
+
+ /** Current state */
+ enum ftp_state state;
+ /** Buffer to be filled with data received via the control channel
+ *
+ * Points into either status_text or passive_text.
+ */
+ char *recvbuf;
+ /** Remaining size of recvbuf */
+ size_t recvsize;
+ /** FTP status code, as text
+ *
+ * At most sizeof - 1 characters are stored per line, so the
+ * final byte keeps the zero it was allocated with.
+ */
+ char status_text[5];
+ /** Passive-mode parameters, as text */
+ char passive_text[24]; /* "aaa,bbb,ccc,ddd,eee,fff" */
+};
+
+/**
+ * Free FTP request
+ *
+ * @v refcnt		Reference counter
+ *
+ * Installed as the reference counter's free method by ftp_open().
+ * Drops the request's URI reference and releases the request itself.
+ */
+static void ftp_free ( struct refcnt *refcnt ) {
+	struct ftp_request *request =
+		container_of ( refcnt, struct ftp_request, refcnt );
+
+	DBGC ( request, "FTP %p freed\n", request );
+
+	/* Give back the URI, then the request structure */
+	uri_put ( request->uri );
+	free ( request );
+}
+
+/**
+ * Mark FTP operation as complete
+ *
+ * @v ftp FTP request
+ * @v rc Return status code
+ *
+ * Closes the parent, control and data interfaces with the same
+ * status. Each interface is nullified before it is closed.
+ * NOTE(review): presumably nullifying first stops the close from
+ * being reflected back into this request's own handlers - confirm
+ * against xfer_nullify().
+ */
+static void ftp_done ( struct ftp_request *ftp, int rc ) {
+
+ DBGC ( ftp, "FTP %p completed (%s)\n", ftp, strerror ( rc ) );
+
+ /* Close all data transfer interfaces */
+ xfer_nullify ( &ftp->xfer );
+ xfer_close ( &ftp->xfer, rc );
+ xfer_nullify ( &ftp->control );
+ xfer_close ( &ftp->control, rc );
+ xfer_nullify ( &ftp->data );
+ xfer_close ( &ftp->data, rc );
+}
+
+/*****************************************************************************
+ *
+ * FTP control channel
+ *
+ */
+
+/**
+ * FTP control channel strings
+ *
+ * These are used as printf() format strings. Since only one of them
+ * (RETR) takes an argument, we always supply that argument to the
+ * snprintf() call.
+ *
+ * Indexed by enum ftp_state; the argument supplied by ftp_reply() is
+ * the URI path. The entries for FTP_CONNECT and FTP_DONE are empty.
+ */
+static const char * ftp_strings[] = {
+ [FTP_CONNECT] = "",
+ [FTP_USER] = "USER anonymous\r\n",
+ [FTP_PASS] = "PASS etherboot@etherboot.org\r\n",
+ [FTP_TYPE] = "TYPE I\r\n",
+ [FTP_PASV] = "PASV\r\n",
+ [FTP_RETR] = "RETR %s\r\n",
+ [FTP_QUIT] = "QUIT\r\n",
+ [FTP_DONE] = "",
+};
+
+/**
+ * Handle control channel being closed
+ *
+ * @v control		FTP control channel interface
+ * @v rc		Reason for close
+ *
+ * Losing the control channel ends the whole request: ftp_done() is
+ * called with the same reason, which also closes the data channel
+ * and the parent interface.
+ */
+static void ftp_control_close ( struct xfer_interface *control, int rc ) {
+	struct ftp_request *request =
+		container_of ( control, struct ftp_request, control );
+
+	DBGC ( request, "FTP %p control connection closed: %s\n",
+	       request, strerror ( rc ) );
+
+	/* Finish the request with the channel's close reason */
+	ftp_done ( request, rc );
+}
+
+/**
+ * Parse FTP byte sequence value
+ *
+ * @v text		Text string
+ * @v value		Value buffer
+ * @v len		Length of value buffer
+ *
+ * This parses an FTP byte sequence value (e.g. the "aaa,bbb,ccc,ddd"
+ * form for IP addresses in PORT commands) into a byte sequence.  @c
+ * *text will be updated to point beyond the end of the parsed byte
+ * sequence.
+ *
+ * Exactly @c len bytes are written.  After each number one separator
+ * character is skipped, unless the end of the string has been
+ * reached.  A @c len of zero writes nothing and leaves @c *text
+ * untouched.
+ *
+ * This function is safe in the presence of malformed data, though the
+ * output is undefined.
+ */
+static void ftp_parse_value ( char **text, uint8_t *value, size_t len ) {
+	/* Test before decrementing, so that a zero length cannot wrap
+	 * around and overrun the value buffer.
+	 */
+	while ( len-- ) {
+		*(value++) = strtoul ( *text, text, 10 );
+		if ( **text )
+			(*text)++;
+	}
+}
+
+/**
+ * Handle an FTP control channel response
+ *
+ * @v ftp		FTP request
+ *
+ * This is called once we have received a complete response line.
+ *
+ * Malformed lines and intermediate (1xx) replies are ignored.  Any
+ * failure reply ends the request with -EPROTO and nothing further is
+ * done for it.  Otherwise the state machine advances by one state
+ * and the command for the new state is sent; a successful reply to
+ * "PASV" additionally opens the data connection first.
+ */
+static void ftp_reply ( struct ftp_request *ftp ) {
+	char status_major = ftp->status_text[0];
+	char separator = ftp->status_text[3];
+
+	DBGC ( ftp, "FTP %p received status %s\n", ftp, ftp->status_text );
+
+	/* Ignore malformed lines */
+	if ( separator != ' ' )
+		return;
+
+	/* Ignore "intermediate" responses (1xx codes) */
+	if ( status_major == '1' )
+		return;
+
+	/* Anything other than success (2xx) or, in the case of a
+	 * response to a "USER" command, a password prompt (3xx), is a
+	 * fatal error.
+	 */
+	if ( ! ( ( status_major == '2' ) ||
+		 ( ( status_major == '3' ) && ( ftp->state == FTP_USER ) ) ) ){
+		/* Flag protocol error and close connections.  Stop
+		 * here: the interfaces have just been closed, so we
+		 * must not advance the state machine or send another
+		 * command.
+		 */
+		ftp_done ( ftp, -EPROTO );
+		return;
+	}
+
+	/* Open passive connection when we get "PASV" response */
+	if ( ftp->state == FTP_PASV ) {
+		char *ptr = ftp->passive_text;
+		union {
+			struct sockaddr_in sin;
+			struct sockaddr sa;
+		} sa;
+		int rc;
+
+		/* passive_text holds the "a,b,c,d,p1,p2" text captured
+		 * from between the parentheses: four address bytes
+		 * followed by two port bytes, stored in the order
+		 * received.
+		 */
+		sa.sin.sin_family = AF_INET;
+		ftp_parse_value ( &ptr, ( uint8_t * ) &sa.sin.sin_addr,
+				  sizeof ( sa.sin.sin_addr ) );
+		ftp_parse_value ( &ptr, ( uint8_t * ) &sa.sin.sin_port,
+				  sizeof ( sa.sin.sin_port ) );
+		if ( ( rc = xfer_open_socket ( &ftp->data, SOCK_STREAM,
+					       &sa.sa, NULL ) ) != 0 ) {
+			DBGC ( ftp, "FTP %p could not open data connection\n",
+			       ftp );
+			ftp_done ( ftp, rc );
+			return;
+		}
+	}
+
+	/* Move to next state */
+	if ( ftp->state < FTP_DONE )
+		ftp->state++;
+
+	/* Send control string */
+	if ( ftp->state < FTP_DONE ) {
+		DBGC ( ftp, "FTP %p sending ", ftp );
+		DBGC ( ftp, ftp_strings[ftp->state], ftp->uri->path );
+		xfer_printf ( &ftp->control, ftp_strings[ftp->state],
+			      ftp->uri->path );
+	}
+}
+
+/**
+ * Handle new data arriving on FTP control channel
+ *
+ * @v control FTP control channel interface
+ * @v data New data
+ * @v len Length of new data
+ * @ret rc Return status code (always 0)
+ *
+ * Data is collected until a complete line is received, at which point
+ * its information is passed to ftp_reply().
+ *
+ * The first characters of each line are captured into status_text,
+ * and any text between '(' and ')' into passive_text. The capture
+ * position is saved in the request on return, so a line may be split
+ * across several deliveries.
+ */
+static int ftp_control_deliver_raw ( struct xfer_interface *control,
+ const void *data, size_t len ) {
+ struct ftp_request *ftp =
+ container_of ( control, struct ftp_request, control );
+ /* Work on local copies of the capture state */
+ char *recvbuf = ftp->recvbuf;
+ size_t recvsize = ftp->recvsize;
+ char c;
+
+ while ( len-- ) {
+ c = * ( ( char * ) data++ );
+ switch ( c ) {
+ case '\r' :
+ case '\n' :
+ /* End of line: call ftp_reply() to handle
+ * completed reply. Avoid calling ftp_reply()
+ * twice if we receive both \r and \n.
+ *
+ * recvsize is zero only when the buffer being
+ * captured has been filled (or closed by ')');
+ * it is reset just below, so the second
+ * terminator of a "\r\n" pair does not match.
+ */
+ if ( recvsize == 0 )
+ ftp_reply ( ftp );
+ /* Start filling up the status code buffer */
+ recvbuf = ftp->status_text;
+ recvsize = sizeof ( ftp->status_text ) - 1;
+ break;
+ case '(' :
+ /* Start filling up the passive parameter buffer */
+ recvbuf = ftp->passive_text;
+ recvsize = sizeof ( ftp->passive_text ) - 1;
+ break;
+ case ')' :
+ /* Stop filling the passive parameter buffer */
+ recvsize = 0;
+ break;
+ default :
+ /* Fill up buffer if applicable */
+ /* Characters beyond the buffer's capacity are dropped */
+ if ( recvsize > 0 ) {
+ *(recvbuf++) = c;
+ recvsize--;
+ }
+ break;
+ }
+ }
+
+ /* Store for next invocation */
+ ftp->recvbuf = recvbuf;
+ ftp->recvsize = recvsize;
+
+ return 0;
+}
+
+/** FTP control channel operations
+ *
+ * Installed on ftp->control by ftp_open(). Incoming data reaches
+ * ftp_control_deliver_raw() (buffer deliveries are routed through
+ * xfer_deliver_as_raw()), and closure is handled by
+ * ftp_control_close().
+ */
+static struct xfer_interface_operations ftp_control_operations = {
+ .close = ftp_control_close,
+ .vredirect = xfer_vopen,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = xfer_deliver_as_raw,
+ .deliver_raw = ftp_control_deliver_raw,
+};
+
+/*****************************************************************************
+ *
+ * FTP data channel
+ *
+ */
+
+/**
+ * Handle FTP data channel being closed
+ *
+ * @v data		FTP data channel interface
+ * @v rc		Reason for closure
+ *
+ * A clean close (@c rc of zero) needs no action here: the control
+ * channel is left alone, and the server's completion message will
+ * arrive on it in due course.
+ *
+ * A close caused by an error aborts the whole request.
+ */
+static void ftp_data_closed ( struct xfer_interface *data, int rc ) {
+	struct ftp_request *request =
+		container_of ( data, struct ftp_request, data );
+
+	DBGC ( request, "FTP %p data connection closed: %s\n",
+	       request, strerror ( rc ) );
+
+	/* Normal end of data: wait for the control channel */
+	if ( rc == 0 )
+		return;
+
+	/* Error: close everything and record the status */
+	ftp_done ( request, rc );
+}
+
+/**
+ * Handle data delivery via FTP data channel
+ *
+ * @v data		FTP data channel interface
+ * @v iobuf		I/O buffer
+ * @v meta		Data transfer metadata, or NULL
+ * @ret rc		Return status code
+ *
+ * Forwards the buffer unchanged to the request's parent data
+ * transfer interface and returns that delivery's status.
+ */
+static int ftp_data_deliver_iob ( struct xfer_interface *data,
+				  struct io_buffer *iobuf,
+				  struct xfer_metadata *meta __unused ) {
+	struct ftp_request *request =
+		container_of ( data, struct ftp_request, data );
+	int rc;
+
+	rc = xfer_deliver_iob ( &request->xfer, iobuf );
+	if ( rc == 0 )
+		return 0;
+
+	DBGC ( request, "FTP %p failed to deliver data: %s\n",
+	       request, strerror ( rc ) );
+	return rc;
+}
+
+/** FTP data channel operations
+ *
+ * Installed on ftp->data by ftp_open(). Incoming buffers are relayed
+ * by ftp_data_deliver_iob() (raw deliveries are routed through
+ * xfer_deliver_as_iob()), and closure is handled by
+ * ftp_data_closed().
+ */
+static struct xfer_interface_operations ftp_data_operations = {
+ .close = ftp_data_closed,
+ .vredirect = xfer_vopen,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = ftp_data_deliver_iob,
+ .deliver_raw = xfer_deliver_as_iob,
+};
+
+/*****************************************************************************
+ *
+ * Data transfer interface
+ *
+ */
+
+/**
+ * Close FTP data transfer interface
+ *
+ * @v xfer		FTP data transfer interface
+ * @v rc		Reason for close
+ *
+ * Invoked when the consumer of the download closes its side; the
+ * whole request is then completed with the same reason.
+ */
+static void ftp_xfer_closed ( struct xfer_interface *xfer, int rc ) {
+	struct ftp_request *request =
+		container_of ( xfer, struct ftp_request, xfer );
+
+	DBGC ( request, "FTP %p data transfer interface closed: %s\n",
+	       request, strerror ( rc ) );
+
+	/* Shut down the control and data channels as well */
+	ftp_done ( request, rc );
+}
+
+/** FTP data transfer interface operations
+ *
+ * Installed on ftp->xfer (the interface plugged into the caller) by
+ * ftp_open(). Only closure is acted upon, via ftp_xfer_closed();
+ * redirection and raw deliveries arriving on this interface are
+ * ignored.
+ */
+static struct xfer_interface_operations ftp_xfer_operations = {
+ .close = ftp_xfer_closed,
+ .vredirect = ignore_xfer_vredirect,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = xfer_deliver_as_raw,
+ .deliver_raw = ignore_xfer_deliver_raw,
+};
+
+/*****************************************************************************
+ *
+ * URI opener
+ *
+ */
+
+/**
+ * Initiate an FTP connection
+ *
+ * @v xfer		Data transfer interface
+ * @v uri		Uniform Resource Identifier
+ * @ret rc		Return status code
+ *
+ * Allocates a request for @c uri, opens the control connection to
+ * the URI's host (on FTP_PORT unless the URI names a port) and plugs
+ * the request into @c xfer.  Returns -EINVAL if the URI lacks a path
+ * or a host, -ENOMEM if allocation fails, or the error from opening
+ * the control connection.
+ */
+static int ftp_open ( struct xfer_interface *xfer, struct uri *uri ) {
+	struct sockaddr_tcpip remote;
+	struct ftp_request *request;
+	int rc;
+
+	/* Both a path and a host are required */
+	if ( ( ! uri->path ) || ( ! uri->host ) )
+		return -EINVAL;
+
+	/* Build the request */
+	request = zalloc ( sizeof ( *request ) );
+	if ( request == NULL )
+		return -ENOMEM;
+	request->refcnt.free = ftp_free;
+	xfer_init ( &request->xfer, &ftp_xfer_operations, &request->refcnt );
+	request->uri = uri_get ( uri );
+	xfer_init ( &request->control, &ftp_control_operations,
+		    &request->refcnt );
+	xfer_init ( &request->data, &ftp_data_operations, &request->refcnt );
+
+	/* The first thing expected on the control channel is a
+	 * status code.
+	 */
+	request->recvbuf = request->status_text;
+	request->recvsize = sizeof ( request->status_text ) - 1;
+
+	DBGC ( request, "FTP %p fetching %s\n", request, request->uri->path );
+
+	/* Connect the control channel */
+	memset ( &remote, 0, sizeof ( remote ) );
+	remote.st_port = htons ( uri_port ( uri, FTP_PORT ) );
+	rc = xfer_open_named_socket ( &request->control, SOCK_STREAM,
+				      ( struct sockaddr * ) &remote,
+				      uri->host, NULL );
+	if ( rc != 0 ) {
+		DBGC ( request, "FTP %p could not create request: %s\n",
+		       request, strerror ( rc ) );
+		ftp_done ( request, rc );
+		ref_put ( &request->refcnt );
+		return rc;
+	}
+
+	/* Attach to parent interface, mortalise self, and return */
+	xfer_plug_plug ( &request->xfer, xfer );
+	ref_put ( &request->refcnt );
+	return 0;
+}
+
+/** FTP URI opener
+ *
+ * Associates the "ftp" URI scheme with ftp_open().
+ */
+struct uri_opener ftp_uri_opener __uri_opener = {
+ .scheme = "ftp",
+ .open = ftp_open,
+};
diff --git a/gpxe/src/net/tcp/http.c b/gpxe/src/net/tcp/http.c
new file mode 100644
index 00000000..db92e9eb
--- /dev/null
+++ b/gpxe/src/net/tcp/http.c
@@ -0,0 +1,534 @@
+/*
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/**
+ * @file
+ *
+ * Hyper Text Transfer Protocol (HTTP)
+ *
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <byteswap.h>
+#include <errno.h>
+#include <assert.h>
+#include <gpxe/uri.h>
+#include <gpxe/refcnt.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/xfer.h>
+#include <gpxe/open.h>
+#include <gpxe/socket.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/process.h>
+#include <gpxe/linebuf.h>
+#include <gpxe/features.h>
+#include <gpxe/http.h>
+
+FEATURE ( FEATURE_PROTOCOL, "HTTP", DHCP_EB_FEATURE_HTTP, 1 );
+
+/**
+ * HTTP receive state
+ *
+ * Selects how http_socket_deliver_iob() treats incoming data.  The
+ * RESPONSE and HEADER values are also used as indices into
+ * http_line_handlers[].
+ */
+enum http_rx_state {
+ HTTP_RX_RESPONSE = 0, /**< Next line is handled by http_rx_response() */
+ HTTP_RX_HEADER, /**< Next line is handled by http_rx_header() */
+ HTTP_RX_DATA, /**< Buffers are passed to http_rx_data() */
+ HTTP_RX_DEAD, /**< Set by http_done(); incoming data is not processed */
+};
+
+/**
+ * An HTTP request
+ *
+ * Allocated (zeroed) by http_open_filter(), which installs
+ * http_free() as the free method of the embedded reference counter.
+ */
+struct http_request {
+ /** Reference count */
+ struct refcnt refcnt;
+ /** Data transfer interface */
+ struct xfer_interface xfer;
+
+ /** URI being fetched */
+ struct uri *uri;
+ /** Transport layer interface */
+ struct xfer_interface socket;
+
+ /** TX process (runs http_step()) */
+ struct process process;
+
+ /** HTTP response code (parsed from the status line) */
+ unsigned int response;
+ /** HTTP Content-Length (zero until a Content-Length header is parsed) */
+ size_t content_length;
+ /** Received length (body bytes counted by http_rx_data()) */
+ size_t rx_len;
+ /** RX state */
+ enum http_rx_state rx_state;
+ /** Line buffer for received header lines */
+ struct line_buffer linebuf;
+};
+
+/**
+ * Free HTTP request
+ *
+ * @v refcnt		Reference counter
+ *
+ * Installed as the reference counter's free method by
+ * http_open_filter().  Releases the URI taken with uri_get(), empties
+ * the header line buffer and frees the request structure itself.
+ */
+static void http_free ( struct refcnt *refcnt ) {
+	struct http_request *http =
+		container_of ( refcnt, struct http_request, refcnt );
+
+	uri_put ( http->uri );
+	empty_line_buffer ( &http->linebuf );
+	free ( http );
+}
+
+/**
+ * Mark HTTP request as complete
+ *
+ * @v http		HTTP request
+ * @v rc		Return status code
+ *
+ * Stops all further receive processing, removes the TX process and
+ * closes both data transfer interfaces with the final status.  If
+ * the caller reports success but a Content-Length was seen and the
+ * amount of data received differs from it, the status becomes -EIO.
+ */
+static void http_done ( struct http_request *http, int rc ) {
+
+	/* Prevent further processing of any current packet */
+	http->rx_state = HTTP_RX_DEAD;
+
+	/* If we had a Content-Length, and the received content length
+	 * isn't correct, flag an error.  An error that is already
+	 * being reported is left untouched, since it describes the
+	 * actual cause of the short transfer.
+	 */
+	if ( http->content_length &&
+	     ( http->content_length != http->rx_len ) ) {
+		DBGC ( http, "HTTP %p incorrect length %zd, should be %zd\n",
+		       http, http->rx_len, http->content_length );
+		if ( rc == 0 )
+			rc = -EIO;
+	}
+
+	/* Remove process */
+	process_del ( &http->process );
+
+	/* Close all data transfer interfaces */
+	xfer_nullify ( &http->socket );
+	xfer_close ( &http->socket, rc );
+	xfer_nullify ( &http->xfer );
+	xfer_close ( &http->xfer, rc );
+}
+
+/**
+ * Map an HTTP response code onto a return status code
+ *
+ * @v response		HTTP response code
+ * @ret rc		Return status code
+ *
+ * Only 200 is treated as success.  404 and 403 map to -ENOENT and
+ * -EPERM respectively; every other code maps to -EIO.
+ */
+static int http_response_to_rc ( unsigned int response ) {
+	if ( response == 200 )
+		return 0;
+	if ( response == 404 )
+		return -ENOENT;
+	if ( response == 403 )
+		return -EPERM;
+	return -EIO;
+}
+
+/**
+ * Handle the HTTP response status line
+ *
+ * @v http		HTTP request
+ * @v response		HTTP response
+ * @ret rc		Return status code
+ *
+ * The line must start with "HTTP/" and contain a space; the number
+ * following the first space is recorded as the response code.  On
+ * success the receive state advances to the header phase.
+ */
+static int http_rx_response ( struct http_request *http, char *response ) {
+	char *code;
+	int rc;
+
+	DBGC ( http, "HTTP %p response \"%s\"\n", http, response );
+
+	/* The status line must begin with the protocol name */
+	if ( strncmp ( response, "HTTP/", 5 ) != 0 )
+		return -EIO;
+
+	/* The response code follows the first space */
+	code = strchr ( response, ' ' );
+	if ( code == NULL )
+		return -EIO;
+	http->response = strtoul ( code, NULL, 10 );
+
+	/* Any response code that does not map to success is fatal */
+	rc = http_response_to_rc ( http->response );
+	if ( rc != 0 )
+		return rc;
+
+	/* Status line accepted; header lines come next */
+	http->rx_state = HTTP_RX_HEADER;
+	return 0;
+}
+
+/**
+ * Handle the HTTP Content-Length header
+ *
+ * @v http		HTTP request
+ * @v value		HTTP header value
+ * @ret rc		Return status code
+ *
+ * The value is parsed as a decimal number; any trailing characters
+ * cause -EIO to be returned.
+ */
+static int http_rx_content_length ( struct http_request *http,
+				    const char *value ) {
+	char *tail;
+
+	/* The entire value must be consumed by the conversion */
+	http->content_length = strtoul ( value, &tail, 10 );
+	if ( tail[0] != '\0' ) {
+		DBGC ( http, "HTTP %p invalid Content-Length \"%s\"\n",
+		       http, value );
+		return -EIO;
+	}
+
+	/* Seek to the end and then back to the start, so that the
+	 * recipient is told the file size
+	 */
+	xfer_seek ( &http->xfer, http->content_length, SEEK_SET );
+	xfer_seek ( &http->xfer, 0, SEEK_SET );
+
+	return 0;
+}
+
+/**
+ * An HTTP header handler
+ *
+ * Pairs a header name with the function that http_rx_header() calls
+ * when a header of that name is received.
+ */
+struct http_header_handler {
+ /** Name (e.g. "Content-Length") */
+ const char *header;
+ /** Handle received header
+ *
+ * @v http HTTP request
+ * @v value HTTP header value
+ * @ret rc Return status code
+ *
+ * If an error is returned, the download will be aborted.
+ */
+ int ( * rx ) ( struct http_request *http, const char *value );
+};
+
+/**
+ * List of HTTP header handlers
+ *
+ * Searched in order by http_rx_header() using a case-insensitive
+ * name comparison.  The entry with a NULL name terminates the list.
+ */
+static struct http_header_handler http_header_handlers[] = {
+ {
+ .header = "Content-Length",
+ .rx = http_rx_content_length,
+ },
+ { NULL, NULL }
+};
+
+/**
+ * Handle HTTP header
+ *
+ * @v http		HTTP request
+ * @v header		HTTP header
+ * @ret rc		Return status code
+ *
+ * An empty line ends the header phase and switches to the data
+ * phase.  Any other line is split into a name and a value at the
+ * first colon; whitespace between the colon and the value is
+ * optional and is skipped.  The line is modified in place.  A line
+ * with no colon is rejected with -EIO.  Headers with no registered
+ * handler are ignored.
+ */
+static int http_rx_header ( struct http_request *http, char *header ) {
+	struct http_header_handler *handler;
+	char *separator;
+	char *value;
+	int rc;
+
+	/* An empty header line marks the transition to the data phase */
+	if ( ! header[0] ) {
+		DBGC ( http, "HTTP %p start of data\n", http );
+		empty_line_buffer ( &http->linebuf );
+		http->rx_state = HTTP_RX_DATA;
+		return 0;
+	}
+
+	DBGC ( http, "HTTP %p header \"%s\"\n", http, header );
+
+	/* Split header at the first ':'.  A field name cannot contain
+	 * a colon, so this is always the name/value separator even
+	 * when the value itself contains ": ".
+	 */
+	separator = strchr ( header, ':' );
+	if ( ! separator ) {
+		DBGC ( http, "HTTP %p malformed header\n", http );
+		return -EIO;
+	}
+	*separator = '\0';
+
+	/* Skip the optional whitespace preceding the value */
+	value = ( separator + 1 );
+	while ( ( *value == ' ' ) || ( *value == '\t' ) )
+		value++;
+
+	/* Hand off to header handler, if one exists */
+	for ( handler = http_header_handlers ; handler->header ; handler++ ) {
+		if ( strcasecmp ( header, handler->header ) == 0 ) {
+			if ( ( rc = handler->rx ( http, value ) ) != 0 )
+				return rc;
+			break;
+		}
+	}
+	return 0;
+}
+
+/**
+ * An HTTP line-based data handler
+ *
+ * Called by http_socket_deliver_iob() with each complete line
+ * received during the line-based receive phases.
+ */
+struct http_line_handler {
+ /** Handle line
+ *
+ * @v http HTTP request
+ * @v line Line to handle
+ * @ret rc Return status code
+ */
+ int ( * rx ) ( struct http_request *http, char *line );
+};
+
+/**
+ * List of HTTP line-based data handlers
+ *
+ * Indexed by receive state; only the response and header states
+ * have entries.
+ */
+static struct http_line_handler http_line_handlers[] = {
+ [HTTP_RX_RESPONSE] = { .rx = http_rx_response },
+ [HTTP_RX_HEADER] = { .rx = http_rx_header },
+};
+
+/**
+ * Handle body data arriving via the HTTP connection
+ *
+ * @v http		HTTP request
+ * @v iobuf		I/O buffer
+ * @ret rc		Return status code
+ *
+ * The buffer is counted towards the received length and passed on
+ * to the data transfer interface.  When a Content-Length is known
+ * and has been reached, the request is completed.
+ */
+static int http_rx_data ( struct http_request *http,
+			  struct io_buffer *iobuf ) {
+	int rc;
+
+	/* Count this buffer before passing it on */
+	http->rx_len += iob_len ( iobuf );
+
+	/* Deliver the payload to the data transfer interface */
+	rc = xfer_deliver_iob ( &http->xfer, iobuf );
+	if ( rc != 0 )
+		return rc;
+
+	/* With no Content-Length there is no known end point */
+	if ( ! http->content_length )
+		return 0;
+
+	/* Finish as soon as the advertised length has arrived */
+	if ( http->rx_len >= http->content_length )
+		http_done ( http, 0 );
+
+	return 0;
+}
+
+/**
+ * Handle new data arriving via HTTP connection
+ *
+ * @v socket Transport layer interface
+ * @v iobuf I/O buffer
+ * @v meta Data transfer metadata, or NULL
+ * @ret rc Return status code
+ *
+ * Consumes the buffer according to the current receive state.  In
+ * the response and header states the data is fed through the line
+ * buffer and each complete line is passed to the handler for that
+ * state; once the data state is reached, whatever remains of the
+ * buffer is handed to http_rx_data().  Any error terminates the
+ * request via http_done().
+ */
+static int http_socket_deliver_iob ( struct xfer_interface *socket,
+ struct io_buffer *iobuf,
+ struct xfer_metadata *meta __unused ) {
+ struct http_request *http =
+ container_of ( socket, struct http_request, socket );
+ struct http_line_handler *lh;
+ char *line;
+ ssize_t len;
+ int rc = 0;
+
+ while ( iob_len ( iobuf ) ) {
+ switch ( http->rx_state ) {
+ case HTTP_RX_DEAD:
+ /* Do no further processing; the rest of the
+ * buffer is discarded
+ */
+ goto done;
+ case HTTP_RX_DATA:
+ /* Once we're into the data phase, just fill
+ * the data buffer
+ */
+ rc = http_rx_data ( http, iobuf );
+ iobuf = NULL; /* handed to http_rx_data(); not ours to free below */
+ goto done;
+ case HTTP_RX_RESPONSE:
+ case HTTP_RX_HEADER:
+ /* In the other phases, buffer and process a
+ * line at a time.  A negative length from
+ * line_buffer() is treated as an error code;
+ * otherwise it is the number of bytes consumed.
+ */
+ len = line_buffer ( &http->linebuf, iobuf->data,
+ iob_len ( iobuf ) );
+ if ( len < 0 ) {
+ rc = len;
+ DBGC ( http, "HTTP %p could not buffer line: "
+ "%s\n", http, strerror ( rc ) );
+ goto done;
+ }
+ iob_pull ( iobuf, len );
+ line = buffered_line ( &http->linebuf );
+ if ( line ) {
+ lh = &http_line_handlers[http->rx_state]; /* state doubles as table index */
+ if ( ( rc = lh->rx ( http, line ) ) != 0 )
+ goto done;
+ }
+ break;
+ default:
+ assert ( 0 );
+ break;
+ }
+ }
+
+ done:
+ if ( rc )
+ http_done ( http, rc );
+ free_iob ( iobuf );
+ return rc;
+}
+
+/**
+ * HTTP process
+ *
+ * @v process		Process
+ *
+ * Waits until the socket reports a non-zero transmit window, then
+ * removes itself from the process list and sends the GET request.
+ * A missing path is sent as "/", and the query string is appended
+ * after a "?" when present.  A transmit failure ends the request.
+ */
+static void http_step ( struct process *process ) {
+	struct http_request *http =
+		container_of ( process, struct http_request, process );
+	struct uri *uri = http->uri;
+	int rc;
+
+	/* Nothing to do until the socket can accept data */
+	if ( ! xfer_window ( &http->socket ) )
+		return;
+
+	/* The request is sent exactly once */
+	process_del ( &http->process );
+	rc = xfer_printf ( &http->socket,
+			   "GET %s%s%s HTTP/1.1\r\n"
+			   "User-Agent: gPXE/" VERSION "\r\n"
+			   "Host: %s\r\n"
+			   "\r\n",
+			   ( uri->path ? uri->path : "/" ),
+			   ( uri->query ? "?" : "" ),
+			   ( uri->query ? uri->query : "" ),
+			   uri->host );
+	if ( rc != 0 )
+		http_done ( http, rc );
+}
+
+/**
+ * Handle closure of the HTTP connection by the network stack
+ *
+ * @v socket		Transport layer interface
+ * @v rc		Reason for close
+ *
+ * The request is completed with the reason given for the close.
+ */
+static void http_socket_close ( struct xfer_interface *socket, int rc ) {
+	struct http_request *request =
+		container_of ( socket, struct http_request, socket );
+
+	DBGC ( request, "HTTP %p socket closed: %s\n",
+	       request, strerror ( rc ) );
+
+	http_done ( request, rc );
+}
+
+/**
+ * HTTP socket operations
+ *
+ * Used for the transport layer interface of each request.  Closure
+ * and I/O buffer delivery are handled by this file; raw delivery is
+ * routed through xfer_deliver_as_iob.
+ */
+static struct xfer_interface_operations http_socket_operations = {
+ .close = http_socket_close,
+ .vredirect = xfer_vopen,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = http_socket_deliver_iob,
+ .deliver_raw = xfer_deliver_as_iob,
+};
+
+/**
+ * Handle closure of the HTTP data transfer interface
+ *
+ * @v xfer		Data transfer interface
+ * @v rc		Reason for close
+ *
+ * The request is completed with the reason given for the close.
+ */
+static void http_xfer_close ( struct xfer_interface *xfer, int rc ) {
+	struct http_request *request =
+		container_of ( xfer, struct http_request, xfer );
+
+	DBGC ( request, "HTTP %p interface closed: %s\n",
+	       request, strerror ( rc ) );
+
+	http_done ( request, rc );
+}
+
+/**
+ * HTTP data transfer interface operations
+ *
+ * Used for the interface that is plugged into the parent by
+ * http_open_filter().  Only closure is handled by this file; raw
+ * data delivered to this interface is ignored.
+ */
+static struct xfer_interface_operations http_xfer_operations = {
+ .close = http_xfer_close,
+ .vredirect = ignore_xfer_vredirect,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = xfer_deliver_as_raw,
+ .deliver_raw = ignore_xfer_deliver_raw,
+};
+
+/**
+ * Open an HTTP connection, optionally through a filter
+ *
+ * @v xfer		Data transfer interface
+ * @v uri		Uniform Resource Identifier
+ * @v default_port	Default port number
+ * @v filter		Filter to apply to socket, or NULL
+ * @ret rc		Return status code
+ *
+ * Returns -EINVAL if the URI has no host and -ENOMEM if the request
+ * cannot be allocated.  When a filter is given it is called with
+ * the request's socket interface and supplies the interface on
+ * which the TCP connection is then opened.
+ */
+int http_open_filter ( struct xfer_interface *xfer, struct uri *uri,
+		       unsigned int default_port,
+		       int ( * filter ) ( struct xfer_interface *xfer,
+					  struct xfer_interface **next ) ) {
+	struct xfer_interface *transport;
+	struct sockaddr_tcpip peer;
+	struct http_request *http;
+	int rc;
+
+	/* A host is mandatory */
+	if ( uri->host == NULL )
+		return -EINVAL;
+
+	/* Create the request and initialise its components */
+	http = zalloc ( sizeof ( *http ) );
+	if ( http == NULL )
+		return -ENOMEM;
+	http->refcnt.free = http_free;
+	xfer_init ( &http->xfer, &http_xfer_operations, &http->refcnt );
+	http->uri = uri_get ( uri );
+	xfer_init ( &http->socket, &http_socket_operations, &http->refcnt );
+	process_init ( &http->process, http_step, &http->refcnt );
+
+	/* Work out where to connect */
+	memset ( &peer, 0, sizeof ( peer ) );
+	peer.st_port = htons ( uri_port ( http->uri, default_port ) );
+
+	/* Insert the filter (if any), then open the connection */
+	transport = &http->socket;
+	rc = ( filter ? filter ( transport, &transport ) : 0 );
+	if ( rc == 0 ) {
+		rc = xfer_open_named_socket ( transport, SOCK_STREAM,
+					      ( struct sockaddr * ) &peer,
+					      uri->host, NULL );
+	}
+	if ( rc != 0 ) {
+		DBGC ( http, "HTTP %p could not create request: %s\n",
+		       http, strerror ( rc ) );
+		http_done ( http, rc );
+		ref_put ( &http->refcnt );
+		return rc;
+	}
+
+	/* Attach to the parent interface and drop our own reference */
+	xfer_plug_plug ( &http->xfer, xfer );
+	ref_put ( &http->refcnt );
+	return 0;
+}
+
+/**
+ * Open a plain HTTP connection
+ *
+ * @v xfer		Data transfer interface
+ * @v uri		Uniform Resource Identifier
+ * @ret rc		Return status code
+ *
+ * Uses HTTP_PORT as the default port and applies no socket filter.
+ */
+static int http_open ( struct xfer_interface *xfer, struct uri *uri ) {
+	int rc;
+
+	rc = http_open_filter ( xfer, uri, HTTP_PORT, NULL );
+	return rc;
+}
+
+/**
+ * HTTP URI opener
+ *
+ * Associates the "http" URI scheme with http_open().
+ */
+struct uri_opener http_uri_opener __uri_opener = {
+ .scheme = "http",
+ .open = http_open,
+};
diff --git a/gpxe/src/net/tcp/https.c b/gpxe/src/net/tcp/https.c
new file mode 100644
index 00000000..15ab32ef
--- /dev/null
+++ b/gpxe/src/net/tcp/https.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/**
+ * @file
+ *
+ * Secure Hyper Text Transfer Protocol (HTTPS)
+ *
+ */
+
+#include <stddef.h>
+#include <gpxe/open.h>
+#include <gpxe/tls.h>
+#include <gpxe/http.h>
+#include <gpxe/features.h>
+
+FEATURE ( FEATURE_PROTOCOL, "HTTPS", DHCP_EB_FEATURE_HTTPS, 1 );
+
+/**
+ * Open an HTTPS connection
+ *
+ * @v xfer		Data transfer interface
+ * @v uri		Uniform Resource Identifier
+ * @ret rc		Return status code
+ *
+ * Uses HTTPS_PORT as the default port and add_tls as the socket
+ * filter.
+ */
+static int https_open ( struct xfer_interface *xfer, struct uri *uri ) {
+	int rc;
+
+	rc = http_open_filter ( xfer, uri, HTTPS_PORT, add_tls );
+	return rc;
+}
+
+/**
+ * HTTPS URI opener
+ *
+ * Associates the "https" URI scheme with https_open().
+ */
+struct uri_opener https_uri_opener __uri_opener = {
+ .scheme = "https",
+ .open = https_open,
+};
diff --git a/gpxe/src/net/tcp/iscsi.c b/gpxe/src/net/tcp/iscsi.c
new file mode 100644
index 00000000..c01ca44b
--- /dev/null
+++ b/gpxe/src/net/tcp/iscsi.c
@@ -0,0 +1,1726 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <byteswap.h>
+#include <gpxe/vsprintf.h>
+#include <gpxe/socket.h>
+#include <gpxe/xfer.h>
+#include <gpxe/open.h>
+#include <gpxe/scsi.h>
+#include <gpxe/process.h>
+#include <gpxe/uaccess.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/settings.h>
+#include <gpxe/features.h>
+#include <gpxe/iscsi.h>
+
+/** @file
+ *
+ * iSCSI protocol
+ *
+ */
+
+FEATURE ( FEATURE_PROTOCOL, "iSCSI", DHCP_EB_FEATURE_ISCSI, 1 );
+
+/** iSCSI initiator name (explicitly specified); NULL until assigned */
+static char *iscsi_explicit_initiator_iqn;
+
+/** Default iSCSI initiator name (constructed from hostname); NULL until assigned */
+static char *iscsi_default_initiator_iqn;
+
+/** iSCSI username; NULL until assigned */
+static char *iscsi_username;
+
+/** iSCSI password; NULL until assigned */
+static char *iscsi_password;
+
+static void iscsi_start_tx ( struct iscsi_session *iscsi ); /* forward declaration */
+static void iscsi_start_login ( struct iscsi_session *iscsi ); /* called by iscsi_open_connection() ahead of its definition */
+static void iscsi_start_data_out ( struct iscsi_session *iscsi,
+ unsigned int datasn ); /* called by iscsi_rx_r2t() ahead of its definition */
+
+/**
+ * Discard the buffer used for receiving PDU data
+ *
+ * @v iscsi		iSCSI session
+ *
+ * Frees the receive buffer (if any) and clears the session's
+ * pointer to it.
+ */
+static void iscsi_rx_buffered_data_done ( struct iscsi_session *iscsi ) {
+	void *buffer = iscsi->rx_buffer;
+
+	iscsi->rx_buffer = NULL;
+	free ( buffer );
+}
+
+/**
+ * Free iSCSI session
+ *
+ * @v refcnt		Reference counter
+ *
+ * Releases the CHAP state, the receive buffer, the stored target
+ * and credential strings, and finally the session itself.
+ */
+static void iscsi_free ( struct refcnt *refcnt ) {
+	struct iscsi_session *session =
+		container_of ( refcnt, struct iscsi_session, refcnt );
+
+	/* Temporary login and receive state */
+	chap_finish ( &session->chap );
+	iscsi_rx_buffered_data_done ( session );
+
+	/* Strings owned by the session */
+	free ( session->target_address );
+	free ( session->target_iqn );
+	free ( session->username );
+	free ( session->password );
+
+	/* The session itself */
+	free ( session );
+}
+
+/**
+ * Open iSCSI transport-layer connection
+ *
+ * @v iscsi		iSCSI session
+ * @ret rc		Return status code
+ *
+ * Opens a TCP socket to the session's target address and port,
+ * enters the security negotiation phase, advances the initiator
+ * task tag and starts the login request.  The TX and RX state
+ * machines are expected to be idle on entry.
+ */
+static int iscsi_open_connection ( struct iscsi_session *iscsi ) {
+	struct sockaddr_tcpip peer;
+	int rc;
+
+	/* Both state machines must be at rest */
+	assert ( iscsi->tx_state == ISCSI_TX_IDLE );
+	assert ( iscsi->rx_state == ISCSI_RX_BHS );
+	assert ( iscsi->rx_offset == 0 );
+
+	/* Connect to the target */
+	memset ( &peer, 0, sizeof ( peer ) );
+	peer.st_port = htons ( iscsi->target_port );
+	rc = xfer_open_named_socket ( &iscsi->socket, SOCK_STREAM,
+				      ( struct sockaddr * ) &peer,
+				      iscsi->target_address, NULL );
+	if ( rc != 0 ) {
+		DBGC ( iscsi, "iSCSI %p could not open socket: %s\n",
+		       iscsi, strerror ( rc ) );
+		return rc;
+	}
+
+	/* Login begins with security negotiation */
+	iscsi->status = ( ISCSI_STATUS_SECURITY_NEGOTIATION_PHASE |
+			  ISCSI_STATUS_STRINGS_SECURITY );
+
+	/* Use a new initiator task tag for this login */
+	iscsi->itt++;
+
+	/* Send the first login request */
+	iscsi_start_login ( iscsi );
+
+	return 0;
+}
+
+/**
+ * Close iSCSI transport-layer connection
+ *
+ * @v iscsi		iSCSI session
+ * @v rc		Reason for close
+ *
+ * Closes the socket, then returns the session to the state
+ * expected by iscsi_open_connection() so that a new login can be
+ * attempted.
+ */
+static void iscsi_close_connection ( struct iscsi_session *iscsi, int rc ) {
+
+	/* Shut down the transport */
+	xfer_close ( &iscsi->socket, rc );
+
+	/* Discard temporary allocations from the old connection */
+	chap_finish ( &iscsi->chap );
+	iscsi_rx_buffered_data_done ( iscsi );
+
+	/* Return both state machines to their starting points */
+	iscsi->rx_offset = 0;
+	iscsi->rx_state = ISCSI_RX_BHS;
+	iscsi->tx_state = ISCSI_TX_IDLE;
+
+	/* No login phase or pending strings remain */
+	iscsi->status = 0;
+}
+
+/**
+ * Mark iSCSI SCSI operation as complete
+ *
+ * @v iscsi		iSCSI session
+ * @v rc		Return status code
+ *
+ * Records the status and forgets the current command.  The
+ * connection is left open, so this must only be called when the
+ * internal state machines are in an appropriate state; otherwise
+ * the next call to iscsi_issue() may misbehave.  The general rule
+ * is to call it only at the end of receiving a PDU, when both the
+ * TX and RX engines should be idle.
+ */
+static void iscsi_scsi_done ( struct iscsi_session *iscsi, int rc ) {
+
+	assert ( iscsi->tx_state == ISCSI_TX_IDLE );
+
+	iscsi->rc = rc;
+	iscsi->command = NULL;
+}
+
+/****************************************************************************
+ *
+ * iSCSI SCSI command issuing
+ *
+ */
+
+/**
+ * Build iSCSI SCSI command BHS
+ *
+ * @v iscsi		iSCSI session
+ *
+ * Starts transmission of a SCSI command PDU for the session's
+ * current command.  Bidirectional commands (with both Data-In and
+ * Data-Out segments) are not supported, since they would need code
+ * to generate an AHS; because at most one direction is in use, the
+ * expected transfer length is the OR of the two lengths.
+ */
+static void iscsi_start_command ( struct iscsi_session *iscsi ) {
+	struct iscsi_bhs_scsi_command *bhs = &iscsi->tx_bhs.scsi_command;
+
+	assert ( ! ( iscsi->command->data_in && iscsi->command->data_out ) );
+
+	/* Begin a new PDU before filling in any header field */
+	iscsi_start_tx ( iscsi );
+
+	/* Opcode and flags; the direction flag follows the buffer in use */
+	bhs->opcode = ISCSI_OPCODE_SCSI_COMMAND;
+	bhs->flags = ( ISCSI_FLAG_FINAL |
+		       ISCSI_COMMAND_ATTR_SIMPLE );
+	if ( iscsi->command->data_in )
+		bhs->flags |= ISCSI_COMMAND_FLAG_READ;
+	if ( iscsi->command->data_out )
+		bhs->flags |= ISCSI_COMMAND_FLAG_WRITE;
+
+	/* Addressing and sequencing (the lengths field stays zero) */
+	bhs->lun = iscsi->lun;
+	bhs->itt = htonl ( ++iscsi->itt );
+	bhs->exp_len = htonl ( iscsi->command->data_in_len |
+			       iscsi->command->data_out_len );
+	bhs->cmdsn = htonl ( iscsi->cmdsn );
+	bhs->expstatsn = htonl ( iscsi->statsn + 1 );
+
+	/* The command descriptor block itself */
+	memcpy ( &bhs->cdb, &iscsi->command->cdb, sizeof ( bhs->cdb ) );
+
+	DBGC ( iscsi, "iSCSI %p start " SCSI_CDB_FORMAT " %s %#zx\n",
+	       iscsi, SCSI_CDB_DATA ( bhs->cdb ),
+	       ( iscsi->command->data_in ? "in" : "out" ),
+	       ( iscsi->command->data_in ?
+		 iscsi->command->data_in_len : iscsi->command->data_out_len ));
+}
+
+/**
+ * Receive data segment of an iSCSI SCSI response PDU
+ *
+ * @v iscsi		iSCSI session
+ * @v data		Received data
+ * @v len		Length of received data
+ * @v remaining		Data remaining after this data
+ * @ret rc		Return status code
+ *
+ * May be called several times for one PDU as the data segment
+ * arrives in pieces.  The sense response code is captured from
+ * whichever piece contains it.  Once the final piece has arrived
+ * the SCSI status is recorded and the command is completed, or
+ * -EIO is returned if the target did not report command completion.
+ */
+static int iscsi_rx_scsi_response ( struct iscsi_session *iscsi,
+				    const void *data, size_t len,
+				    size_t remaining ) {
+	struct iscsi_bhs_scsi_response *response
+		= &iscsi->rx_bhs.scsi_response;
+	int sense_offset;
+
+	/* Capture the sense response code as it floats past, if
+	 * present.  The code must lie within this piece of the data
+	 * segment: it is neither before the start (negative offset)
+	 * nor beyond the end (offset not less than len).
+	 */
+	sense_offset = ISCSI_SENSE_RESPONSE_CODE_OFFSET - iscsi->rx_offset;
+	if ( ( sense_offset >= 0 ) && ( ( size_t ) sense_offset < len ) ) {
+		iscsi->command->sense_response =
+			* ( ( const char * ) data + sense_offset );
+	}
+
+	/* Wait for whole SCSI response to arrive */
+	if ( remaining )
+		return 0;
+
+	/* Record SCSI status code */
+	iscsi->command->status = response->status;
+
+	/* Check for errors */
+	if ( response->response != ISCSI_RESPONSE_COMMAND_COMPLETE )
+		return -EIO;
+
+	/* Mark as completed */
+	iscsi_scsi_done ( iscsi, 0 );
+	return 0;
+}
+
+/**
+ * Receive data segment of an iSCSI data-in PDU
+ *
+ * @v iscsi		iSCSI session
+ * @v data		Received data
+ * @v len		Length of received data
+ * @v remaining		Data remaining after this data
+ * @ret rc		Return status code
+ *
+ * Copies the received data into the current command's data-in
+ * buffer at the offset given by the PDU header plus the position
+ * within the data segment.  Data that would fall outside the
+ * buffer is rejected with -EIO.  When the final piece of a PDU
+ * carrying status arrives, the command is completed.
+ */
+static int iscsi_rx_data_in ( struct iscsi_session *iscsi,
+			      const void *data, size_t len,
+			      size_t remaining ) {
+	struct iscsi_bhs_data_in *data_in = &iscsi->rx_bhs.data_in;
+	unsigned long offset;
+
+	/* Copy data to data-in buffer */
+	offset = ntohl ( data_in->offset ) + iscsi->rx_offset;
+	assert ( iscsi->command != NULL );
+	assert ( iscsi->command->data_in );
+
+	/* The offset is supplied by the target, so it must be
+	 * checked at run time; an assertion is not sufficient since
+	 * assertions may be compiled out.  The comparison is written
+	 * so that it cannot overflow.
+	 */
+	if ( ( offset > iscsi->command->data_in_len ) ||
+	     ( len > ( iscsi->command->data_in_len - offset ) ) ) {
+		DBGC ( iscsi, "iSCSI %p data-in exceeds buffer (offset %#lx "
+		       "len %#zx)\n", iscsi, offset, len );
+		return -EIO;
+	}
+	copy_to_user ( iscsi->command->data_in, offset, data, len );
+
+	/* Wait for whole SCSI response to arrive */
+	if ( remaining )
+		return 0;
+
+	/* Mark as completed if status is present */
+	if ( data_in->flags & ISCSI_DATA_FLAG_STATUS ) {
+		assert ( ( offset + len ) == iscsi->command->data_in_len );
+		assert ( data_in->flags & ISCSI_FLAG_FINAL );
+		iscsi->command->status = data_in->status;
+		/* iSCSI cannot return an error status via a data-in */
+		iscsi_scsi_done ( iscsi, 0 );
+	}
+
+	return 0;
+}
+
+/**
+ * Receive data segment of an iSCSI R2T PDU
+ *
+ * @v iscsi		iSCSI session
+ * @v data		Received data
+ * @v len		Length of received data
+ * @v remaining		Data remaining after this data
+ * @ret rc		Return status code
+ *
+ * Only the PDU header is used: the target transfer tag, offset and
+ * length are stored in the session and the first data-out PDU
+ * (DataSN zero) is started.  Always returns zero.
+ */
+static int iscsi_rx_r2t ( struct iscsi_session *iscsi,
+			  const void *data __unused, size_t len __unused,
+			  size_t remaining __unused ) {
+	struct iscsi_bhs_r2t *request = &iscsi->rx_bhs.r2t;
+
+	/* Remember what the target has asked for */
+	iscsi->transfer_len = ntohl ( request->len );
+	iscsi->transfer_offset = ntohl ( request->offset );
+	iscsi->ttt = ntohl ( request->ttt );
+
+	/* Begin the requested transfer */
+	iscsi_start_data_out ( iscsi, 0 );
+
+	return 0;
+}
+
+/**
+ * Build iSCSI data-out BHS
+ *
+ * @v iscsi		iSCSI session
+ * @v datasn		Data sequence number within the transfer
+ *
+ * Every data-out PDU carries at most 512 bytes, so PDU number
+ * datasn starts 512 * datasn bytes into the requested transfer.
+ * The PDU that carries all of the remaining data is flagged as
+ * final.
+ */
+static void iscsi_start_data_out ( struct iscsi_session *iscsi,
+				   unsigned int datasn ) {
+	struct iscsi_bhs_data_out *bhs = &iscsi->tx_bhs.data_out;
+	unsigned long offset;
+	unsigned long remaining;
+	unsigned long len;
+
+	/* Fixed 512-byte PDUs mean that the target's
+	 * MaxRecvDataSegmentLength never needs to be considered
+	 */
+	offset = datasn * 512;
+	remaining = iscsi->transfer_len - offset;
+	len = ( ( remaining > 512 ) ? 512 : remaining );
+
+	/* Begin a new PDU, then fill in the header fields */
+	iscsi_start_tx ( iscsi );
+	bhs->opcode = ISCSI_OPCODE_DATA_OUT;
+	if ( len == remaining )
+		bhs->flags = ( ISCSI_FLAG_FINAL );
+	ISCSI_SET_LENGTHS ( bhs->lengths, 0, len );
+	bhs->lun = iscsi->lun;
+	bhs->itt = htonl ( iscsi->itt );
+	bhs->ttt = htonl ( iscsi->ttt );
+	bhs->expstatsn = htonl ( iscsi->statsn + 1 );
+	bhs->datasn = htonl ( datasn );
+	bhs->offset = htonl ( iscsi->transfer_offset + offset );
+	DBGC ( iscsi, "iSCSI %p start data out DataSN %#x len %#lx\n",
+	       iscsi, datasn, len );
+}
+
+/**
+ * Complete iSCSI data-out PDU transmission
+ *
+ * @v iscsi		iSCSI session
+ *
+ * Unless the PDU just sent was flagged as final, starts the
+ * data-out PDU with the next data sequence number.
+ */
+static void iscsi_data_out_done ( struct iscsi_session *iscsi ) {
+	struct iscsi_bhs_data_out *sent = &iscsi->tx_bhs.data_out;
+
+	/* The final PDU ends the sequence */
+	if ( sent->flags & ISCSI_FLAG_FINAL )
+		return;
+
+	/* Otherwise carry on with the next PDU */
+	iscsi_start_data_out ( iscsi, ntohl ( sent->datasn ) + 1 );
+}
+
+/**
+ * Send iSCSI data-out data segment
+ *
+ * @v iscsi		iSCSI session
+ * @ret rc		Return status code
+ *
+ * The offset and length are taken from the data-out header that
+ * has already been built.  Returns -ENOMEM if no I/O buffer can be
+ * allocated.
+ */
+static int iscsi_tx_data_out ( struct iscsi_session *iscsi ) {
+	struct iscsi_bhs_data_out *bhs = &iscsi->tx_bhs.data_out;
+	unsigned long offset = ntohl ( bhs->offset );
+	size_t len = ISCSI_DATA_LEN ( bhs->lengths );
+	struct io_buffer *iobuf;
+
+	/* The segment must lie within the command's data-out buffer */
+	assert ( iscsi->command != NULL );
+	assert ( iscsi->command->data_out );
+	assert ( ( offset + len ) <= iscsi->command->data_out_len );
+
+	/* Obtain a buffer from the socket */
+	iobuf = xfer_alloc_iob ( &iscsi->socket, len );
+	if ( iobuf == NULL )
+		return -ENOMEM;
+
+	/* Fill it from the data-out buffer and send it */
+	copy_from_user ( iob_put ( iobuf, len ),
+			 iscsi->command->data_out, offset, len );
+	return xfer_deliver_iob ( &iscsi->socket, iobuf );
+}
+
+/****************************************************************************
+ *
+ * iSCSI login
+ *
+ */
+
+/**
+ * Build iSCSI login request strings
+ *
+ * @v iscsi iSCSI session
+ * @v data Buffer to receive the strings (may be NULL when len is zero)
+ * @v len Length of buffer
+ * @ret used Sum of the lengths reported by ssnprintf()
+ *
+ * Which groups of strings are produced depends on the "strings to
+ * send" flags in the session status.  Each key=value pair is
+ * followed by a NUL byte.  iscsi_start_login() calls this with no
+ * buffer to obtain the data segment length, and
+ * iscsi_tx_login_request() calls it again to fill the real buffer
+ * (presumably ssnprintf() reports the full length even when the
+ * buffer is too small -- confirm against its definition).
+ *
+ * These are the initial set of strings sent in the first login
+ * request PDU. We want the following settings:
+ *
+ * HeaderDigest=None
+ * DataDigest=None
+ * MaxConnections is irrelevant; we make only one connection anyway
+ * InitialR2T=Yes [1]
+ * ImmediateData is irrelevant; we never send immediate data
+ * MaxRecvDataSegmentLength=8192 (default; we don't care) [3]
+ * MaxBurstLength=262144 (default; we don't care) [3]
+ * FirstBurstLength=262144 (default; we don't care)
+ * DefaultTime2Wait=0 [2]
+ * DefaultTime2Retain=0 [2]
+ * MaxOutstandingR2T=1
+ * DataPDUInOrder=Yes
+ * DataSequenceInOrder=Yes
+ * ErrorRecoveryLevel=0
+ *
+ * [1] InitialR2T has an OR resolution function, so the target may
+ * force us to use it. We therefore simplify our logic by always
+ * using it.
+ *
+ * [2] These ensure that we can safely start a new task once we have
+ * reconnected after a failure, without having to manually tidy up
+ * after the old one.
+ *
+ * [3] We are quite happy to use the RFC-defined default values for
+ * these parameters, but some targets (notably OpenSolaris)
+ * incorrectly assume a default value of zero, so we explicitly
+ * specify the default values.
+ */
+static int iscsi_build_login_request_strings ( struct iscsi_session *iscsi,
+ void *data, size_t len ) {
+ unsigned int used = 0;
+ unsigned int i;
+
+ if ( iscsi->status & ISCSI_STATUS_STRINGS_SECURITY ) {
+ used += ssnprintf ( data + used, len - used,
+ "InitiatorName=%s%c"
+ "TargetName=%s%c"
+ "SessionType=Normal%c"
+ "AuthMethod=CHAP,None%c",
+ iscsi_initiator_iqn(), 0,
+ iscsi->target_iqn, 0, 0, 0 );
+ }
+
+ if ( iscsi->status & ISCSI_STATUS_STRINGS_CHAP_ALGORITHM ) {
+ used += ssnprintf ( data + used, len - used, "CHAP_A=5%c", 0 );
+ }
+
+ if ( ( iscsi->status & ISCSI_STATUS_STRINGS_CHAP_RESPONSE ) &&
+ iscsi->username ) {
+ used += ssnprintf ( data + used, len - used,
+ "CHAP_N=%s%cCHAP_R=0x",
+ iscsi->username, 0 );
+ for ( i = 0 ; i < iscsi->chap.response_len ; i++ ) { /* response bytes as hex digits */
+ used += ssnprintf ( data + used, len - used, "%02x",
+ iscsi->chap.response[i] );
+ }
+ used += ssnprintf ( data + used, len - used, "%c", 0 ); /* NUL ends CHAP_R */
+ }
+
+ if ( iscsi->status & ISCSI_STATUS_STRINGS_OPERATIONAL ) {
+ used += ssnprintf ( data + used, len - used,
+ "HeaderDigest=None%c"
+ "DataDigest=None%c"
+ "InitialR2T=Yes%c"
+ "MaxRecvDataSegmentLength=8192%c"
+ "MaxBurstLength=262144%c"
+ "DefaultTime2Wait=0%c"
+ "DefaultTime2Retain=0%c"
+ "MaxOutstandingR2T=1%c"
+ "DataPDUInOrder=Yes%c"
+ "DataSequenceInOrder=Yes%c"
+ "ErrorRecoveryLevel=0%c",
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 );
+ }
+
+ return used;
+}
+
+/**
+ * Build iSCSI login request BHS
+ *
+ * @v iscsi		iSCSI session
+ *
+ * Starts transmission of a login request for the current login
+ * phase.  The data segment length is found by building the login
+ * strings with no output buffer.
+ */
+static void iscsi_start_login ( struct iscsi_session *iscsi ) {
+	struct iscsi_bhs_login_request *bhs = &iscsi->tx_bhs.login_request;
+	int strings_len;
+
+	/* Begin a new PDU, then fill in the header fields */
+	iscsi_start_tx ( iscsi );
+	bhs->opcode = ( ISCSI_OPCODE_LOGIN_REQUEST |
+			ISCSI_FLAG_IMMEDIATE );
+	bhs->flags = ( ( iscsi->status & ISCSI_STATUS_PHASE_MASK ) |
+		       ISCSI_LOGIN_FLAG_TRANSITION );
+
+	/* Measure the strings that will form the data segment */
+	strings_len = iscsi_build_login_request_strings ( iscsi, NULL, 0 );
+	ISCSI_SET_LENGTHS ( bhs->lengths, 0, strings_len );
+
+	/* Session identification; version_max, version_min,
+	 * isid_iana_qual and cid all stay zero
+	 */
+	bhs->isid_iana_en = htonl ( ISCSI_ISID_IANA |
+				    IANA_EN_FEN_SYSTEMS );
+	bhs->tsih = htons ( iscsi->tsih );
+	bhs->itt = htonl ( iscsi->itt );
+
+	/* Sequence numbers */
+	bhs->cmdsn = htonl ( iscsi->cmdsn );
+	bhs->expstatsn = htonl ( iscsi->statsn + 1 );
+}
+
+/**
+ * Complete iSCSI login request PDU transmission
+ *
+ * @v iscsi		iSCSI session
+ *
+ * Releases the CHAP state used while building the request and
+ * clears every "strings to send" flag.
+ */
+static void iscsi_login_request_done ( struct iscsi_session *iscsi ) {
+
+	/* Login-time allocations are no longer required */
+	chap_finish ( &iscsi->chap );
+
+	/* All pending strings have now been sent */
+	iscsi->status &= ~ISCSI_STATUS_STRINGS_MASK;
+}
+
+/**
+ * Transmit data segment of an iSCSI login request PDU
+ *
+ * @v iscsi		iSCSI session
+ * @ret rc		Return status code
+ *
+ * The data segment of a login request consists of the login
+ * strings; its length is taken from the header that has already
+ * been built.  Returns -ENOMEM if no I/O buffer can be allocated.
+ */
+static int iscsi_tx_login_request ( struct iscsi_session *iscsi ) {
+	struct iscsi_bhs_login_request *bhs = &iscsi->tx_bhs.login_request;
+	size_t len = ISCSI_DATA_LEN ( bhs->lengths );
+	struct io_buffer *iobuf;
+
+	/* Obtain a buffer from the socket */
+	iobuf = xfer_alloc_iob ( &iscsi->socket, len );
+	if ( iobuf == NULL )
+		return -ENOMEM;
+
+	/* Fill it with the login strings and send it */
+	iob_put ( iobuf, len );
+	iscsi_build_login_request_strings ( iscsi, iobuf->data, len );
+	return xfer_deliver_iob ( &iscsi->socket, iobuf );
+}
+
+/**
+ * Handle iSCSI TargetAddress text value
+ *
+ * @v iscsi		iSCSI session
+ * @v value		TargetAddress value
+ * @ret rc		Return status code
+ *
+ * Replaces the session's target address and port with those named
+ * in the value, which has the form "address" or "address:port".
+ * If no port is given, ISCSI_PORT is used.  The port is stored in
+ * host byte order, since iscsi_open_connection() converts it to
+ * network byte order when it opens the socket.  On allocation
+ * failure -ENOMEM is returned and the existing address and port
+ * are left unchanged.
+ */
+static int iscsi_handle_targetaddress_value ( struct iscsi_session *iscsi,
+					      const char *value ) {
+	char *address;
+	char *separator;
+
+	DBGC ( iscsi, "iSCSI %p will redirect to %s\n", iscsi, value );
+
+	/* Replace target address.  The copy is made before the old
+	 * address is freed, so that a failure cannot leave the
+	 * session without any address.
+	 */
+	address = strdup ( value );
+	if ( ! address )
+		return -ENOMEM;
+	free ( iscsi->target_address );
+	iscsi->target_address = address;
+
+	/* Replace target port (host byte order) */
+	iscsi->target_port = ISCSI_PORT;
+	separator = strchr ( iscsi->target_address, ':' );
+	if ( separator ) {
+		*separator = '\0';
+		iscsi->target_port = strtoul ( ( separator + 1 ), NULL, 0 );
+	}
+
+	return 0;
+}
+
+/**
+ * Handle iSCSI AuthMethod text value
+ *
+ * @v iscsi		iSCSI session
+ * @v value		AuthMethod value
+ * @ret rc		Return status code (always zero)
+ *
+ * Only "CHAP" triggers any action; every other value is ignored.
+ */
+static int iscsi_handle_authmethod_value ( struct iscsi_session *iscsi,
+					   const char *value ) {
+
+	/* Nothing to do unless the server asked for CHAP */
+	if ( strcmp ( value, "CHAP" ) != 0 )
+		return 0;
+
+	/* Server requests CHAP: flag the CHAP_A string for sending */
+	DBGC ( iscsi, "iSCSI %p initiating CHAP authentication\n",
+	       iscsi );
+	iscsi->status |= ISCSI_STATUS_STRINGS_CHAP_ALGORITHM;
+	return 0;
+}
+
+/**
+ * Handle iSCSI CHAP_A text value
+ *
+ * @v iscsi		iSCSI session
+ * @v value		CHAP_A value
+ * @ret rc		Return status code
+ *
+ * Returns -EPROTO for any algorithm other than "5", otherwise the
+ * result of initialising the CHAP state for MD5.
+ */
+static int iscsi_handle_chap_a_value ( struct iscsi_session *iscsi,
+				       const char *value ) {
+	int rc;
+
+	/* "5" (i.e. MD5) is the only algorithm we ever offer, so any
+	 * other answer from the server is a protocol violation.
+	 */
+	if ( strcmp ( value, "5" ) == 0 ) {
+		/* Prepare for CHAP with MD5 */
+		rc = chap_init ( &iscsi->chap, &md5_algorithm );
+		if ( rc != 0 ) {
+			DBGC ( iscsi, "iSCSI %p could not initialise CHAP: %s\n",
+			       iscsi, strerror ( rc ) );
+		}
+	} else {
+		DBGC ( iscsi, "iSCSI %p got invalid CHAP algorithm \"%s\"\n",
+		       iscsi, value );
+		rc = -EPROTO;
+	}
+
+	return rc;
+}
+
+/**
+ * Handle iSCSI CHAP_I text value
+ *
+ * @v iscsi		iSCSI session
+ * @v value		CHAP_I value (integer, any base accepted by strtoul)
+ * @ret rc		Return status code
+ *
+ * Returns -EPROTO if the value is empty or is not entirely numeric.
+ * On success the identifier and, if one is configured, the password
+ * are fed into the CHAP state.
+ */
+static int iscsi_handle_chap_i_value ( struct iscsi_session *iscsi,
+				       const char *value ) {
+	unsigned int identifier;
+	char *endp;
+
+	/* The CHAP identifier is an integer value.  An empty string
+	 * leaves endp pointing at the terminator, which would pass a
+	 * bare "*endp" test and be accepted as identifier zero; reject
+	 * it explicitly by checking that something was consumed.
+	 */
+	identifier = strtoul ( value, &endp, 0 );
+	if ( ( endp == value ) || ( *endp != '\0' ) ) {
+		DBGC ( iscsi, "iSCSI %p saw invalid CHAP identifier \"%s\"\n",
+		       iscsi, value );
+		return -EPROTO;
+	}
+
+	/* Identifier and secret are the first two components of the
+	 * challenge.
+	 */
+	chap_set_identifier ( &iscsi->chap, identifier );
+	if ( iscsi->password ) {
+		chap_update ( &iscsi->chap, iscsi->password,
+			      strlen ( iscsi->password ) );
+	}
+
+	return 0;
+}
+
+/**
+ * Handle iSCSI CHAP_C text value
+ *
+ * @v iscsi		iSCSI session
+ * @v value		CHAP_C value ("0x" followed by hex digit pairs)
+ * @ret rc		Return status code
+ *
+ * Returns -EPROTO if the "0x" prefix is missing or if any digit pair
+ * is not valid hexadecimal.  On success the challenge bytes are fed
+ * into the CHAP state, the response is computed and the CHAP
+ * response strings are flagged for sending.
+ */
+static int iscsi_handle_chap_c_value ( struct iscsi_session *iscsi,
+				       const char *value ) {
+	char buf[3];
+	char *endp;
+	uint8_t byte;
+
+	/* Check and strip leading "0x".  This must fail hard: simply
+	 * logging and carrying on would advance "value" by two bytes
+	 * regardless, stepping past the terminator of an empty or
+	 * one-character string.
+	 */
+	if ( ( value[0] != '0' ) || ( value[1] != 'x' ) ) {
+		DBGC ( iscsi, "iSCSI %p saw invalid CHAP challenge \"%s\"\n",
+		       iscsi, value );
+		return -EPROTO;
+	}
+	value += 2;
+
+	/* Process challenge an octet at a time */
+	for ( ; ( value[0] && value[1] ) ; value += 2 ) {
+		/* Isolate one pair of hex digits as a NUL-terminated string */
+		memcpy ( buf, value, 2 );
+		buf[2] = 0;
+		byte = strtoul ( buf, &endp, 16 );
+		if ( *endp != '\0' ) {
+			DBGC ( iscsi, "iSCSI %p saw invalid CHAP challenge "
+			       "byte \"%s\"\n", iscsi, buf );
+			return -EPROTO;
+		}
+		chap_update ( &iscsi->chap, &byte, sizeof ( byte ) );
+	}
+
+	/* Build CHAP response */
+	DBGC ( iscsi, "iSCSI %p sending CHAP response\n", iscsi );
+	chap_respond ( &iscsi->chap );
+	iscsi->status |= ISCSI_STATUS_STRINGS_CHAP_RESPONSE;
+
+	return 0;
+}
+
+/** An iSCSI text string that we want to handle
+ *
+ * Entries of this type form the iscsi_string_types[] table, which is
+ * searched by iscsi_handle_string() and ends at the first NULL key.
+ */
+struct iscsi_string_type {
+ /** String key
+ *
+ * This is the portion up to and including the "=" sign,
+ * e.g. "InitiatorName=", "CHAP_A=", etc.  It is matched as a
+ * prefix of the received string.
+ */
+ const char *key;
+ /** Handle iSCSI string value
+ *
+ * @v iscsi iSCSI session
+ * @v value iSCSI string value (the text following the key)
+ * @ret rc Return status code
+ */
+ int ( * handle ) ( struct iscsi_session *iscsi, const char *value );
+};
+
+/** Table of received iSCSI text keys and the handlers for their values
+ *
+ * The table is terminated by an entry whose key is NULL.
+ */
+static struct iscsi_string_type iscsi_string_types[] = {
+	{ .key = "TargetAddress=",
+	  .handle = iscsi_handle_targetaddress_value },
+	{ .key = "AuthMethod=",
+	  .handle = iscsi_handle_authmethod_value },
+	{ .key = "CHAP_A=",
+	  .handle = iscsi_handle_chap_a_value },
+	{ .key = "CHAP_I=",
+	  .handle = iscsi_handle_chap_i_value },
+	{ .key = "CHAP_C=",
+	  .handle = iscsi_handle_chap_c_value },
+	{ .key = NULL, .handle = NULL }
+};
+
+/**
+ * Handle a single iSCSI text string
+ *
+ * @v iscsi		iSCSI session
+ * @v string		iSCSI string (in "key=value" format)
+ * @ret rc		Return status code
+ *
+ * The string is dispatched to the first iscsi_string_types[] entry
+ * whose key is a prefix of it.  Strings with no matching entry are
+ * ignored and reported as success.
+ */
+static int iscsi_handle_string ( struct iscsi_session *iscsi,
+				 const char *string ) {
+	struct iscsi_string_type *entry = iscsi_string_types;
+	size_t prefix_len = 0;
+	int rc;
+
+	/* Locate the first entry whose key prefixes the string */
+	while ( entry->key ) {
+		prefix_len = strlen ( entry->key );
+		if ( strncmp ( string, entry->key, prefix_len ) == 0 )
+			break;
+		entry++;
+	}
+
+	/* Reached the terminator: we do not care about this key */
+	if ( ! entry->key ) {
+		DBGC ( iscsi, "iSCSI %p ignoring %s\n", iscsi, string );
+		return 0;
+	}
+
+	/* Hand everything after the key to the handler */
+	DBGC ( iscsi, "iSCSI %p handling %s\n", iscsi, string );
+	rc = entry->handle ( iscsi, ( string + prefix_len ) );
+	if ( rc != 0 ) {
+		DBGC ( iscsi, "iSCSI %p could not handle %s: %s\n",
+		       iscsi, string, strerror ( rc ) );
+		return rc;
+	}
+	return 0;
+}
+
+/**
+ * Handle a buffer of iSCSI text strings
+ *
+ * @v iscsi		iSCSI session
+ * @v strings		iSCSI string buffer (NUL-separated strings)
+ * @v len		Length of string buffer
+ * @ret rc		Return status code
+ *
+ * Processing stops at the first handler error, or silently at the
+ * first string that is not NUL-terminated within the buffer.
+ */
+static int iscsi_handle_strings ( struct iscsi_session *iscsi,
+				  const char *strings, size_t len ) {
+	size_t consumed;
+	int rc;
+
+	/* Walk the buffer one string at a time.  strnlen() bounds the
+	 * scan so that badly-terminated data cannot cause an overrun.
+	 */
+	for ( ; ; strings += consumed, len -= consumed ) {
+		/* Length of this string including its terminator */
+		consumed = ( strnlen ( strings, len ) + 1 );
+		if ( consumed > len )
+			return 0;
+		rc = iscsi_handle_string ( iscsi, strings );
+		if ( rc != 0 )
+			return rc;
+	}
+}
+
+/**
+ * Receive PDU data into buffer
+ *
+ * @v iscsi		iSCSI session
+ * @v data		Data to receive
+ * @v len		Length of data
+ * @ret rc		Return status code
+ *
+ * This can be used when the RX PDU type handler wishes to buffer up
+ * all received data and process the PDU as a single unit.  The
+ * buffer is sized from iscsi::rx_len and the fragment is stored at
+ * iscsi::rx_offset; this function does not advance the offset
+ * itself.  The caller is responsible for calling
+ * iscsi_rx_buffered_data_done() after processing the data.
+ */
+static int iscsi_rx_buffered_data ( struct iscsi_session *iscsi,
+				    const void *data, size_t len ) {
+
+	/* First fragment: allocate room for the whole segment */
+	if ( iscsi->rx_buffer == NULL ) {
+		iscsi->rx_buffer = malloc ( iscsi->rx_len );
+		if ( iscsi->rx_buffer == NULL )
+			return -ENOMEM;
+	}
+
+	/* Store this fragment at the current receive offset */
+	assert ( ( iscsi->rx_offset + len ) <= iscsi->rx_len );
+	memcpy ( ( iscsi->rx_buffer + iscsi->rx_offset ), data, len );
+
+	return 0;
+}
+
+/**
+ * Receive data segment of an iSCSI login response PDU
+ *
+ * @v iscsi		iSCSI session
+ * @v data		Received data
+ * @v len		Length of received data
+ * @v remaining		Data remaining after this data
+ * @ret rc		Return status code
+ *
+ * The data segment is buffered until it is complete and is then
+ * processed as a whole: the text strings are handled, a redirect or
+ * failure status is acted upon, any phase transition is applied, and
+ * either the next login request or the pending SCSI command is
+ * started.
+ */
+static int iscsi_rx_login_response ( struct iscsi_session *iscsi,
+				     const void *data, size_t len,
+				     size_t remaining ) {
+	struct iscsi_bhs_login_response *response
+		= &iscsi->rx_bhs.login_response;
+	int rc;
+
+	/* Buffer up the PDU data */
+	if ( ( rc = iscsi_rx_buffered_data ( iscsi, data, len ) ) != 0 ) {
+		DBGC ( iscsi, "iSCSI %p could not buffer login response: %s\n",
+		       iscsi, strerror ( rc ) );
+		return rc;
+	}
+	if ( remaining )
+		return 0;
+
+	/* Process string data and discard string buffer */
+	if ( ( rc = iscsi_handle_strings ( iscsi, iscsi->rx_buffer,
+					   iscsi->rx_len ) ) != 0 )
+		return rc;
+	iscsi_rx_buffered_data_done ( iscsi );
+
+	/* Check for login redirection */
+	if ( response->status_class == ISCSI_STATUS_REDIRECT ) {
+		DBGC ( iscsi, "iSCSI %p redirecting to new server\n", iscsi );
+		iscsi_close_connection ( iscsi, 0 );
+		if ( ( rc = iscsi_open_connection ( iscsi ) ) != 0 ) {
+			DBGC ( iscsi, "iSCSI %p could not redirect: %s\n ",
+			       iscsi, strerror ( rc ) );
+			return rc;
+		}
+		return 0;
+	}
+
+	/* Check for fatal errors */
+	if ( response->status_class != 0 ) {
+		DBGC ( iscsi, "iSCSI login failure: class %02x detail %02x\n",
+		       response->status_class, response->status_detail );
+		/* Remember the failure so later commands fail at once */
+		iscsi->instant_rc = -EPERM;
+		return -EPERM;
+	}
+
+	/* Handle login transitions */
+	if ( response->flags & ISCSI_LOGIN_FLAG_TRANSITION ) {
+		switch ( response->flags & ISCSI_LOGIN_NSG_MASK ) {
+		case ISCSI_LOGIN_NSG_OPERATIONAL_NEGOTIATION:
+			iscsi->status =
+				( ISCSI_STATUS_OPERATIONAL_NEGOTIATION_PHASE |
+				  ISCSI_STATUS_STRINGS_OPERATIONAL );
+			break;
+		case ISCSI_LOGIN_NSG_FULL_FEATURE_PHASE:
+			iscsi->status = ISCSI_STATUS_FULL_FEATURE_PHASE;
+			break;
+		default:
+			DBGC ( iscsi, "iSCSI %p got invalid response flags "
+			       "%02x\n", iscsi, response->flags );
+			return -EIO;
+		}
+	}
+
+	/* Send next login request PDU if we haven't reached the full
+	 * feature phase yet.
+	 */
+	if ( ( iscsi->status & ISCSI_STATUS_PHASE_MASK ) !=
+	     ISCSI_STATUS_FULL_FEATURE_PHASE ) {
+		iscsi_start_login ( iscsi );
+		return 0;
+	}
+
+	/* Reset retry count */
+	iscsi->retry_count = 0;
+
+	/* Record TSIH for future reference.  The login request sends
+	 * iscsi->tsih with htons(), so the matching conversion here is
+	 * ntohs().  Using ntohl() on a 16-bit value moves it into the
+	 * upper half of the result on little-endian hosts, which
+	 * truncates to zero when stored back.
+	 * NOTE(review): assumes response->tsih is declared as a
+	 * 16-bit field like its request counterpart - confirm against
+	 * the BHS structure definition.
+	 */
+	iscsi->tsih = ntohs ( response->tsih );
+
+	/* Send the actual SCSI command */
+	iscsi_start_command ( iscsi );
+
+	return 0;
+}
+
+/****************************************************************************
+ *
+ * iSCSI to socket interface
+ *
+ */
+
+/**
+ * Start up a new TX PDU
+ *
+ * @v iscsi iSCSI session
+ *
+ * This initiates the process of sending a new PDU. Only one PDU may
+ * be in transit at any one time, so the TX state machine must be
+ * idle on entry. The TX BHS is zeroed here, and the state is moved
+ * to ISCSI_TX_BHS so that iscsi_tx_step() begins with the header.
+ */
+static void iscsi_start_tx ( struct iscsi_session *iscsi ) {
+ assert ( iscsi->tx_state == ISCSI_TX_IDLE );
+
+ /* Initialise TX BHS */
+ memset ( &iscsi->tx_bhs, 0, sizeof ( iscsi->tx_bhs ) );
+
+ /* Flag TX engine to start transmitting */
+ iscsi->tx_state = ISCSI_TX_BHS;
+}
+
+/**
+ * Transmit nothing
+ *
+ * @v iscsi iSCSI session (unused)
+ * @ret rc Return status code (always zero)
+ *
+ * Placeholder transmit method; iscsi_tx_step() uses it for the AHS
+ * state, for which it sends no data.
+ */
+static int iscsi_tx_nothing ( struct iscsi_session *iscsi __unused ) {
+ return 0;
+}
+
+/**
+ * Send the basic header segment of the current TX PDU
+ *
+ * @v iscsi		iSCSI session
+ * @ret rc		Return status code
+ *
+ * The whole of iscsi::tx_bhs is delivered to the socket in one call.
+ */
+static int iscsi_tx_bhs ( struct iscsi_session *iscsi ) {
+	const void *bhs = &iscsi->tx_bhs;
+	size_t bhs_len = sizeof ( iscsi->tx_bhs );
+
+	return xfer_deliver_raw ( &iscsi->socket, bhs, bhs_len );
+}
+
+/**
+ * Send the data segment of the current TX PDU
+ *
+ * @v iscsi		iSCSI session
+ * @ret rc		Return status code
+ *
+ * Dispatches on the opcode held in iscsi::tx_bhs, which is valid
+ * when this is called.  Only data-out and login request PDUs have a
+ * data segment to send; any other opcode sends nothing.
+ */
+static int iscsi_tx_data ( struct iscsi_session *iscsi ) {
+	struct iscsi_bhs_common *bhs = &iscsi->tx_bhs.common;
+	unsigned int opcode = ( bhs->opcode & ISCSI_OPCODE_MASK );
+
+	if ( opcode == ISCSI_OPCODE_DATA_OUT )
+		return iscsi_tx_data_out ( iscsi );
+	if ( opcode == ISCSI_OPCODE_LOGIN_REQUEST )
+		return iscsi_tx_login_request ( iscsi );
+
+	/* Nothing to send in other states */
+	return 0;
+}
+
+/**
+ * Send the padding that follows the data segment of the TX PDU
+ *
+ * @v iscsi		iSCSI session
+ * @ret rc		Return status code
+ *
+ * The pad length is derived from the lengths field of
+ * iscsi::tx_bhs, which is valid when this is called.  Nothing is
+ * sent when no padding is needed.
+ */
+static int iscsi_tx_data_padding ( struct iscsi_session *iscsi ) {
+	static const char zeroes[3] = { '\0', '\0', '\0' };
+	struct iscsi_bhs_common *bhs = &iscsi->tx_bhs.common;
+	size_t pad_len = ISCSI_DATA_PAD_LEN ( bhs->lengths );
+
+	if ( pad_len != 0 )
+		return xfer_deliver_raw ( &iscsi->socket, zeroes, pad_len );
+
+	return 0;
+}
+
+/**
+ * Complete iSCSI PDU transmission
+ *
+ * @v iscsi		iSCSI session
+ *
+ * Called when a PDU has been completely transmitted and the TX state
+ * machine is about to enter the idle state.  iscsi::tx_bhs will be
+ * valid for the just-completed PDU when this is called.  Exactly one
+ * completion handler runs, selected by the PDU opcode.
+ */
+static void iscsi_tx_done ( struct iscsi_session *iscsi ) {
+	struct iscsi_bhs_common *common = &iscsi->tx_bhs.common;
+
+	switch ( common->opcode & ISCSI_OPCODE_MASK ) {
+	case ISCSI_OPCODE_DATA_OUT:
+		iscsi_data_out_done ( iscsi );
+		/* Must not fall through: a completed data-out PDU
+		 * must not run the login completion handler.
+		 */
+		break;
+	case ISCSI_OPCODE_LOGIN_REQUEST:
+		iscsi_login_request_done ( iscsi );
+		break;
+	default:
+		/* No action */
+		break;
+	}
+}
+
+/**
+ * Transmit iSCSI PDU
+ *
+ * @v process TX process (embedded in the iSCSI session)
+ *
+ * Runs the TX state machine: for each state it selects the transmit
+ * method, the length that method will send and the following state,
+ * then transmits and advances. It returns when the machine is idle,
+ * when data must be sent but the socket has no window, or when a
+ * transmit method fails.
+ */
+static void iscsi_tx_step ( struct process *process ) {
+ struct iscsi_session *iscsi =
+ container_of ( process, struct iscsi_session, process );
+ struct iscsi_bhs_common *common = &iscsi->tx_bhs.common;
+ int ( * tx ) ( struct iscsi_session *iscsi );
+ enum iscsi_tx_state next_state;
+ size_t tx_len;
+ int rc;
+
+ /* Select fragment to transmit */
+ while ( 1 ) {
+ switch ( iscsi->tx_state ) {
+ case ISCSI_TX_IDLE:
+ /* Stop processing */
+ return;
+ case ISCSI_TX_BHS:
+ tx = iscsi_tx_bhs;
+ tx_len = sizeof ( iscsi->tx_bhs );
+ next_state = ISCSI_TX_AHS;
+ break;
+ case ISCSI_TX_AHS:
+ /* No AHS is ever sent; this state is a no-op */
+ tx = iscsi_tx_nothing;
+ tx_len = 0;
+ next_state = ISCSI_TX_DATA;
+ break;
+ case ISCSI_TX_DATA:
+ tx = iscsi_tx_data;
+ tx_len = ISCSI_DATA_LEN ( common->lengths );
+ next_state = ISCSI_TX_DATA_PADDING;
+ break;
+ case ISCSI_TX_DATA_PADDING:
+ tx = iscsi_tx_data_padding;
+ tx_len = ISCSI_DATA_PAD_LEN ( common->lengths );
+ next_state = ISCSI_TX_IDLE;
+ break;
+ default:
+ assert ( 0 );
+ return;
+ }
+
+ /* Check for window availability, if needed */
+ if ( tx_len && ( xfer_window ( &iscsi->socket ) == 0 ) ) {
+ /* Cannot transmit at this point; stop processing */
+ return;
+ }
+
+ /* Transmit data. On failure the state is left unchanged,
+ * so the same fragment is attempted again on the next call.
+ */
+ if ( ( rc = tx ( iscsi ) ) != 0 ) {
+ DBGC ( iscsi, "iSCSI %p could not transmit: %s\n",
+ iscsi, strerror ( rc ) );
+ return;
+ }
+
+ /* Move to next state */
+ iscsi->tx_state = next_state;
+ if ( next_state == ISCSI_TX_IDLE )
+ iscsi_tx_done ( iscsi );
+ }
+}
+
+/**
+ * Receive basic header segment of an iSCSI PDU
+ *
+ * @v iscsi		iSCSI session
+ * @v data		Received data
+ * @v len		Length of received data
+ * @v remaining		Data remaining after this data (unused)
+ * @ret rc		Return status code (always zero)
+ *
+ * Copies the fragment into iscsi::rx_bhs at the current receive
+ * offset.  Once the header is complete, its opcode and data length
+ * are reported via the debug log.
+ */
+static int iscsi_rx_bhs ( struct iscsi_session *iscsi, const void *data,
+			  size_t len, size_t remaining __unused ) {
+
+	/* Store this fragment of the header */
+	memcpy ( &iscsi->rx_bhs.bytes[iscsi->rx_offset], data, len );
+
+	/* Header still incomplete: nothing to report yet */
+	if ( ( iscsi->rx_offset + len ) < sizeof ( iscsi->rx_bhs ) )
+		return 0;
+
+	DBGC ( iscsi, "iSCSI %p received PDU opcode %#x len %#lx\n",
+	       iscsi, iscsi->rx_bhs.common.opcode,
+	       ISCSI_DATA_LEN ( iscsi->rx_bhs.common.lengths ) );
+	return 0;
+}
+
+/**
+ * Discard portion of an iSCSI PDU.
+ *
+ * @v iscsi iSCSI session (unused)
+ * @v data Received data (unused)
+ * @v len Length of received data (unused)
+ * @v remaining Data remaining after this data (unused)
+ * @ret rc Return status code (always zero)
+ *
+ * This discards data from a portion of a received PDU. It is the
+ * receive method selected by iscsi_socket_deliver_raw() for the AHS
+ * and for the data padding.
+ */
+static int iscsi_rx_discard ( struct iscsi_session *iscsi __unused,
+ const void *data __unused, size_t len __unused,
+ size_t remaining __unused ) {
+ /* Do nothing */
+ return 0;
+}
+
+/**
+ * Receive data segment of an iSCSI PDU
+ *
+ * @v iscsi		iSCSI session
+ * @v data		Received data
+ * @v len		Length of received data
+ * @v remaining		Data remaining after this data
+ * @ret rc		Return status code
+ *
+ * Handles one fragment of a PDU data segment; iscsi::rx_bhs is valid
+ * when this is called.  The command and status sequence numbers are
+ * refreshed from the header on every call, after which the fragment
+ * is passed to the handler for the PDU's opcode.  An unrecognised
+ * opcode yields -EOPNOTSUPP once its final fragment has arrived.
+ */
+static int iscsi_rx_data ( struct iscsi_session *iscsi, const void *data,
+			   size_t len, size_t remaining ) {
+	struct iscsi_bhs_common_response *bhs
+		= &iscsi->rx_bhs.common_response;
+	unsigned int opcode = ( bhs->opcode & ISCSI_OPCODE_MASK );
+
+	/* Update cmdsn and statsn */
+	iscsi->cmdsn = ntohl ( bhs->expcmdsn );
+	iscsi->statsn = ntohl ( bhs->statsn );
+
+	/* Dispatch to the opcode-specific handler */
+	if ( opcode == ISCSI_OPCODE_LOGIN_RESPONSE )
+		return iscsi_rx_login_response ( iscsi, data, len, remaining );
+	if ( opcode == ISCSI_OPCODE_SCSI_RESPONSE )
+		return iscsi_rx_scsi_response ( iscsi, data, len, remaining );
+	if ( opcode == ISCSI_OPCODE_DATA_IN )
+		return iscsi_rx_data_in ( iscsi, data, len, remaining );
+	if ( opcode == ISCSI_OPCODE_R2T )
+		return iscsi_rx_r2t ( iscsi, data, len, remaining );
+
+	/* Unknown opcode: swallow fragments until the last one */
+	if ( remaining )
+		return 0;
+	DBGC ( iscsi, "iSCSI %p unknown opcode %02x\n", iscsi,
+	       bhs->opcode );
+	return -EOPNOTSUPP;
+}
+
+/**
+ * Receive new data
+ *
+ * @v socket Transport layer interface
+ * @v data Received data
+ * @v len Length of received data
+ * @ret rc Return status code
+ *
+ * This handles received PDUs. The receive strategy is to fill in
+ * iscsi::rx_bhs with the contents of the BHS portion of the PDU,
+ * throw away any AHS portion, and then process each part of the data
+ * portion as it arrives. The data processing routine therefore
+ * always has a full copy of the BHS available, even for portions of
+ * the data in different packets to the BHS.
+ *
+ * If a receive method fails, the connection is closed and the
+ * pending SCSI command is completed with the error.
+ */
+static int iscsi_socket_deliver_raw ( struct xfer_interface *socket,
+ const void *data, size_t len ) {
+ struct iscsi_session *iscsi =
+ container_of ( socket, struct iscsi_session, socket );
+ struct iscsi_bhs_common *common = &iscsi->rx_bhs.common;
+ int ( * rx ) ( struct iscsi_session *iscsi, const void *data,
+ size_t len, size_t remaining );
+ enum iscsi_rx_state next_state;
+ size_t frag_len;
+ size_t remaining;
+ int rc;
+
+ while ( 1 ) {
+ /* Select the receive method, the total length of the
+ * current PDU portion and the state that follows it.
+ */
+ switch ( iscsi->rx_state ) {
+ case ISCSI_RX_BHS:
+ rx = iscsi_rx_bhs;
+ iscsi->rx_len = sizeof ( iscsi->rx_bhs );
+ next_state = ISCSI_RX_AHS;
+ break;
+ case ISCSI_RX_AHS:
+ rx = iscsi_rx_discard;
+ iscsi->rx_len = 4 * ISCSI_AHS_LEN ( common->lengths );
+ next_state = ISCSI_RX_DATA;
+ break;
+ case ISCSI_RX_DATA:
+ rx = iscsi_rx_data;
+ iscsi->rx_len = ISCSI_DATA_LEN ( common->lengths );
+ next_state = ISCSI_RX_DATA_PADDING;
+ break;
+ case ISCSI_RX_DATA_PADDING:
+ rx = iscsi_rx_discard;
+ iscsi->rx_len = ISCSI_DATA_PAD_LEN ( common->lengths );
+ next_state = ISCSI_RX_BHS;
+ break;
+ default:
+ assert ( 0 );
+ return -EINVAL;
+ }
+
+ /* Take as much of this portion as the delivered data
+ * contains; "remaining" is what is still outstanding for
+ * the portion after this fragment.
+ */
+ frag_len = iscsi->rx_len - iscsi->rx_offset;
+ if ( frag_len > len )
+ frag_len = len;
+ remaining = iscsi->rx_len - iscsi->rx_offset - frag_len;
+ if ( ( rc = rx ( iscsi, data, frag_len, remaining ) ) != 0 ) {
+ DBGC ( iscsi, "iSCSI %p could not process received "
+ "data: %s\n", iscsi, strerror ( rc ) );
+ iscsi_close_connection ( iscsi, rc );
+ iscsi_scsi_done ( iscsi, rc );
+ return rc;
+ }
+
+ /* Consume the fragment from the delivered data */
+ iscsi->rx_offset += frag_len;
+ data += frag_len;
+ len -= frag_len;
+
+ /* If all the data for this state has not yet been
+ * received, stay in this state for now.
+ */
+ if ( iscsi->rx_offset != iscsi->rx_len )
+ return 0;
+
+ /* Portion complete (a zero-length portion completes
+ * immediately); start the next one from offset zero.
+ */
+ iscsi->rx_state = next_state;
+ iscsi->rx_offset = 0;
+ }
+
+ return 0;
+}
+
+/**
+ * Handle stream connection closure
+ *
+ * @v socket		Transport layer interface
+ * @v rc		Reason for close
+ *
+ * Closes the connection, then either reopens it (while within the
+ * retry limit) or records a permanent failure and completes the
+ * pending SCSI command with the error.
+ */
+static void iscsi_socket_close ( struct xfer_interface *socket, int rc ) {
+	struct iscsi_session *iscsi =
+		container_of ( socket, struct iscsi_session, socket );
+
+	/* Even a graceful close counts as an error for iSCSI */
+	if ( rc == 0 )
+		rc = -ECONNRESET;
+
+	/* Close session cleanly */
+	iscsi_close_connection ( iscsi, rc );
+
+	/* Give up once the retry limit has been exceeded */
+	if ( ++iscsi->retry_count > ISCSI_MAX_RETRIES ) {
+		DBGC ( iscsi, "iSCSI %p retry count exceeded\n", iscsi );
+		iscsi->instant_rc = rc;
+		iscsi_scsi_done ( iscsi, rc );
+		return;
+	}
+
+	/* Otherwise try to reconnect */
+	DBGC ( iscsi, "iSCSI %p retrying connection (retry #%d)\n",
+	       iscsi, iscsi->retry_count );
+	rc = iscsi_open_connection ( iscsi );
+	if ( rc != 0 ) {
+		DBGC ( iscsi, "iSCSI %p could not reconnect: %s\n",
+		       iscsi, strerror ( rc ) );
+		iscsi_scsi_done ( iscsi, rc );
+	}
+}
+
+/**
+ * Handle redirection event
+ *
+ * @v socket Transport layer interface
+ * @v type Location type
+ * @v args Remaining arguments depend upon location type
+ * @ret rc Return status code
+ *
+ * For LOCATION_SOCKET redirects the peer address is copied into
+ * iscsi::target_sockaddr; in every case the redirect is then passed
+ * on unchanged to xfer_vopen().
+ */
+static int iscsi_vredirect ( struct xfer_interface *socket, int type,
+ va_list args ) {
+ struct iscsi_session *iscsi =
+ container_of ( socket, struct iscsi_session, socket );
+ va_list tmp;
+ struct sockaddr *peer;
+
+ /* Intercept redirects to a LOCATION_SOCKET and record the IP
+ * address for the iBFT. This is a bit of a hack, but avoids
+ * inventing an ioctl()-style call to retrieve the socket
+ * address from a data-xfer interface.
+ */
+ if ( type == LOCATION_SOCKET ) {
+ /* Read from a copy so that "args" is left untouched for
+ * the xfer_vopen() call below.
+ */
+ va_copy ( tmp, args );
+ ( void ) va_arg ( tmp, int ); /* Discard "semantics" */
+ peer = va_arg ( tmp, struct sockaddr * );
+ memcpy ( &iscsi->target_sockaddr, peer,
+ sizeof ( iscsi->target_sockaddr ) );
+ va_end ( tmp );
+ }
+
+ return xfer_vopen ( socket, type, args );
+}
+
+
+/** iSCSI socket operations
+ *
+ * The close, vredirect and deliver_raw handlers are the iSCSI
+ * functions defined above; the remaining operations use helpers
+ * that are not defined in this part of the file.
+ */
+static struct xfer_interface_operations iscsi_socket_operations = {
+ .close = iscsi_socket_close,
+ .vredirect = iscsi_vredirect,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = xfer_deliver_as_raw,
+ .deliver_raw = iscsi_socket_deliver_raw,
+};
+
+
+/****************************************************************************
+ *
+ * iSCSI command issuing
+ *
+ */
+
+/**
+ * Issue SCSI command
+ *
+ * @v scsi		SCSI device
+ * @v command		SCSI command
+ * @ret rc		Return status code
+ *
+ * Fails at once if a permanent failure has been recorded.
+ * Otherwise the command is started (or the connection opened, if
+ * the session has no status yet) and this function polls step()
+ * until iscsi::rc is no longer -EINPROGRESS.
+ */
+static int iscsi_command ( struct scsi_device *scsi,
+			   struct scsi_command *command ) {
+	struct iscsi_session *iscsi =
+		container_of ( scsi->backend, struct iscsi_session, refcnt );
+	int rc;
+
+	/* Record SCSI command */
+	iscsi->command = command;
+
+	/* A recorded permanent failure aborts the command immediately */
+	rc = iscsi->instant_rc;
+
+	/* Issue command or open connection as appropriate */
+	if ( rc == 0 ) {
+		if ( iscsi->status ) {
+			iscsi_start_command ( iscsi );
+		} else {
+			rc = iscsi_open_connection ( iscsi );
+		}
+	}
+
+	/* Wait for command to complete */
+	if ( rc == 0 ) {
+		iscsi->rc = -EINPROGRESS;
+		while ( iscsi->rc == -EINPROGRESS )
+			step();
+		rc = iscsi->rc;
+	}
+
+	/* The command is no longer outstanding, whatever the outcome */
+	iscsi->command = NULL;
+	return rc;
+}
+
+/**
+ * Issue SCSI command to a detached iSCSI device
+ *
+ * @v scsi SCSI device (unused)
+ * @v command SCSI command (unused)
+ * @ret rc Return status code (always -ENODEV)
+ *
+ * Installed as the command method by iscsi_detach().
+ */
+static int iscsi_detached_command ( struct scsi_device *scsi __unused,
+ struct scsi_command *command __unused ) {
+ return -ENODEV;
+}
+
+/**
+ * Shut down iSCSI interface
+ *
+ * @v scsi SCSI device
+ *
+ * Closes the connection, removes the TX process and drops the SCSI
+ * device's reference to the session. Afterwards any command issued
+ * on the device fails via iscsi_detached_command().
+ */
+void iscsi_detach ( struct scsi_device *scsi ) {
+ struct iscsi_session *iscsi =
+ container_of ( scsi->backend, struct iscsi_session, refcnt );
+
+ /* Nullify the socket before closing the connection */
+ xfer_nullify ( &iscsi->socket );
+ iscsi_close_connection ( iscsi, 0 );
+ process_del ( &iscsi->process );
+ scsi->command = iscsi_detached_command;
+ /* Drop the reference taken by iscsi_attach() */
+ ref_put ( scsi->backend );
+ scsi->backend = NULL;
+}
+
+/****************************************************************************
+ *
+ * Instantiator
+ *
+ */
+
+/** iSCSI root path components (as per RFC4173)
+ *
+ * The values index the array of ':'-separated fields produced by
+ * iscsi_parse_root_path(), in the order in which the fields appear
+ * in the root path. NUM_RP_COMPONENTS is the number of fields.
+ */
+enum iscsi_root_path_component {
+ /* First field; not examined by iscsi_parse_root_path() */
+ RP_LITERAL = 0,
+ RP_SERVERNAME,
+ /* Not examined by iscsi_parse_root_path() */
+ RP_PROTOCOL,
+ RP_PORT,
+ RP_LUN,
+ RP_TARGETNAME,
+ NUM_RP_COMPONENTS
+};
+
+/**
+ * Parse iSCSI LUN
+ *
+ * @v iscsi		iSCSI session
+ * @v lun_string	LUN string representation (as per RFC4173)
+ * @ret rc		Return status code
+ *
+ * Accepts up to four hexadecimal groups separated by "-", e.g.
+ * "0001-0000-0000-0000" or just "1".  Groups that are not supplied
+ * are taken as zero.  An empty string leaves iscsi::lun untouched
+ * (i.e. LUN 0 for a freshly zeroed session).  Returns -EINVAL on any
+ * unexpected character.
+ */
+static int iscsi_parse_lun ( struct iscsi_session *iscsi,
+			     const char *lun_string ) {
+	char *p = ( char * ) lun_string;
+	union {
+		uint64_t u64;
+		uint16_t u16[4];
+	} lun;
+	int i;
+
+	/* Empty LUN; assume LUN 0 */
+	if ( ! *lun_string )
+		return 0;
+
+	/* Groups omitted from the string default to zero */
+	memset ( &lun, 0, sizeof ( lun ) );
+
+	/* NOTE(review): each group is stored in host byte order, as
+	 * before.  If iscsi::lun is copied verbatim into PDUs this
+	 * may need htons() - confirm against the users of iscsi::lun.
+	 */
+	for ( i = 0 ; i < 4 ; i++ ) {
+		lun.u16[i] = strtoul ( p, &p, 16 );
+		/* The string may end after any group; a "-" is only
+		 * required between groups, not after the last one.
+		 */
+		if ( *p == '\0' )
+			break;
+		if ( *p != '-' )
+			return -EINVAL;
+		p++;
+	}
+	/* Anything left over after four groups is an error */
+	if ( *p )
+		return -EINVAL;
+
+	iscsi->lun = lun.u64;
+	return 0;
+}
+
+/**
+ * Parse iSCSI root path
+ *
+ * @v iscsi		iSCSI session
+ * @v root_path		iSCSI root path (as per RFC4173)
+ * @ret rc		Return status code
+ *
+ * Splits the root path at ':' into NUM_RP_COMPONENTS fields (the
+ * last field takes the remainder of the string, colons included)
+ * and fills in the session's target address, port, LUN and IQN.
+ * Returns -EINVAL if there are too few fields.
+ */
+static int iscsi_parse_root_path ( struct iscsi_session *iscsi,
+				   const char *root_path ) {
+	char rp_copy[ strlen ( root_path ) + 1 ];
+	char *rp_comp[NUM_RP_COMPONENTS];
+	char *cursor = rp_copy;
+	int idx;
+	int rc;
+
+	/* Work on a private copy so separators can be overwritten */
+	strcpy ( rp_copy, root_path );
+
+	/* Every field except the last is terminated by a ':' */
+	for ( idx = 0 ; idx < ( NUM_RP_COMPONENTS - 1 ) ; idx++ ) {
+		rp_comp[idx] = cursor;
+		while ( *cursor != ':' ) {
+			if ( ! *cursor ) {
+				DBGC ( iscsi, "iSCSI %p root path \"%s\" "
+				       "too short\n", iscsi, root_path );
+				return -EINVAL;
+			}
+			cursor++;
+		}
+		*(cursor++) = '\0';
+	}
+
+	/* The final field is whatever is left */
+	rp_comp[idx] = cursor;
+
+	/* Use root path components to configure iSCSI session */
+	iscsi->target_address = strdup ( rp_comp[RP_SERVERNAME] );
+	if ( iscsi->target_address == NULL )
+		return -ENOMEM;
+
+	/* A missing or zero port selects the default iSCSI port */
+	iscsi->target_port = strtoul ( rp_comp[RP_PORT], NULL, 10 );
+	if ( iscsi->target_port == 0 )
+		iscsi->target_port = ISCSI_PORT;
+
+	rc = iscsi_parse_lun ( iscsi, rp_comp[RP_LUN] );
+	if ( rc != 0 ) {
+		DBGC ( iscsi, "iSCSI %p invalid LUN \"%s\"\n",
+		       iscsi, rp_comp[RP_LUN] );
+		return rc;
+	}
+
+	iscsi->target_iqn = strdup ( rp_comp[RP_TARGETNAME] );
+	if ( iscsi->target_iqn == NULL )
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * Set iSCSI authentication details
+ *
+ * @v iscsi		iSCSI session
+ * @v username		Username, if any
+ * @v password		Password, if any
+ * @ret rc		Return status code
+ *
+ * Each non-NULL credential is duplicated into the session.  Returns
+ * -ENOMEM if a duplication fails.
+ */
+static int iscsi_set_auth ( struct iscsi_session *iscsi,
+			    const char *username, const char *password ) {
+
+	/* Take a private copy of the username, when there is one */
+	if ( username != NULL ) {
+		iscsi->username = strdup ( username );
+		if ( iscsi->username == NULL )
+			return -ENOMEM;
+	}
+
+	/* Likewise for the password */
+	if ( password != NULL ) {
+		iscsi->password = strdup ( password );
+		if ( iscsi->password == NULL )
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * Attach iSCSI interface
+ *
+ * @v scsi		SCSI device
+ * @v root_path		iSCSI root path (as per RFC4173)
+ * @ret rc		Return status code
+ *
+ * Allocates a session, configures it from the root path and the
+ * current username/password settings, and installs it as the SCSI
+ * device's backend.  On failure the session reference is dropped and
+ * the SCSI device is left untouched.
+ */
+int iscsi_attach ( struct scsi_device *scsi, const char *root_path ) {
+	struct iscsi_session *iscsi;
+	int rc;
+
+	/* Allocate and initialise structure */
+	iscsi = zalloc ( sizeof ( *iscsi ) );
+	if ( ! iscsi )
+		return -ENOMEM;
+	iscsi->refcnt.free = iscsi_free;
+	xfer_init ( &iscsi->socket, &iscsi_socket_operations, &iscsi->refcnt );
+	process_init ( &iscsi->process, iscsi_tx_step, &iscsi->refcnt );
+
+	/* Parse root path */
+	if ( ( rc = iscsi_parse_root_path ( iscsi, root_path ) ) != 0 )
+		goto err;
+	/* Set fields not specified by root path */
+	if ( ( rc = iscsi_set_auth ( iscsi, iscsi_username,
+				     iscsi_password ) ) != 0 )
+		goto err;
+
+	/* Sanity checks */
+	if ( ! iscsi->target_address ) {
+		DBGC ( iscsi, "iSCSI %p does not yet support discovery\n",
+		       iscsi );
+		rc = -ENOTSUP;
+		goto err;
+	}
+	if ( ! iscsi->target_iqn ) {
+		/* This check is for the IQN, so say so: the message
+		 * previously blamed the target address.
+		 */
+		DBGC ( iscsi, "iSCSI %p no target IQN supplied in %s\n",
+		       iscsi, root_path );
+		rc = -EINVAL;
+		goto err;
+	}
+
+	/* Attach parent interface, mortalise self, and return */
+	scsi->backend = ref_get ( &iscsi->refcnt );
+	scsi->command = iscsi_command;
+	scsi->lun = iscsi->lun;
+	ref_put ( &iscsi->refcnt );
+	return 0;
+
+ err:
+	ref_put ( &iscsi->refcnt );
+	return rc;
+}
+
+/****************************************************************************
+ *
+ * Settings
+ *
+ */
+
+/** iSCSI initiator IQN setting
+ *
+ * String setting named "initiator-iqn", tagged with
+ * DHCP_ISCSI_INITIATOR_IQN.
+ */
+struct setting initiator_iqn_setting __setting = {
+ .name = "initiator-iqn",
+ .description = "iSCSI initiator name",
+ .tag = DHCP_ISCSI_INITIATOR_IQN,
+ .type = &setting_type_string,
+};
+
+/** An iSCSI string setting
+ *
+ * Ties a setting to a string variable that
+ * apply_iscsi_string_setting() rebuilds from the setting's value.
+ */
+struct iscsi_string_setting {
+ /** Setting */
+ struct setting *setting;
+ /** String to update (freed and reallocated on each apply) */
+ char **string;
+ /** String prefix (prepended to the setting's value) */
+ const char *prefix;
+};
+
+/** iSCSI string settings
+ *
+ * Every entry is applied by apply_iscsi_settings(). The hostname
+ * entry is the only one with a non-empty prefix: it builds the
+ * default initiator IQN from the prefix followed by the hostname.
+ */
+static struct iscsi_string_setting iscsi_string_settings[] = {
+ {
+ .setting = &initiator_iqn_setting,
+ .string = &iscsi_explicit_initiator_iqn,
+ .prefix = "",
+ },
+ {
+ .setting = &username_setting,
+ .string = &iscsi_username,
+ .prefix = "",
+ },
+ {
+ .setting = &password_setting,
+ .string = &iscsi_password,
+ .prefix = "",
+ },
+ {
+ .setting = &hostname_setting,
+ .string = &iscsi_default_initiator_iqn,
+ .prefix = "iqn.2000-09.org.etherboot:",
+ },
+};
+
+/**
+ * Apply iSCSI setting
+ *
+ * @v setting		iSCSI string setting
+ * @ret rc		Return status code
+ *
+ * Frees the old string and, if the setting exists, replaces it with
+ * a newly allocated "<prefix><value>" string.  A missing setting is
+ * not an error and leaves the string as NULL.
+ */
+static int apply_iscsi_string_setting ( struct iscsi_string_setting *setting ){
+	size_t prefix_len = strlen ( setting->prefix );
+	size_t buf_len;
+	int value_len;
+	int fetched_len;
+	char *buf;
+
+	/* Free old string */
+	free ( *setting->string );
+	*setting->string = NULL;
+
+	/* Missing settings are not errors; leave strings as NULL */
+	value_len = fetch_setting_len ( NULL, setting->setting );
+	if ( value_len < 0 )
+		return 0;
+
+	/* Allocate room for prefix, value and terminating NUL */
+	buf_len = ( prefix_len + value_len + 1 );
+	buf = malloc ( buf_len );
+	*setting->string = buf;
+	if ( buf == NULL )
+		return -ENOMEM;
+
+	/* Prefix first, then the setting's value directly after it */
+	strcpy ( buf, setting->prefix );
+	fetched_len = fetch_string_setting ( NULL, setting->setting,
+					     ( buf + prefix_len ),
+					     ( buf_len - prefix_len ) );
+	assert ( fetched_len == value_len );
+
+	return 0;
+}
+
+/**
+ * Apply iSCSI settings
+ *
+ * @ret rc		Return status code
+ *
+ * Applies every entry of iscsi_string_settings[] in order, stopping
+ * at the first failure.
+ */
+static int apply_iscsi_settings ( void ) {
+	struct iscsi_string_setting *setting = iscsi_string_settings;
+	struct iscsi_string_setting *end =
+		&iscsi_string_settings[ sizeof ( iscsi_string_settings ) /
+					sizeof ( iscsi_string_settings[0] ) ];
+	int rc;
+
+	for ( ; setting < end ; setting++ ) {
+		rc = apply_iscsi_string_setting ( setting );
+		if ( rc != 0 ) {
+			DBG ( "iSCSI could not apply setting %s\n",
+			      setting->setting->name );
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/** iSCSI settings applicator
+ *
+ * Its apply method is apply_iscsi_settings().
+ */
+struct settings_applicator iscsi_settings_applicator __settings_applicator = {
+ .apply = apply_iscsi_settings,
+};
+
+/****************************************************************************
+ *
+ * Initiator name
+ *
+ */
+
+/**
+ * Get iSCSI initiator IQN
+ *
+ * @ret iqn		Initiator IQN string
+ *
+ * The explicitly configured IQN is used if there is one, then the
+ * default IQN, and finally a fixed fallback name.
+ */
+const char * iscsi_initiator_iqn ( void ) {
+	const char *iqn = iscsi_explicit_initiator_iqn;
+
+	/* No explicit IQN: fall back to the default one */
+	if ( ! iqn )
+		iqn = iscsi_default_initiator_iqn;
+
+	/* Neither is available: use the fixed placeholder */
+	if ( ! iqn )
+		iqn = "iqn.2000-09.org.etherboot:UNKNOWN";
+
+	return iqn;
+}
diff --git a/gpxe/src/net/tcpip.c b/gpxe/src/net/tcpip.c
new file mode 100644
index 00000000..1bc8d1a3
--- /dev/null
+++ b/gpxe/src/net/tcpip.c
@@ -0,0 +1,145 @@
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <byteswap.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/tables.h>
+#include <gpxe/tcpip.h>
+
+/** @file
+ *
+ * Transport-network layer interface
+ *
+ * This file contains functions and utilities for the
+ * TCP/IP transport-network layer interface
+ */
+
+/** Registered network-layer protocols that support TCP/IP
+ *
+ * Zero-length start and end markers; tcpip_tx() walks every entry
+ * lying between the two.
+ */
+static struct tcpip_net_protocol tcpip_net_protocols[0]
+ __table_start ( struct tcpip_net_protocol, tcpip_net_protocols );
+static struct tcpip_net_protocol tcpip_net_protocols_end[0]
+ __table_end ( struct tcpip_net_protocol, tcpip_net_protocols );
+
+/** Registered transport-layer protocols that support TCP/IP
+ *
+ * Zero-length start and end markers; tcpip_rx() walks every entry
+ * lying between the two.
+ */
+static struct tcpip_protocol tcpip_protocols[0]
+ __table_start ( struct tcpip_protocol, tcpip_protocols );
+static struct tcpip_protocol tcpip_protocols_end[0]
+ __table_end ( struct tcpip_protocol, tcpip_protocols );
+
+/** Process a received TCP/IP packet
+ *
+ * @v iobuf		I/O buffer
+ * @v tcpip_proto	Transport-layer protocol number
+ * @v st_src		Partially-filled source address
+ * @v st_dest		Partially-filled destination address
+ * @v pshdr_csum	Pseudo-header checksum
+ * @ret rc		Return status code
+ *
+ * The network layer passes in a transport-layer segment together with
+ * as much of the source and destination addresses as it knows (the
+ * address family and network-layer addresses); the ports and the rest
+ * of the structures should be left as zero.
+ *
+ * The segment is handed to the first registered transport-layer
+ * protocol whose protocol number matches.  If none matches, the I/O
+ * buffer is freed here.
+ */
+int tcpip_rx ( struct io_buffer *iobuf, uint8_t tcpip_proto,
+	       struct sockaddr_tcpip *st_src,
+	       struct sockaddr_tcpip *st_dest,
+	       uint16_t pshdr_csum ) {
+	struct tcpip_protocol *proto = tcpip_protocols;
+
+	/* Search the registered transport-layer protocols */
+	for ( ; proto < tcpip_protocols_end ; proto++ ) {
+		if ( proto->tcpip_proto != tcpip_proto )
+			continue;
+		DBG ( "TCP/IP received %s packet\n", proto->name );
+		return proto->rx ( iobuf, st_src, st_dest, pshdr_csum );
+	}
+
+	/* Nobody claimed the packet: discard it */
+	DBG ( "Unrecognised TCP/IP protocol %d\n", tcpip_proto );
+	free_iob ( iobuf );
+	return -EPROTONOSUPPORT;
+}
+
+/** Transmit a TCP/IP packet
+ *
+ * @v iobuf		I/O buffer
+ * @v tcpip_protocol	Transport-layer protocol
+ * @v st_dest		Destination address
+ * @v netdev		Network device to use if no route found, or NULL
+ * @v trans_csum	Transport-layer checksum to complete, or NULL
+ * @ret rc		Return status code
+ *
+ * The packet is handed to the first registered network-layer protocol
+ * whose address family matches that of the destination address.  If
+ * none matches, the I/O buffer is freed here.
+ */
+int tcpip_tx ( struct io_buffer *iobuf, struct tcpip_protocol *tcpip_protocol,
+	       struct sockaddr_tcpip *st_dest, struct net_device *netdev,
+	       uint16_t *trans_csum ) {
+	struct tcpip_net_protocol *net_proto = tcpip_net_protocols;
+
+	/* Search the registered network-layer protocols */
+	for ( ; net_proto < tcpip_net_protocols_end ; net_proto++ ) {
+		if ( net_proto->sa_family != st_dest->st_family )
+			continue;
+		DBG ( "TCP/IP sending %s packet\n", net_proto->name );
+		return net_proto->tx ( iobuf, tcpip_protocol, st_dest,
+				       netdev, trans_csum );
+	}
+
+	/* No network layer can carry this address family: discard */
+	DBG ( "Unrecognised TCP/IP address family %d\n", st_dest->st_family );
+	free_iob ( iobuf );
+	return -EAFNOSUPPORT;
+}
+
+/**
+ * Calculate continued TCP/IP checksum
+ *
+ * @v partial		Checksum of already-summed data, in network byte order
+ * @v data		Data buffer
+ * @v len		Length of data buffer
+ * @ret cksum		Updated checksum, in network byte order
+ *
+ * Folds the data block into an existing TCP/IP-style 16-bit checksum
+ * and returns the result in network byte order.
+ *
+ * Both the previously-summed data and the new data are assumed to
+ * start on even byte offsets.  If that does not hold then the input
+ * partial checksum, the output checksum, or both must be
+ * byte-swapped by the caller; working out which is left to the
+ * caller.
+ */
+uint16_t tcpip_continue_chksum ( uint16_t partial, const void *data,
+				 size_t len ) {
+	const uint8_t *bytes = data;
+	unsigned int sum = ( ( ~partial ) & 0xffff );
+	unsigned int word;
+	unsigned int offset;
+
+	for ( offset = 0 ; offset < len ; offset++ ) {
+		word = bytes[offset];
+		/* Odd bytes are swapped on little-endian systems, even
+		 * bytes on big-endian systems.
+		 */
+		word = ( ( offset & 1 ) ? be16_to_cpu ( word )
+					: le16_to_cpu ( word ) );
+		sum += word;
+		/* End-around carry keeps the sum within 16 bits */
+		if ( sum > 0xffff )
+			sum -= 0xffff;
+	}
+
+	return ( ~sum );
+}
+
+/**
+ * Calculate TCP/IP checksum
+ *
+ * @v data Data buffer
+ * @v len Length of data buffer
+ * @ret cksum Checksum, in network byte order
+ *
+ * Calculates a TCP/IP-style 16-bit checksum over the data block. The
+ * checksum is returned in network byte order.
+ *
+ * This is tcpip_continue_chksum() started from TCPIP_EMPTY_CSUM.
+ */
+uint16_t tcpip_chksum ( const void *data, size_t len ) {
+ return tcpip_continue_chksum ( TCPIP_EMPTY_CSUM, data, len );
+}
diff --git a/gpxe/src/net/tls.c b/gpxe/src/net/tls.c
new file mode 100644
index 00000000..834686fb
--- /dev/null
+++ b/gpxe/src/net/tls.c
@@ -0,0 +1,1731 @@
+/*
+ * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/**
+ * @file
+ *
+ * Transport Layer Security Protocol
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <byteswap.h>
+#include <gpxe/hmac.h>
+#include <gpxe/md5.h>
+#include <gpxe/sha1.h>
+#include <gpxe/aes.h>
+#include <gpxe/rsa.h>
+#include <gpxe/xfer.h>
+#include <gpxe/open.h>
+#include <gpxe/filter.h>
+#include <gpxe/tls.h>
+
+static int tls_send_plaintext ( struct tls_session *tls, unsigned int type,
+ const void *data, size_t len );
+static void tls_clear_cipher ( struct tls_session *tls,
+ struct tls_cipherspec *cipherspec );
+
+/**
+ * Free TLS session
+ *
+ * @v refcnt		Reference counter
+ *
+ * Releases the four cipher specifications, the stored RSA public key
+ * components and the receive buffer, then the session itself.
+ */
+static void free_tls ( struct refcnt *refcnt ) {
+	struct tls_session *session =
+		container_of ( refcnt, struct tls_session, refcnt );
+	struct tls_cipherspec *specs[] = {
+		&session->tx_cipherspec,
+		&session->tx_cipherspec_pending,
+		&session->rx_cipherspec,
+		&session->rx_cipherspec_pending,
+	};
+	unsigned int idx;
+
+	/* Release everything hanging off the session */
+	for ( idx = 0 ; idx < ( sizeof ( specs ) / sizeof ( specs[0] ) ) ;
+	      idx++ ) {
+		tls_clear_cipher ( session, specs[idx] );
+	}
+	free ( session->rsa_mod );
+	free ( session->rsa_pub_exp );
+	free ( session->rx_data );
+
+	/* Release the session structure itself */
+	free ( session );
+}
+
+/**
+ * Finish with TLS session
+ *
+ * @v tls TLS session
+ * @v rc Status code
+ *
+ * Removes the session's process and closes both data transfer
+ * interfaces, passing the same status code to each.
+ */
+static void tls_close ( struct tls_session *tls, int rc ) {
+
+ /* Remove process */
+ process_del ( &tls->process );
+
+ /* Close ciphertext and plaintext streams. Each interface is
+ * nullified before it is closed.
+ * NOTE(review): presumably so that closing cannot call back into
+ * this session; confirm against xfer_nullify().
+ */
+ xfer_nullify ( &tls->cipherstream.xfer );
+ xfer_close ( &tls->cipherstream.xfer, rc );
+ xfer_nullify ( &tls->plainstream.xfer );
+ xfer_close ( &tls->plainstream.xfer, rc );
+}
+
+/******************************************************************************
+ *
+ * Random number generation
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Generate random data
+ *
+ * @v data		Buffer to fill
+ * @v len		Length of buffer
+ *
+ * WARNING: no random source is used.  Every byte of the buffer is
+ * set to the constant 0x01, so the output is entirely predictable.
+ */
+static void tls_generate_random ( void *data, size_t len ) {
+	uint8_t *fill = data;
+	uint8_t *end = ( fill + len );
+
+	/* FIXME: Some real random data source would be nice... */
+	while ( fill < end )
+		*(fill++) = 0x01;
+}
+
+/**
+ * Update HMAC with a list of ( data, len ) pairs
+ *
+ * @v digest		Hash function to use
+ * @v digest_ctx	Digest context
+ * @v args		( data, len ) pairs of data, terminated by NULL
+ *
+ * Each pair is a pointer followed by a size_t length.  The list ends
+ * at the first NULL data pointer; no length follows the terminator.
+ */
+static void tls_hmac_update_va ( struct crypto_algorithm *digest,
+				 void *digest_ctx, va_list args ) {
+	for ( ; ; ) {
+		void *chunk = va_arg ( args, void * );
+		size_t chunk_len;
+
+		/* NULL pointer terminates the list */
+		if ( ! chunk )
+			break;
+		chunk_len = va_arg ( args, size_t );
+		hmac_update ( digest, digest_ctx, chunk, chunk_len );
+	}
+}
+
+/**
+ * Generate secure pseudo-random data using a single hash function
+ *
+ * @v tls TLS session
+ * @v digest Hash function to use
+ * @v secret Secret
+ * @v secret_len Length of secret
+ * @v out Output buffer
+ * @v out_len Length of output buffer
+ * @v seeds ( data, len ) pairs of seed data, terminated by NULL
+ *
+ * The output is produced one digest-sized fragment at a time. Each
+ * fragment is the HMAC of the current A value followed by the seeds.
+ * The first A value is the HMAC of the seeds alone, and each later A
+ * value is the HMAC of the previous one.
+ */
+static void tls_p_hash_va ( struct tls_session *tls,
+ struct crypto_algorithm *digest,
+ void *secret, size_t secret_len,
+ void *out, size_t out_len,
+ va_list seeds ) {
+ uint8_t secret_copy[secret_len];
+ uint8_t digest_ctx[digest->ctxsize];
+ uint8_t digest_ctx_partial[digest->ctxsize];
+ uint8_t a[digest->digestsize];
+ uint8_t out_tmp[digest->digestsize];
+ size_t frag_len = digest->digestsize;
+ va_list tmp;
+
+ /* Copy the secret, in case HMAC modifies it */
+ memcpy ( secret_copy, secret, secret_len );
+ secret = secret_copy;
+ DBGC2 ( tls, "TLS %p %s secret:\n", tls, digest->name );
+ DBGC2_HD ( tls, secret, secret_len );
+
+ /* Calculate A(1). The seed list is walked several times in this
+ * function, so each walk uses its own copy of the va_list.
+ */
+ hmac_init ( digest, digest_ctx, secret, &secret_len );
+ va_copy ( tmp, seeds );
+ tls_hmac_update_va ( digest, digest_ctx, tmp );
+ va_end ( tmp );
+ hmac_final ( digest, digest_ctx, secret, &secret_len, a );
+ DBGC2 ( tls, "TLS %p %s A(1):\n", tls, digest->name );
+ DBGC2_HD ( tls, &a, sizeof ( a ) );
+
+ /* Generate as much data as required */
+ while ( out_len ) {
+ /* Calculate output portion. The context is snapshotted
+ * after hashing only the current A value; the snapshot is
+ * finalised further down to give the next A value.
+ */
+ hmac_init ( digest, digest_ctx, secret, &secret_len );
+ hmac_update ( digest, digest_ctx, a, sizeof ( a ) );
+ memcpy ( digest_ctx_partial, digest_ctx, digest->ctxsize );
+ va_copy ( tmp, seeds );
+ tls_hmac_update_va ( digest, digest_ctx, tmp );
+ va_end ( tmp );
+ hmac_final ( digest, digest_ctx,
+ secret, &secret_len, out_tmp );
+
+ /* Copy output; the final fragment may be shorter than one
+ * digest.
+ */
+ if ( frag_len > out_len )
+ frag_len = out_len;
+ memcpy ( out, out_tmp, frag_len );
+ DBGC2 ( tls, "TLS %p %s output:\n", tls, digest->name );
+ DBGC2_HD ( tls, out, frag_len );
+
+ /* Calculate A(i) by finalising the snapshot taken above */
+ hmac_final ( digest, digest_ctx_partial,
+ secret, &secret_len, a );
+ DBGC2 ( tls, "TLS %p %s A(n):\n", tls, digest->name );
+ DBGC2_HD ( tls, &a, sizeof ( a ) );
+
+ out += frag_len;
+ out_len -= frag_len;
+ }
+}
+
+/**
+ * Generate secure pseudo-random data
+ *
+ * @v tls		TLS session
+ * @v secret		Secret
+ * @v secret_len	Length of secret
+ * @v out		Output buffer
+ * @v out_len		Length of output buffer
+ * @v ...		( data, len ) pairs of seed data, terminated by NULL
+ *
+ * The secret is split into two halves (sharing the middle byte when
+ * its length is odd).  The first half keys an MD5-based expansion of
+ * the seeds and the second half keys a SHA-1-based expansion; the
+ * output is the XOR of the two expansions.
+ */
+static void tls_prf ( struct tls_session *tls, void *secret, size_t secret_len,
+		      void *out, size_t out_len, ... ) {
+	va_list seeds;
+	va_list tmp;
+	size_t subsecret_len;
+	void *md5_secret;
+	void *sha1_secret;
+	uint8_t out_md5[out_len];
+	uint8_t out_sha1[out_len];
+	unsigned int i;
+
+	va_start ( seeds, out_len );
+
+	/* Split secret into two, with an overlap of up to one byte */
+	subsecret_len = ( ( secret_len + 1 ) / 2 );
+	md5_secret = secret;
+	sha1_secret = ( secret + secret_len - subsecret_len );
+
+	/* Calculate MD5 portion.  Hand over the copy, so that the
+	 * original seed list is left untouched for the second pass.
+	 */
+	va_copy ( tmp, seeds );
+	tls_p_hash_va ( tls, &md5_algorithm, md5_secret, subsecret_len,
+			out_md5, out_len, tmp );
+	va_end ( tmp );
+
+	/* Calculate SHA1 portion */
+	va_copy ( tmp, seeds );
+	tls_p_hash_va ( tls, &sha1_algorithm, sha1_secret, subsecret_len,
+			out_sha1, out_len, tmp );
+	va_end ( tmp );
+
+	/* XOR the two portions together into the final output buffer */
+	for ( i = 0 ; i < out_len ; i++ ) {
+		*( ( uint8_t * ) out + i ) = ( out_md5[i] ^ out_sha1[i] );
+	}
+
+	va_end ( seeds );
+}
+
+/**
+ * Generate secure pseudo-random data
+ *
+ * @v tls TLS session
+ * @v secret Secret
+ * @v secret_len Length of secret
+ * @v out Output buffer
+ * @v out_len Length of output buffer
+ * @v label String literal label
+ * @v ... ( data, len ) pairs of seed data
+ *
+ * Wraps tls_prf(), passing the label (without its terminating NUL)
+ * as the first seed and appending the NULL list terminator. The
+ * label must be a string literal or array, since its length is taken
+ * with sizeof.
+ */
+#define tls_prf_label( tls, secret, secret_len, out, out_len, label, ... ) \
+ tls_prf ( (tls), (secret), (secret_len), (out), (out_len), \
+ label, ( sizeof ( label ) - 1 ), __VA_ARGS__, NULL )
+
+/******************************************************************************
+ *
+ * Secret management
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Generate master secret
+ *
+ * @v tls		TLS session
+ *
+ * The pre-master secret and the client and server random values must
+ * already be known.  The master secret is derived from the
+ * pre-master secret using the label "master secret" followed by the
+ * client and then the server random bytes.
+ */
+static void tls_generate_master_secret ( struct tls_session *tls ) {
+	DBGC ( tls, "TLS %p pre-master-secret:\n", tls );
+	DBGC_HD ( tls, &tls->pre_master_secret,
+		  sizeof ( tls->pre_master_secret ) );
+	DBGC ( tls, "TLS %p client random bytes:\n", tls );
+	/* Dump the client random using its own size */
+	DBGC_HD ( tls, &tls->client_random, sizeof ( tls->client_random ) );
+	DBGC ( tls, "TLS %p server random bytes:\n", tls );
+	DBGC_HD ( tls, &tls->server_random, sizeof ( tls->server_random ) );
+
+	tls_prf_label ( tls, tls->pre_master_secret,
+			sizeof ( tls->pre_master_secret ),
+			tls->master_secret, sizeof ( tls->master_secret ),
+			"master secret",
+			tls->client_random, sizeof ( tls->client_random ),
+			tls->server_random, sizeof ( tls->server_random ) );
+
+	DBGC ( tls, "TLS %p generated master secret:\n", tls );
+	DBGC_HD ( tls, &tls->master_secret, sizeof ( tls->master_secret ) );
+}
+
+/**
+ * Generate key material
+ *
+ * @v tls		TLS session
+ * @ret rc		Return status code
+ *
+ * The master secret must already be known.  A key block is derived
+ * from it and split, in order, into the TX MAC secret, RX MAC secret,
+ * TX key, RX key, TX IV and RX IV, which are installed into the
+ * pending TX and RX cipher specifications.
+ */
+static int tls_generate_keys ( struct tls_session *tls ) {
+	struct tls_cipherspec *tx_cipherspec = &tls->tx_cipherspec_pending;
+	struct tls_cipherspec *rx_cipherspec = &tls->rx_cipherspec_pending;
+	size_t hash_size = tx_cipherspec->digest->digestsize;
+	size_t key_size = tx_cipherspec->key_len;
+	size_t iv_size = tx_cipherspec->cipher->blocksize;
+	size_t total = ( 2 * ( hash_size + key_size + iv_size ) );
+	uint8_t key_block[total];
+	uint8_t *key;
+	int rc;
+
+	/* Generate key block */
+	tls_prf_label ( tls, tls->master_secret, sizeof ( tls->master_secret ),
+			key_block, sizeof ( key_block ), "key expansion",
+			tls->server_random, sizeof ( tls->server_random ),
+			tls->client_random, sizeof ( tls->client_random ) );
+
+	/* Split key block into portions */
+	key = key_block;
+
+	/* TX MAC secret */
+	memcpy ( tx_cipherspec->mac_secret, key, hash_size );
+	DBGC ( tls, "TLS %p TX MAC secret:\n", tls );
+	DBGC_HD ( tls, key, hash_size );
+	key += hash_size;
+
+	/* RX MAC secret */
+	memcpy ( rx_cipherspec->mac_secret, key, hash_size );
+	DBGC ( tls, "TLS %p RX MAC secret:\n", tls );
+	DBGC_HD ( tls, key, hash_size );
+	key += hash_size;
+
+	/* TX key */
+	if ( ( rc = cipher_setkey ( tx_cipherspec->cipher,
+				    tx_cipherspec->cipher_ctx,
+				    key, key_size ) ) != 0 ) {
+		DBGC ( tls, "TLS %p could not set TX key: %s\n",
+		       tls, strerror ( rc ) );
+		return rc;
+	}
+	DBGC ( tls, "TLS %p TX key:\n", tls );
+	DBGC_HD ( tls, key, key_size );
+	key += key_size;
+
+	/* RX key */
+	if ( ( rc = cipher_setkey ( rx_cipherspec->cipher,
+				    rx_cipherspec->cipher_ctx,
+				    key, key_size ) ) != 0 ) {
+		DBGC ( tls, "TLS %p could not set RX key: %s\n",
+		       tls, strerror ( rc ) );
+		return rc;
+	}
+
+	/* FIXME: AES needs to be fixed to not require this */
+	AES_convert_key ( rx_cipherspec->cipher_ctx );
+
+	DBGC ( tls, "TLS %p RX key:\n", tls );
+	DBGC_HD ( tls, key, key_size );
+	key += key_size;
+
+	/* TX initialisation vector */
+	cipher_setiv ( tx_cipherspec->cipher, tx_cipherspec->cipher_ctx, key );
+	DBGC ( tls, "TLS %p TX IV:\n", tls );
+	DBGC_HD ( tls, key, iv_size );
+	key += iv_size;
+
+	/* RX initialisation vector */
+	cipher_setiv ( rx_cipherspec->cipher, rx_cipherspec->cipher_ctx, key );
+	DBGC ( tls, "TLS %p RX IV:\n", tls );
+	DBGC_HD ( tls, key, iv_size );
+	key += iv_size;
+
+	/* The whole key block must have been consumed */
+	assert ( ( key_block + total ) == key );
+
+	return 0;
+}
+
+/******************************************************************************
+ *
+ * Cipher suite management
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Clear cipher suite
+ *
+ * @v tls		TLS session (unused)
+ * @v cipherspec	TLS cipher specification
+ *
+ * Frees the dynamically-allocated storage and resets the whole
+ * structure, leaving every algorithm set to the null algorithm.
+ */
+static void tls_clear_cipher ( struct tls_session *tls __unused,
+			       struct tls_cipherspec *cipherspec ) {
+	free ( cipherspec->dynamic );
+	/* Zero the entire structure rather than just a pointer's worth
+	 * of it, so that no stale pointer into the freed storage
+	 * survives to be used or freed a second time.
+	 */
+	memset ( cipherspec, 0, sizeof ( *cipherspec ) );
+	cipherspec->pubkey = &crypto_null;
+	cipherspec->cipher = &crypto_null;
+	cipherspec->digest = &crypto_null;
+}
+
+/**
+ * Set cipher suite
+ *
+ * @v tls		TLS session
+ * @v cipherspec	TLS cipher specification
+ * @v pubkey		Public-key encryption algorithm
+ * @v cipher		Bulk encryption cipher algorithm
+ * @v digest		MAC digest algorithm
+ * @v key_len		Key length
+ * @ret rc		Return status code
+ *
+ * Any previous contents of the cipher specification are cleared
+ * first.  One zeroed allocation then provides, in order, the
+ * public-key context, two cipher contexts and the MAC secret.
+ */
+static int tls_set_cipher ( struct tls_session *tls,
+			    struct tls_cipherspec *cipherspec,
+			    struct crypto_algorithm *pubkey,
+			    struct crypto_algorithm *cipher,
+			    struct crypto_algorithm *digest,
+			    size_t key_len ) {
+	size_t total;
+	void *storage;
+	void *next;
+
+	/* Discard whatever the cipher specification held before */
+	tls_clear_cipher ( tls, cipherspec );
+
+	/* Obtain one zeroed block to hold every context */
+	total = ( pubkey->ctxsize + 2 * cipher->ctxsize + digest->digestsize );
+	storage = malloc ( total );
+	if ( ! storage ) {
+		DBGC ( tls, "TLS %p could not allocate %zd bytes for crypto "
+		       "context\n", tls, total );
+		return -ENOMEM;
+	}
+	memset ( storage, 0, total );
+
+	/* Carve the block up */
+	next = storage;
+	cipherspec->dynamic = storage;
+	cipherspec->pubkey_ctx = next;
+	next += pubkey->ctxsize;
+	cipherspec->cipher_ctx = next;
+	next += cipher->ctxsize;
+	cipherspec->cipher_next_ctx = next;
+	next += cipher->ctxsize;
+	cipherspec->mac_secret = next;
+	next += digest->digestsize;
+	assert ( ( cipherspec->dynamic + total ) == next );
+
+	/* Record the algorithms and key length */
+	cipherspec->pubkey = pubkey;
+	cipherspec->cipher = cipher;
+	cipherspec->digest = digest;
+	cipherspec->key_len = key_len;
+
+	return 0;
+}
+
+/**
+ * Select next cipher suite
+ *
+ * @v tls		TLS session
+ * @v cipher_suite	Cipher suite specification
+ * @ret rc		Return status code
+ *
+ * The cipher suite is compared against htons()-converted constants,
+ * i.e. it is expected in network byte order.  The chosen algorithms
+ * are installed into both pending cipher specifications.
+ */
+static int tls_select_cipher ( struct tls_session *tls,
+			       unsigned int cipher_suite ) {
+	struct crypto_algorithm *pubkey = &crypto_null;
+	struct crypto_algorithm *cipher;
+	struct crypto_algorithm *digest;
+	unsigned int key_len;
+	int rc;
+
+	/* Determine the key length of a supported suite */
+	switch ( cipher_suite ) {
+	case htons ( TLS_RSA_WITH_AES_128_CBC_SHA ):
+		key_len = ( 128 / 8 );
+		break;
+	case htons ( TLS_RSA_WITH_AES_256_CBC_SHA ):
+		key_len = ( 256 / 8 );
+		break;
+	default:
+		DBGC ( tls, "TLS %p does not support cipher %04x\n",
+		       tls, ntohs ( cipher_suite ) );
+		return -ENOTSUP;
+	}
+
+	/* Both supported suites pair AES with SHA-1 */
+	cipher = &aes_algorithm;
+	digest = &sha1_algorithm;
+
+	/* Set ciphers */
+	rc = tls_set_cipher ( tls, &tls->tx_cipherspec_pending, pubkey,
+			      cipher, digest, key_len );
+	if ( rc != 0 )
+		return rc;
+	rc = tls_set_cipher ( tls, &tls->rx_cipherspec_pending, pubkey,
+			      cipher, digest, key_len );
+	if ( rc != 0 )
+		return rc;
+
+	DBGC ( tls, "TLS %p selected %s-%s-%d-%s\n", tls,
+	       pubkey->name, cipher->name, ( key_len * 8 ), digest->name );
+
+	return 0;
+}
+
+/**
+ * Activate next cipher suite
+ *
+ * @v tls		TLS session
+ * @v pending		Pending cipher specification
+ * @v active		Active cipher specification to replace
+ * @ret rc		Return status code
+ *
+ * Refuses to activate a specification whose cipher or digest is
+ * still the null algorithm.  Otherwise the active specification is
+ * cleared and then swapped with the pending one.
+ */
+static int tls_change_cipher ( struct tls_session *tls,
+			       struct tls_cipherspec *pending,
+			       struct tls_cipherspec *active ) {
+	/* FIXME (when pubkey is not hard-coded to RSA): also refuse
+	 * when ( pending->pubkey == &crypto_null )
+	 */
+	int null_cipher = ( pending->cipher == &crypto_null );
+	int null_digest = ( pending->digest == &crypto_null );
+
+	/* Sanity check */
+	if ( null_cipher || null_digest ) {
+		DBGC ( tls, "TLS %p refusing to use null cipher\n", tls );
+		return -ENOTSUP;
+	}
+
+	tls_clear_cipher ( tls, active );
+	memswap ( active, pending, sizeof ( *active ) );
+	return 0;
+}
+
+/******************************************************************************
+ *
+ * Handshake verification
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Add handshake record to verification hash
+ *
+ * @v tls TLS session
+ * @v data Handshake record
+ * @v len Length of handshake record
+ */
+static void tls_add_handshake ( struct tls_session *tls,
+ const void *data, size_t len ) {
+
+ digest_update ( &md5_algorithm, tls->handshake_md5_ctx, data, len );
+ digest_update ( &sha1_algorithm, tls->handshake_sha1_ctx, data, len );
+}
+
+/**
+ * Calculate handshake verification hash
+ *
+ * @v tls		TLS session
+ * @v out		Output buffer
+ *
+ * Writes the MD5 digest immediately followed by the SHA-1 digest of
+ * all handshake messages seen so far.  The running digest contexts
+ * are finalised on private copies, so the session's own contexts can
+ * continue to accumulate later handshake messages.
+ */
+static void tls_verify_handshake ( struct tls_session *tls, void *out ) {
+	uint8_t md5_snapshot[md5_algorithm.ctxsize];
+	uint8_t sha1_snapshot[sha1_algorithm.ctxsize];
+	uint8_t *digest_out = out;
+
+	/* MD5 digest occupies the start of the output buffer */
+	memcpy ( md5_snapshot, tls->handshake_md5_ctx,
+		 sizeof ( md5_snapshot ) );
+	digest_final ( &md5_algorithm, md5_snapshot, digest_out );
+
+	/* SHA-1 digest follows directly after it */
+	memcpy ( sha1_snapshot, tls->handshake_sha1_ctx,
+		 sizeof ( sha1_snapshot ) );
+	digest_final ( &sha1_algorithm, sha1_snapshot,
+		       ( digest_out + md5_algorithm.digestsize ) );
+}
+
+/******************************************************************************
+ *
+ * Record handling
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Transmit Handshake record
+ *
+ * @v tls TLS session
+ * @v data Plaintext record
+ * @v len Length of plaintext record
+ * @ret rc Return status code
+ *
+ * The record is added to the handshake digests before transmission
+ * is attempted, so it is included even if sending then fails.
+ */
+static int tls_send_handshake ( struct tls_session *tls,
+ void *data, size_t len ) {
+
+ /* Add to handshake digest */
+ tls_add_handshake ( tls, data, len );
+
+ /* Send record */
+ return tls_send_plaintext ( tls, TLS_TYPE_HANDSHAKE, data, len );
+}
+
+/**
+ * Transmit Client Hello record
+ *
+ * @v tls TLS session
+ * @ret rc Return status code
+ *
+ * Offers TLS 1.0 with two cipher suites (AES-128-CBC-SHA and
+ * AES-256-CBC-SHA, both with RSA), a zero-length session ID and a
+ * single compression method whose value is zero.
+ */
+static int tls_send_client_hello ( struct tls_session *tls ) {
+ struct {
+ uint32_t type_length;
+ uint16_t version;
+ uint8_t random[32];
+ uint8_t session_id_len;
+ uint16_t cipher_suite_len;
+ uint16_t cipher_suites[2];
+ uint8_t compression_methods_len;
+ uint8_t compression_methods[1];
+ } __attribute__ (( packed )) hello;
+
+ /* Zeroing leaves session_id_len and compression_methods[0] as 0 */
+ memset ( &hello, 0, sizeof ( hello ) );
+ /* Message type goes in the first byte in memory (little-endian
+ * store) and the body length in the remaining bytes in network
+ * byte order; this relies on the length fitting in 24 bits.
+ */
+ hello.type_length = ( cpu_to_le32 ( TLS_CLIENT_HELLO ) |
+ htonl ( sizeof ( hello ) -
+ sizeof ( hello.type_length ) ) );
+ hello.version = htons ( TLS_VERSION_TLS_1_0 );
+ memcpy ( &hello.random, tls->client_random, sizeof ( hello.random ) );
+ hello.cipher_suite_len = htons ( sizeof ( hello.cipher_suites ) );
+ hello.cipher_suites[0] = htons ( TLS_RSA_WITH_AES_128_CBC_SHA );
+ hello.cipher_suites[1] = htons ( TLS_RSA_WITH_AES_256_CBC_SHA );
+ hello.compression_methods_len = sizeof ( hello.compression_methods );
+
+ return tls_send_handshake ( tls, &hello, sizeof ( hello ) );
+}
+
+/**
+ * Transmit Client Key Exchange record
+ *
+ * @v tls TLS session
+ * @ret rc Return status code
+ *
+ * Encrypts the pre-master secret with the RSA modulus and public
+ * exponent stored in the session and sends the result.
+ *
+ * NOTE(review): the results of RSA_pub_key_new() and RSA_encrypt()
+ * are not checked, and rsa_ctx is dereferenced unconditionally;
+ * confirm that neither call can fail and that tls->rsa_mod and
+ * tls->rsa_pub_exp are always populated before this point.
+ */
+static int tls_send_client_key_exchange ( struct tls_session *tls ) {
+ /* FIXME: Hack alert */
+ RSA_CTX *rsa_ctx;
+ RSA_pub_key_new ( &rsa_ctx, tls->rsa_mod, tls->rsa_mod_len,
+ tls->rsa_pub_exp, tls->rsa_pub_exp_len );
+ /* Declared after the context is created because the ciphertext
+ * array is sized from rsa_ctx->num_octets.
+ */
+ struct {
+ uint32_t type_length;
+ uint16_t encrypted_pre_master_secret_len;
+ uint8_t encrypted_pre_master_secret[rsa_ctx->num_octets];
+ } __attribute__ (( packed )) key_xchg;
+
+ memset ( &key_xchg, 0, sizeof ( key_xchg ) );
+ /* Message type in the first byte, body length in the remaining
+ * bytes in network byte order.
+ */
+ key_xchg.type_length = ( cpu_to_le32 ( TLS_CLIENT_KEY_EXCHANGE ) |
+ htonl ( sizeof ( key_xchg ) -
+ sizeof ( key_xchg.type_length ) ) );
+ key_xchg.encrypted_pre_master_secret_len
+ = htons ( sizeof ( key_xchg.encrypted_pre_master_secret ) );
+
+ /* FIXME: Hack alert */
+ DBGC ( tls, "RSA encrypting plaintext, modulus, exponent:\n" );
+ DBGC_HD ( tls, &tls->pre_master_secret,
+ sizeof ( tls->pre_master_secret ) );
+ DBGC_HD ( tls, tls->rsa_mod, tls->rsa_mod_len );
+ DBGC_HD ( tls, tls->rsa_pub_exp, tls->rsa_pub_exp_len );
+ RSA_encrypt ( rsa_ctx, tls->pre_master_secret,
+ sizeof ( tls->pre_master_secret ),
+ key_xchg.encrypted_pre_master_secret, 0 );
+ DBGC ( tls, "RSA encrypt done. Ciphertext:\n" );
+ DBGC_HD ( tls, &key_xchg.encrypted_pre_master_secret,
+ sizeof ( key_xchg.encrypted_pre_master_secret ) );
+ RSA_free ( rsa_ctx );
+
+
+ return tls_send_handshake ( tls, &key_xchg, sizeof ( key_xchg ) );
+}
+
+/**
+ * Transmit Change Cipher record
+ *
+ * @v tls TLS session
+ * @ret rc Return status code
+ *
+ * The record body is the single byte 0x01. It is sent directly with
+ * tls_send_plaintext() rather than via tls_send_handshake(), so it is
+ * not added to the handshake digests.
+ */
+static int tls_send_change_cipher ( struct tls_session *tls ) {
+ static const uint8_t change_cipher[1] = { 1 };
+ return tls_send_plaintext ( tls, TLS_TYPE_CHANGE_CIPHER,
+ change_cipher, sizeof ( change_cipher ) );
+}
+
+/**
+ * Transmit Finished record
+ *
+ * @v tls TLS session
+ * @ret rc Return status code
+ *
+ * The 12 bytes of verify data are derived from the master secret
+ * using the label "client finished" and the combined MD5+SHA-1 digest
+ * of the handshake messages seen so far.
+ */
+static int tls_send_finished ( struct tls_session *tls ) {
+ struct {
+ uint32_t type_length;
+ uint8_t verify_data[12];
+ } __attribute__ (( packed )) finished;
+ uint8_t digest[MD5_DIGEST_SIZE + SHA1_DIGEST_SIZE];
+
+ memset ( &finished, 0, sizeof ( finished ) );
+ /* Message type in the first byte, body length in the remaining
+ * bytes in network byte order.
+ */
+ finished.type_length = ( cpu_to_le32 ( TLS_FINISHED ) |
+ htonl ( sizeof ( finished ) -
+ sizeof ( finished.type_length ) ) );
+ /* Digest is taken before this message itself is hashed */
+ tls_verify_handshake ( tls, digest );
+ tls_prf_label ( tls, tls->master_secret, sizeof ( tls->master_secret ),
+ finished.verify_data, sizeof ( finished.verify_data ),
+ "client finished", digest, sizeof ( digest ) );
+
+ return tls_send_handshake ( tls, &finished, sizeof ( finished ) );
+}
+
+/**
+ * Receive new Change Cipher record
+ *
+ * @v tls		TLS session
+ * @v data		Plaintext record
+ * @v len		Length of plaintext record
+ * @ret rc		Return status code
+ *
+ * A valid record is exactly one byte with the value 1.  On success
+ * the pending RX cipher specification becomes active and the RX
+ * sequence number is set to all-ones.
+ */
+static int tls_new_change_cipher ( struct tls_session *tls,
+				   void *data, size_t len ) {
+	const uint8_t *payload = data;
+	int rc;
+
+	/* Validate the record contents */
+	if ( ( len != 1 ) || ( payload[0] != 1 ) ) {
+		DBGC ( tls, "TLS %p received invalid Change Cipher\n", tls );
+		DBGC_HD ( tls, data, len );
+		return -EINVAL;
+	}
+
+	/* Switch the receive side over to the pending cipher */
+	rc = tls_change_cipher ( tls, &tls->rx_cipherspec_pending,
+				 &tls->rx_cipherspec );
+	if ( rc != 0 ) {
+		DBGC ( tls, "TLS %p could not activate RX cipher: %s\n",
+		       tls, strerror ( rc ) );
+		return rc;
+	}
+	tls->rx_seq = ~( ( uint64_t ) 0 );
+
+	return 0;
+}
+
+/**
+ * Receive new Alert record
+ *
+ * @v tls		TLS session
+ * @v data		Plaintext record
+ * @v len		Length of plaintext record
+ * @ret rc		Return status code
+ *
+ * The record must be exactly two bytes long (level and description).
+ * Warning alerts are logged and otherwise ignored; fatal alerts and
+ * alerts of unknown level are reported as errors.
+ */
+static int tls_new_alert ( struct tls_session *tls, void *data, size_t len ) {
+	struct {
+		uint8_t level;
+		uint8_t description;
+		char next[0];
+	} __attribute__ (( packed )) *alert = data;
+
+	/* Sanity check: record must end exactly where the alert does */
+	if ( ( ( void * ) alert->next ) != ( data + len ) ) {
+		DBGC ( tls, "TLS %p received overlength Alert\n", tls );
+		DBGC_HD ( tls, data, len );
+		return -EINVAL;
+	}
+
+	if ( alert->level == TLS_ALERT_WARNING ) {
+		DBGC ( tls, "TLS %p received warning alert %d\n",
+		       tls, alert->description );
+		return 0;
+	}
+
+	if ( alert->level == TLS_ALERT_FATAL ) {
+		DBGC ( tls, "TLS %p received fatal alert %d\n",
+		       tls, alert->description );
+		return -EPERM;
+	}
+
+	DBGC ( tls, "TLS %p received unknown alert level %d"
+	       "(alert %d)\n", tls, alert->level, alert->description );
+	return -EIO;
+}
+
+/**
+ * Receive new Server Hello record
+ *
+ * @v tls		TLS session
+ * @v data		Plaintext record
+ * @v len		Length of plaintext record
+ * @ret rc		Return status code
+ *
+ * Validates the record length and protocol version, stores the
+ * server random bytes, selects the cipher suite chosen by the server
+ * and then derives the master secret and key material.
+ */
+static int tls_new_server_hello ( struct tls_session *tls,
+				  void *data, size_t len ) {
+	struct {
+		uint32_t type_length;
+		uint16_t version;
+		uint8_t random[32];
+		uint8_t session_id_len;
+		char next[0];
+	} __attribute__ (( packed )) *hello_a = data;
+	int rc;
+
+	/* The fixed-size portion must be present before session_id_len
+	 * can safely be read from it.
+	 */
+	if ( len < sizeof ( *hello_a ) ) {
+		DBGC ( tls, "TLS %p received underlength Server Hello\n",
+		       tls );
+		DBGC_HD ( tls, data, len );
+		return -EINVAL;
+	}
+
+	/* The layout of the remainder depends on the session ID length */
+	struct {
+		uint8_t session_id[hello_a->session_id_len];
+		uint16_t cipher_suite;
+		uint8_t compression_method;
+		char next[0];
+	} __attribute__ (( packed )) *hello_b = ( void * ) &hello_a->next;
+	void *end = hello_b->next;
+
+	/* Sanity check: record must end exactly where the hello does */
+	if ( end != ( data + len ) ) {
+		DBGC ( tls, "TLS %p received overlength Server Hello\n", tls );
+		DBGC_HD ( tls, data, len );
+		return -EINVAL;
+	}
+
+	/* Check protocol version */
+	if ( ntohs ( hello_a->version ) < TLS_VERSION_TLS_1_0 ) {
+		DBGC ( tls, "TLS %p does not support protocol version %d.%d\n",
+		       tls, ( ntohs ( hello_a->version ) >> 8 ),
+		       ( ntohs ( hello_a->version ) & 0xff ) );
+		return -ENOTSUP;
+	}
+
+	/* Copy out server random bytes */
+	memcpy ( tls->server_random, hello_a->random,
+		 sizeof ( tls->server_random ) );
+
+	/* Select cipher suite */
+	if ( ( rc = tls_select_cipher ( tls, hello_b->cipher_suite ) ) != 0 )
+		return rc;
+
+	/* Generate secrets */
+	tls_generate_master_secret ( tls );
+	if ( ( rc = tls_generate_keys ( tls ) ) != 0 )
+		return rc;
+
+	return 0;
+}
+
+/**
+ * Receive new Certificate record
+ *
+ * @v tls TLS session
+ * @v data Plaintext record
+ * @v len Length of plaintext record
+ * @ret rc Return status code
+ *
+ * Walks the ASN.1 structure of the first certificate in the record
+ * until it reaches a bit string, then reads two integers from the
+ * sequence inside it and stores them as the RSA modulus and public
+ * exponent. No other part of the certificate is examined here.
+ *
+ * NOTE(review): len is ignored, so parsing is not bounded by the
+ * record length, and the debug dumps on failure read 64 bytes from
+ * the current offset regardless of how much data remains.
+ *
+ * NOTE(review): any values already held in tls->rsa_mod and
+ * tls->rsa_pub_exp are overwritten without being freed; free_tls()
+ * frees these pointers, so a second Certificate record looks like a
+ * leak. The results of asn1_get_int() are also stored unchecked.
+ * Confirm both against the ASN.1 helper implementation.
+ */
+static int tls_new_certificate ( struct tls_session *tls,
+ void *data, size_t len ) {
+ struct {
+ uint32_t type_length;
+ uint8_t length[3];
+ uint8_t first_cert_length[3];
+ uint8_t asn1_start[0];
+ } __attribute__ (( packed )) *certificate = data;
+ uint8_t *cert = certificate->asn1_start;
+ int offset = 0;
+
+ /* FIXME */
+ (void) len;
+
+ /* Descend into / skip over objects up to the bit string.
+ * NOTE(review): presumably this follows the X.509 field order
+ * down to the subject public key; verify against the X.509
+ * structure definition.
+ */
+ if (asn1_next_obj(cert, &offset, ASN1_SEQUENCE) < 0 ||
+ asn1_next_obj(cert, &offset, ASN1_SEQUENCE) < 0 ||
+ asn1_skip_obj(cert, &offset, ASN1_EXPLICIT_TAG) ||
+ asn1_skip_obj(cert, &offset, ASN1_INTEGER) ||
+ asn1_skip_obj(cert, &offset, ASN1_SEQUENCE) ||
+ asn1_skip_obj(cert, &offset, ASN1_SEQUENCE) ||
+ asn1_skip_obj(cert, &offset, ASN1_SEQUENCE) ||
+ asn1_skip_obj(cert, &offset, ASN1_SEQUENCE) ||
+ asn1_next_obj(cert, &offset, ASN1_SEQUENCE) < 0 ||
+ asn1_skip_obj(cert, &offset, ASN1_SEQUENCE) ||
+ asn1_next_obj(cert, &offset, ASN1_BIT_STRING) < 0) {
+ DBGC ( tls, "TLS %p invalid certificate\n", tls );
+ DBGC_HD ( tls, cert + offset, 64 );
+ return -EPERM;
+ }
+
+ /* Step over one byte at the start of the bit string contents.
+ * NOTE(review): presumably the unused-bits count; confirm.
+ */
+ offset++;
+
+ if (asn1_next_obj(cert, &offset, ASN1_SEQUENCE) < 0) {
+ DBGC ( tls, "TLS %p invalid certificate\n", tls );
+ DBGC_HD ( tls, cert + offset, 64 );
+ return -EPERM;
+ }
+
+ /* First integer is stored as the modulus, second as the exponent */
+ tls->rsa_mod_len = asn1_get_int(cert, &offset, &tls->rsa_mod);
+ tls->rsa_pub_exp_len = asn1_get_int(cert, &offset, &tls->rsa_pub_exp);
+
+ DBGC_HD ( tls, tls->rsa_mod, tls->rsa_mod_len );
+ DBGC_HD ( tls, tls->rsa_pub_exp, tls->rsa_pub_exp_len );
+
+ return 0;
+}
+
+/**
+ * Receive new Server Hello Done record
+ *
+ * @v tls		TLS session
+ * @v data		Plaintext record
+ * @v len		Length of plaintext record
+ * @ret rc		Return status code
+ *
+ * The record must consist of the handshake header alone.  Provided
+ * that the transmit state machine is idle, it is advanced to send
+ * the Client Key Exchange.
+ */
+static int tls_new_server_hello_done ( struct tls_session *tls,
+				       void *data, size_t len ) {
+	struct {
+		uint32_t type_length;
+		char next[0];
+	} __attribute__ (( packed )) *hello_done = data;
+
+	/* Sanity check: record must end exactly after the header */
+	if ( ( ( void * ) hello_done->next ) != ( data + len ) ) {
+		DBGC ( tls, "TLS %p received overlength Server Hello Done\n",
+		       tls );
+		DBGC_HD ( tls, data, len );
+		return -EINVAL;
+	}
+
+	/* We can start the Client Key Exchange only from the idle state */
+	if ( tls->tx_state != TLS_TX_NONE ) {
+		DBGC ( tls, "TLS %p received Server Hello Done while in "
+		       "TX state %d\n", tls, tls->tx_state );
+		return -EIO;
+	}
+
+	/* Kick off transmission of the Client Key Exchange */
+	tls->tx_state = TLS_TX_CLIENT_KEY_EXCHANGE;
+
+	return 0;
+}
+
+/**
+ * Receive new Finished record
+ *
+ * @v tls TLS session
+ * @v data Plaintext record
+ * @v len Length of plaintext record
+ * @ret rc Return status code
+ *
+ * NOTE(review): the record contents are not inspected at all, so the
+ * server's verify data is never checked; the transmit state simply
+ * moves to TLS_TX_DATA and success is returned.
+ */
+static int tls_new_finished ( struct tls_session *tls,
+ void *data, size_t len ) {
+
+ /* FIXME: Handle this properly */
+ tls->tx_state = TLS_TX_DATA;
+ ( void ) data;
+ ( void ) len;
+ return 0;
+}
+
+/**
+ * Receive new Handshake record
+ *
+ * @v tls		TLS session
+ * @v data		Plaintext record
+ * @v len		Length of plaintext record
+ * @ret rc		Return status code
+ *
+ * Dispatches on the first byte of the record, which holds the
+ * handshake message type.  Unrecognised types are ignored.  The
+ * whole record is treated as a single handshake message.
+ *
+ * The record is added to the handshake digests after it has been
+ * processed, whether or not processing succeeded.
+ */
+static int tls_new_handshake ( struct tls_session *tls,
+			       void *data, size_t len ) {
+	uint8_t *type = data;
+	int rc;
+
+	/* An empty record has no type byte to dispatch on */
+	if ( len < sizeof ( *type ) ) {
+		DBGC ( tls, "TLS %p received empty Handshake record\n", tls );
+		return -EINVAL;
+	}
+
+	switch ( *type ) {
+	case TLS_SERVER_HELLO:
+		rc = tls_new_server_hello ( tls, data, len );
+		break;
+	case TLS_CERTIFICATE:
+		rc = tls_new_certificate ( tls, data, len );
+		break;
+	case TLS_SERVER_HELLO_DONE:
+		rc = tls_new_server_hello_done ( tls, data, len );
+		break;
+	case TLS_FINISHED:
+		rc = tls_new_finished ( tls, data, len );
+		break;
+	default:
+		DBGC ( tls, "TLS %p ignoring handshake type %d\n",
+		       tls, *type );
+		rc = 0;
+		break;
+	}
+
+	/* Add to handshake digest (except for Hello Requests, which
+	 * are explicitly excluded).
+	 */
+	if ( *type != TLS_HELLO_REQUEST )
+		tls_add_handshake ( tls, data, len );
+
+	return rc;
+}
+
+/**
+ * Dispatch a decrypted record according to its record type
+ *
+ * @v tls		TLS session
+ * @v type		Record type
+ * @v data		Plaintext record
+ * @v len		Length of plaintext record
+ * @ret rc		Return status code
+ *
+ * Application data is delivered to the plaintext stream; the other
+ * known record types are passed to their handlers.  Unknown record
+ * types are ignored and reported as success.
+ */
+static int tls_new_record ( struct tls_session *tls,
+			    unsigned int type, void *data, size_t len ) {
+	int rc;
+
+	switch ( type ) {
+	case TLS_TYPE_DATA:
+		rc = xfer_deliver_raw ( &tls->plainstream.xfer, data, len );
+		break;
+	case TLS_TYPE_HANDSHAKE:
+		rc = tls_new_handshake ( tls, data, len );
+		break;
+	case TLS_TYPE_ALERT:
+		rc = tls_new_alert ( tls, data, len );
+		break;
+	case TLS_TYPE_CHANGE_CIPHER:
+		rc = tls_new_change_cipher ( tls, data, len );
+		break;
+	default:
+		/* RFC4346 says that we should just ignore unknown
+		 * record types.
+		 */
+		DBGC ( tls, "TLS %p ignoring record type %d\n", tls, type );
+		rc = 0;
+		break;
+	}
+
+	return rc;
+}
+
+/******************************************************************************
+ *
+ * Record encryption/decryption
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Calculate HMAC
+ *
+ * @v tls TLS session (unused)
+ * @v cipherspec Cipher specification
+ * @v seq Sequence number
+ * @v tlshdr TLS header
+ * @v data Data
+ * @v len Length of data
+ * @v hmac HMAC to fill in
+ *
+ * The HMAC is keyed with the cipher specification's MAC secret and
+ * covers, in order: the sequence number as a big-endian 64-bit value,
+ * the TLS header, and the data.  The digest context lives on the
+ * stack and is sized from the digest algorithm's ctxsize.
+ */
+static void tls_hmac ( struct tls_session *tls __unused,
+ struct tls_cipherspec *cipherspec,
+ uint64_t seq, struct tls_header *tlshdr,
+ const void *data, size_t len, void *hmac ) {
+ struct crypto_algorithm *digest = cipherspec->digest;
+ uint8_t digest_ctx[digest->ctxsize];
+
+ /* The key length is passed by pointer as the digest size */
+ hmac_init ( digest, digest_ctx, cipherspec->mac_secret,
+ &digest->digestsize );
+ /* Convert the local copy of the sequence number to big-endian */
+ seq = cpu_to_be64 ( seq );
+ hmac_update ( digest, digest_ctx, &seq, sizeof ( seq ) );
+ hmac_update ( digest, digest_ctx, tlshdr, sizeof ( *tlshdr ) );
+ hmac_update ( digest, digest_ctx, data, len );
+ hmac_final ( digest, digest_ctx, cipherspec->mac_secret,
+ &digest->digestsize, hmac );
+}
+
+/**
+ * Allocate and assemble stream-ciphered record from data and MAC portions
+ *
+ * @v tls		TLS session
+ * @v data		Data
+ * @v len		Length of data
+ * @v digest		MAC digest
+ * @ret plaintext_len	Length of plaintext record
+ * @ret plaintext	Allocated plaintext record, or NULL
+ *
+ * The record is the data immediately followed by the MAC, whose
+ * length is the digest size of the TX cipher specification.  The
+ * length is written to @c plaintext_len even if allocation fails.
+ * The record is obtained from malloc().
+ */
+static void * __malloc tls_assemble_stream ( struct tls_session *tls,
+				    const void *data, size_t len,
+				    void *digest, size_t *plaintext_len ) {
+	size_t digest_len = tls->tx_cipherspec.digest->digestsize;
+	size_t total_len = ( len + digest_len );
+	void *record;
+
+	/* Report the stream-ciphered struct length */
+	*plaintext_len = total_len;
+
+	/* Allocate and fill in the struct: content, then MAC */
+	record = malloc ( total_len );
+	if ( record ) {
+		memcpy ( record, data, len );
+		memcpy ( ( record + len ), digest, digest_len );
+	}
+
+	return record;
+}
+
+/**
+ * Allocate and assemble block-ciphered record from data and MAC portions
+ *
+ * @v tls TLS session
+ * @v data Data
+ * @v len Length of data
+ * @v digest MAC digest
+ * @ret plaintext_len Length of plaintext record
+ * @ret plaintext Allocated plaintext record, or NULL
+ *
+ * The record layout is: IV (currently zero-length), content, MAC,
+ * padding bytes, and a final padding-length byte.  The length is
+ * written to @c plaintext_len even if allocation fails.  The record
+ * is obtained from malloc().
+ */
+static void * tls_assemble_block ( struct tls_session *tls,
+ const void *data, size_t len,
+ void *digest, size_t *plaintext_len ) {
+ size_t blocksize = tls->tx_cipherspec.cipher->blocksize;
+ size_t iv_len = blocksize;
+ size_t mac_len = tls->tx_cipherspec.digest->digestsize;
+ size_t padding_len;
+ void *plaintext;
+ void *iv;
+ void *content;
+ void *mac;
+ void *padding;
+
+ /* FIXME: TLSv1.1 has an explicit IV */
+ iv_len = 0;
+
+ /* Calculate block-ciphered struct length.  The padding length
+ * is the negated unpadded length masked with ( blocksize - 1 ),
+ * which rounds the total up to a multiple of the block size.
+ * NOTE(review): this assumes blocksize is a power of two.
+ */
+ padding_len = ( ( blocksize - 1 ) & -( iv_len + len + mac_len + 1 ) );
+ *plaintext_len = ( iv_len + len + mac_len + padding_len + 1 );
+
+ /* Allocate block-ciphered struct */
+ plaintext = malloc ( *plaintext_len );
+ if ( ! plaintext )
+ return NULL;
+ iv = plaintext;
+ content = ( iv + iv_len );
+ mac = ( content + len );
+ padding = ( mac + mac_len );
+
+ /* Fill in block-ciphered struct.  The final memset writes the
+ * padding bytes and the trailing padding-length byte, all with
+ * the value padding_len.
+ */
+ memset ( iv, 0, iv_len );
+ memcpy ( content, data, len );
+ memcpy ( mac, digest, mac_len );
+ memset ( padding, padding_len, ( padding_len + 1 ) );
+
+ return plaintext;
+}
+
+/**
+ * Send plaintext record
+ *
+ * @v tls TLS session
+ * @v type Record type
+ * @v data Plaintext record
+ * @v len Length of plaintext record
+ * @ret rc Return status code
+ *
+ * Computes the MAC over the record, assembles data and MAC (plus
+ * padding for block ciphers) into a temporary plaintext buffer,
+ * encrypts it into an I/O buffer behind a TLS header, and delivers
+ * that buffer to the ciphertext stream.  The TX sequence number and
+ * the live cipher context are updated only after successful delivery.
+ */
+static int tls_send_plaintext ( struct tls_session *tls, unsigned int type,
+ const void *data, size_t len ) {
+ struct tls_header plaintext_tlshdr;
+ struct tls_header *tlshdr;
+ struct tls_cipherspec *cipherspec = &tls->tx_cipherspec;
+ void *plaintext = NULL;
+ size_t plaintext_len;
+ struct io_buffer *ciphertext = NULL;
+ size_t ciphertext_len;
+ size_t mac_len = cipherspec->digest->digestsize;
+ uint8_t mac[mac_len];
+ int rc;
+
+ /* Construct header (used for the MAC only; its length field is
+ * the unpadded data length)
+ */
+ plaintext_tlshdr.type = type;
+ plaintext_tlshdr.version = htons ( TLS_VERSION_TLS_1_0 );
+ plaintext_tlshdr.length = htons ( len );
+
+ /* Calculate MAC */
+ tls_hmac ( tls, cipherspec, tls->tx_seq, &plaintext_tlshdr,
+ data, len, mac );
+
+ /* Allocate and assemble plaintext struct */
+ if ( is_stream_cipher ( cipherspec->cipher ) ) {
+ plaintext = tls_assemble_stream ( tls, data, len, mac,
+ &plaintext_len );
+ } else {
+ plaintext = tls_assemble_block ( tls, data, len, mac,
+ &plaintext_len );
+ }
+ if ( ! plaintext ) {
+ DBGC ( tls, "TLS %p could not allocate %zd bytes for "
+ "plaintext\n", tls, plaintext_len );
+ rc = -ENOMEM;
+ goto done;
+ }
+
+ DBGC2 ( tls, "Sending plaintext data:\n" );
+ DBGC2_HD ( tls, plaintext, plaintext_len );
+
+ /* Allocate ciphertext */
+ ciphertext_len = ( sizeof ( *tlshdr ) + plaintext_len );
+ ciphertext = xfer_alloc_iob ( &tls->cipherstream.xfer,
+ ciphertext_len );
+ if ( ! ciphertext ) {
+ DBGC ( tls, "TLS %p could not allocate %zd bytes for "
+ "ciphertext\n", tls, ciphertext_len );
+ rc = -ENOMEM;
+ goto done;
+ }
+
+ /* Assemble ciphertext.  Encryption works on a copy of the
+ * cipher context (cipher_next_ctx), so the live context stays
+ * untouched if anything below fails.
+ */
+ tlshdr = iob_put ( ciphertext, sizeof ( *tlshdr ) );
+ tlshdr->type = type;
+ tlshdr->version = htons ( TLS_VERSION_TLS_1_0 );
+ tlshdr->length = htons ( plaintext_len );
+ memcpy ( cipherspec->cipher_next_ctx, cipherspec->cipher_ctx,
+ cipherspec->cipher->ctxsize );
+ if ( ( rc = cipher_encrypt ( cipherspec->cipher,
+ cipherspec->cipher_next_ctx, plaintext,
+ iob_put ( ciphertext, plaintext_len ),
+ plaintext_len ) ) != 0 ) {
+ DBGC ( tls, "TLS %p could not encrypt: %s\n",
+ tls, strerror ( rc ) );
+ DBGC_HD ( tls, plaintext, plaintext_len );
+ goto done;
+ }
+
+ /* Free plaintext as soon as possible to conserve memory */
+ free ( plaintext );
+ plaintext = NULL;
+
+ /* Send ciphertext.  The local pointer is cleared whatever the
+ * result, so the cleanup path never frees the buffer.
+ * NOTE(review): presumably xfer_deliver_iob() takes ownership
+ * of the I/O buffer - confirm.
+ */
+ rc = xfer_deliver_iob ( &tls->cipherstream.xfer, ciphertext );
+ ciphertext = NULL;
+ if ( rc != 0 ) {
+ DBGC ( tls, "TLS %p could not deliver ciphertext: %s\n",
+ tls, strerror ( rc ) );
+ goto done;
+ }
+
+ /* Update TX state machine to next record */
+ tls->tx_seq += 1;
+ memcpy ( tls->tx_cipherspec.cipher_ctx,
+ tls->tx_cipherspec.cipher_next_ctx,
+ tls->tx_cipherspec.cipher->ctxsize );
+
+ done:
+ /* Both pointers are NULL on the success path */
+ free ( plaintext );
+ free_iob ( ciphertext );
+ return rc;
+}
+
+/**
+ * Split stream-ciphered record into data and MAC portions
+ *
+ * @v tls		TLS session
+ * @v plaintext		Plaintext record
+ * @v plaintext_len	Length of record
+ * @ret data		Data
+ * @ret len		Length of data
+ * @ret digest		MAC digest
+ * @ret rc		Return status code
+ *
+ * The MAC occupies the final digest-size bytes of the record and the
+ * data is everything before it.  The returned pointers refer into
+ * @c plaintext; nothing is copied.  Records shorter than the MAC are
+ * rejected with -EINVAL.
+ */
+static int tls_split_stream ( struct tls_session *tls,
+			      void *plaintext, size_t plaintext_len,
+			      void **data, size_t *len, void **digest ) {
+	size_t digest_len = tls->rx_cipherspec.digest->digestsize;
+	size_t payload_len;
+
+	/* Record must at least hold the MAC */
+	if ( plaintext_len < digest_len ) {
+		DBGC ( tls, "TLS %p received underlength record\n", tls );
+		DBGC_HD ( tls, plaintext, plaintext_len );
+		return -EINVAL;
+	}
+	payload_len = ( plaintext_len - digest_len );
+
+	/* Content comes first, MAC follows directly after it */
+	*data = plaintext;
+	*len = payload_len;
+	*digest = ( plaintext + payload_len );
+
+	return 0;
+}
+
+/**
+ * Split block-ciphered record into data and MAC portions
+ *
+ * @v tls TLS session
+ * @v plaintext Plaintext record
+ * @v plaintext_len Length of record
+ * @ret data Data
+ * @ret len Length of data
+ * @ret digest MAC digest
+ * @ret rc Return status code
+ *
+ * The record is treated as: IV (currently zero-length), content, MAC,
+ * padding bytes, and a final padding-length byte.  The returned
+ * pointers refer into @c plaintext; nothing is copied.  Records that
+ * are too short, or whose padding bytes do not all equal the padding
+ * length, are rejected with -EINVAL.
+ */
+static int tls_split_block ( struct tls_session *tls,
+ void *plaintext, size_t plaintext_len,
+ void **data, size_t *len,
+ void **digest ) {
+ void *iv;
+ size_t iv_len;
+ void *content;
+ size_t content_len;
+ void *mac;
+ size_t mac_len;
+ void *padding;
+ size_t padding_len;
+ unsigned int i;
+
+ /* Decompose block-ciphered data */
+ if ( plaintext_len < 1 ) {
+ DBGC ( tls, "TLS %p received underlength record\n", tls );
+ DBGC_HD ( tls, plaintext, plaintext_len );
+ return -EINVAL;
+ }
+ iv_len = tls->rx_cipherspec.cipher->blocksize;
+
+ /* FIXME: TLSv1.1 uses an explicit IV */
+ iv_len = 0;
+
+ mac_len = tls->rx_cipherspec.digest->digestsize;
+ /* The last byte of the record holds the padding length */
+ padding_len = *( ( uint8_t * ) ( plaintext + plaintext_len - 1 ) );
+ if ( plaintext_len < ( iv_len + mac_len + padding_len + 1 ) ) {
+ DBGC ( tls, "TLS %p received underlength record\n", tls );
+ DBGC_HD ( tls, plaintext, plaintext_len );
+ return -EINVAL;
+ }
+ content_len = ( plaintext_len - iv_len - mac_len - padding_len - 1 );
+ iv = plaintext;
+ content = ( iv + iv_len );
+ mac = ( content + content_len );
+ padding = ( mac + mac_len );
+
+ /* Verify padding bytes (each must equal the padding length) */
+ for ( i = 0 ; i < padding_len ; i++ ) {
+ if ( *( ( uint8_t * ) ( padding + i ) ) != padding_len ) {
+ DBGC ( tls, "TLS %p received bad padding\n", tls );
+ DBGC_HD ( tls, plaintext, plaintext_len );
+ return -EINVAL;
+ }
+ }
+
+ /* Fill in return values */
+ *data = content;
+ *len = content_len;
+ *digest = mac;
+
+ return 0;
+}
+
+/**
+ * Receive new ciphertext record
+ *
+ * @v tls		TLS session
+ * @v tlshdr		Record header
+ * @v ciphertext	Ciphertext record
+ * @ret rc		Return status code
+ *
+ * Decrypts the record into a temporary buffer, splits it into content
+ * and MAC, verifies the MAC against the current RX sequence number,
+ * and passes the content on for processing by record type.  The
+ * temporary buffer is freed before returning on every path.
+ *
+ * A record that fails MAC verification is rejected with -EINVAL and
+ * is never passed on for processing.
+ */
+static int tls_new_ciphertext ( struct tls_session *tls,
+				struct tls_header *tlshdr, void *ciphertext ) {
+	struct tls_header plaintext_tlshdr;
+	struct tls_cipherspec *cipherspec = &tls->rx_cipherspec;
+	size_t record_len = ntohs ( tlshdr->length );
+	void *plaintext = NULL;
+	void *data;
+	size_t len;
+	void *mac;
+	size_t mac_len = cipherspec->digest->digestsize;
+	uint8_t verify_mac[mac_len];
+	int rc;
+
+	/* Allocate buffer for plaintext */
+	plaintext = malloc ( record_len );
+	if ( ! plaintext ) {
+		DBGC ( tls, "TLS %p could not allocate %zd bytes for "
+		       "decryption buffer\n", tls, record_len );
+		rc = -ENOMEM;
+		goto done;
+	}
+
+	/* Decrypt the record */
+	if ( ( rc = cipher_decrypt ( cipherspec->cipher,
+				     cipherspec->cipher_ctx, ciphertext,
+				     plaintext, record_len ) ) != 0 ) {
+		DBGC ( tls, "TLS %p could not decrypt: %s\n",
+		       tls, strerror ( rc ) );
+		DBGC_HD ( tls, ciphertext, record_len );
+		goto done;
+	}
+
+	/* Split record into content and MAC */
+	if ( is_stream_cipher ( cipherspec->cipher ) ) {
+		if ( ( rc = tls_split_stream ( tls, plaintext, record_len,
+					       &data, &len, &mac ) ) != 0 )
+			goto done;
+	} else {
+		if ( ( rc = tls_split_block ( tls, plaintext, record_len,
+					      &data, &len, &mac ) ) != 0 )
+			goto done;
+	}
+
+	/* Verify MAC.  The MAC covers a header whose length field is
+	 * the content length, not the on-the-wire record length.
+	 */
+	plaintext_tlshdr.type = tlshdr->type;
+	plaintext_tlshdr.version = tlshdr->version;
+	plaintext_tlshdr.length = htons ( len );
+	tls_hmac ( tls, cipherspec, tls->rx_seq, &plaintext_tlshdr,
+		   data, len, verify_mac);
+	if ( memcmp ( mac, verify_mac, mac_len ) != 0 ) {
+		DBGC ( tls, "TLS %p failed MAC verification\n", tls );
+		DBGC_HD ( tls, plaintext, record_len );
+		/* rc is still zero from the split above; it must be
+		 * set here or the bad record is reported as success.
+		 */
+		rc = -EINVAL;
+		goto done;
+	}
+
+	DBGC2 ( tls, "Received plaintext data:\n" );
+	DBGC2_HD ( tls, data, len );
+
+	/* Process plaintext record */
+	if ( ( rc = tls_new_record ( tls, tlshdr->type, data, len ) ) != 0 )
+		goto done;
+
+	rc = 0;
+ done:
+	free ( plaintext );
+	return rc;
+}
+
+/******************************************************************************
+ *
+ * Plaintext stream operations
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Close interface
+ *
+ * @v xfer Plainstream data transfer interface
+ * @v rc Reason for close
+ *
+ * Closes the whole TLS session that contains this plainstream
+ * interface.
+ */
+static void tls_plainstream_close ( struct xfer_interface *xfer, int rc ) {
+ /* Recover the session from its embedded plainstream interface */
+ struct tls_session *tls =
+ container_of ( xfer, struct tls_session, plainstream.xfer );
+
+ tls_close ( tls, rc );
+}
+
+/**
+ * Check flow control window
+ *
+ * @v xfer		Plainstream data transfer interface
+ * @ret len		Length of window
+ *
+ * The window is reported as zero unless the TX state machine is in
+ * the data state; otherwise the filter's window is returned.
+ */
+static size_t tls_plainstream_window ( struct xfer_interface *xfer ) {
+	struct tls_session *session =
+		container_of ( xfer, struct tls_session, plainstream.xfer );
+	int ready = ( session->tx_state == TLS_TX_DATA );
+
+	/* Block window unless we are ready to accept data */
+	return ( ready ? filter_window ( xfer ) : 0 );
+}
+
+/**
+ * Deliver datagram as raw data
+ *
+ * @v xfer Plainstream data transfer interface
+ * @v data Data buffer
+ * @v len Length of data buffer
+ * @ret rc Return status code
+ *
+ * Sends the data as a TLS application data record.  Returns -ENOTCONN
+ * if the TX state machine is not yet in the data state.
+ */
+static int tls_plainstream_deliver_raw ( struct xfer_interface *xfer,
+ const void *data, size_t len ) {
+ struct tls_session *tls =
+ container_of ( xfer, struct tls_session, plainstream.xfer );
+
+ /* Refuse unless we are ready to accept data */
+ if ( tls->tx_state != TLS_TX_DATA )
+ return -ENOTCONN;
+
+ return tls_send_plaintext ( tls, TLS_TYPE_DATA, data, len );
+}
+
+/** TLS plaintext stream operations
+ *
+ * I/O buffer deliveries are routed through xfer_deliver_as_raw, so
+ * data handling is implemented in tls_plainstream_deliver_raw().
+ */
+static struct xfer_interface_operations tls_plainstream_operations = {
+ .close = tls_plainstream_close,
+ .vredirect = ignore_xfer_vredirect,
+ .window = tls_plainstream_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = xfer_deliver_as_raw,
+ .deliver_raw = tls_plainstream_deliver_raw,
+};
+
+/******************************************************************************
+ *
+ * Ciphertext stream operations
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Close interface
+ *
+ * @v xfer Cipherstream data transfer interface
+ * @v rc Reason for close
+ *
+ * Closes the whole TLS session that contains this cipherstream
+ * interface.
+ */
+static void tls_cipherstream_close ( struct xfer_interface *xfer, int rc ) {
+ /* Recover the session from its embedded cipherstream interface */
+ struct tls_session *tls =
+ container_of ( xfer, struct tls_session, cipherstream.xfer );
+
+ tls_close ( tls, rc );
+}
+
+/**
+ * Handle received TLS header
+ *
+ * @v tls		TLS session
+ * @ret rc		Returned status code
+ *
+ * Called once a complete record header has been received.  Allocates
+ * the receive buffer for the record body, sized from the header's
+ * length field, and switches the receiver to the data state.
+ * Returns -ENOMEM if the buffer cannot be allocated.
+ */
+static int tls_newdata_process_header ( struct tls_session *tls ) {
+	size_t body_len = ntohs ( tls->rx_header.length );
+	void *body;
+
+	/* No receive buffer may be outstanding at this point */
+	assert ( tls->rx_data == NULL );
+
+	/* Allocate data buffer now that we know the length */
+	body = malloc ( body_len );
+	tls->rx_data = body;
+	if ( body == NULL ) {
+		DBGC ( tls, "TLS %p could not allocate %zd bytes "
+		       "for receive buffer\n", tls, body_len );
+		return -ENOMEM;
+	}
+
+	/* Move to data state */
+	tls->rx_state = TLS_RX_DATA;
+	return 0;
+}
+
+/**
+ * Handle received TLS data payload
+ *
+ * @v tls TLS session
+ * @ret rc Returned status code
+ *
+ * Called once a complete record body has been received.  On success
+ * the RX sequence number is incremented, the receive buffer is freed
+ * and the receiver returns to the header state.  On failure the
+ * function returns immediately, leaving the receive buffer and state
+ * untouched.
+ */
+static int tls_newdata_process_data ( struct tls_session *tls ) {
+ int rc;
+
+ /* Process record */
+ if ( ( rc = tls_new_ciphertext ( tls, &tls->rx_header,
+ tls->rx_data ) ) != 0 )
+ return rc;
+
+ /* Increment RX sequence number */
+ tls->rx_seq += 1;
+
+ /* Free data buffer */
+ free ( tls->rx_data );
+ tls->rx_data = NULL;
+
+ /* Return to header state */
+ tls->rx_state = TLS_RX_HEADER;
+
+ return 0;
+}
+
+/**
+ * Receive new ciphertext
+ *
+ * @v xfer Cipherstream data transfer interface
+ * @v data Data received
+ * @v len Length of received data
+ * @ret rc Return status code
+ *
+ * Reassembles TLS records from an arbitrarily fragmented byte stream.
+ * Incoming bytes are copied into either the header buffer or the
+ * record data buffer, depending on the RX state; each time the
+ * current buffer becomes full the matching handler is called.  If a
+ * handler fails, the TLS session is closed with that error.
+ */
+static int tls_cipherstream_deliver_raw ( struct xfer_interface *xfer,
+ const void *data, size_t len ) {
+ struct tls_session *tls =
+ container_of ( xfer, struct tls_session, cipherstream.xfer );
+ size_t frag_len;
+ void *buf;
+ size_t buf_len;
+ int ( * process ) ( struct tls_session *tls );
+ int rc;
+
+ while ( len ) {
+ /* Select buffer according to current state */
+ switch ( tls->rx_state ) {
+ case TLS_RX_HEADER:
+ buf = &tls->rx_header;
+ buf_len = sizeof ( tls->rx_header );
+ process = tls_newdata_process_header;
+ break;
+ case TLS_RX_DATA:
+ buf = tls->rx_data;
+ buf_len = ntohs ( tls->rx_header.length );
+ process = tls_newdata_process_data;
+ break;
+ default:
+ assert ( 0 );
+ return -EINVAL;
+ }
+
+ /* Copy data portion to buffer (no more than is needed to
+ * fill it, and no more than is available)
+ */
+ frag_len = ( buf_len - tls->rx_rcvd );
+ if ( frag_len > len )
+ frag_len = len;
+ memcpy ( ( buf + tls->rx_rcvd ), data, frag_len );
+ tls->rx_rcvd += frag_len;
+ data += frag_len;
+ len -= frag_len;
+
+ /* Process data if buffer is now full */
+ if ( tls->rx_rcvd == buf_len ) {
+ if ( ( rc = process ( tls ) ) != 0 ) {
+ tls_close ( tls, rc );
+ return rc;
+ }
+ /* Start filling the next buffer from its beginning */
+ tls->rx_rcvd = 0;
+ }
+ }
+
+ return 0;
+}
+
+/** TLS ciphertext stream operations
+ *
+ * I/O buffer deliveries are routed through xfer_deliver_as_raw, so
+ * data handling is implemented in tls_cipherstream_deliver_raw().
+ */
+static struct xfer_interface_operations tls_cipherstream_operations = {
+ .close = tls_cipherstream_close,
+ .vredirect = xfer_vopen,
+ .window = filter_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = xfer_deliver_as_raw,
+ .deliver_raw = tls_cipherstream_deliver_raw,
+};
+
+/******************************************************************************
+ *
+ * Controlling process
+ *
+ ******************************************************************************
+ */
+
+/**
+ * TLS TX state machine
+ *
+ * @v process		TLS process
+ *
+ * Performs at most one transmit action per invocation, and only when
+ * the cipherstream reports a non-zero window.  The transmit states
+ * progress as follows:
+ *
+ *   CLIENT_HELLO        -> NONE           (after sending Client Hello)
+ *   CLIENT_KEY_EXCHANGE -> CHANGE_CIPHER
+ *   CHANGE_CIPHER       -> FINISHED       (TX cipher activated and
+ *                                          TX sequence number reset)
+ *   FINISHED            -> NONE
+ *
+ * The NONE and DATA states require no action here.  Any failure
+ * closes the TLS session with the failing status code.
+ */
+static void tls_step ( struct process *process ) {
+	struct tls_session *tls =
+		container_of ( process, struct tls_session, process );
+	int rc;
+
+	/* Wait for cipherstream to become ready */
+	if ( ! xfer_window ( &tls->cipherstream.xfer ) )
+		return;
+
+	switch ( tls->tx_state ) {
+	case TLS_TX_NONE:
+		/* Nothing to do */
+		break;
+	case TLS_TX_CLIENT_HELLO:
+		/* Send Client Hello */
+		if ( ( rc = tls_send_client_hello ( tls ) ) != 0 ) {
+			DBGC ( tls, "TLS %p could not send Client Hello: %s\n",
+			       tls, strerror ( rc ) );
+			goto err;
+		}
+		tls->tx_state = TLS_TX_NONE;
+		break;
+	case TLS_TX_CLIENT_KEY_EXCHANGE:
+		/* Send Client Key Exchange */
+		if ( ( rc = tls_send_client_key_exchange ( tls ) ) != 0 ) {
+			DBGC ( tls, "TLS %p could not send Client Key "
+			       "Exchange: %s\n", tls, strerror ( rc ) );
+			goto err;
+		}
+		tls->tx_state = TLS_TX_CHANGE_CIPHER;
+		break;
+	case TLS_TX_CHANGE_CIPHER:
+		/* Send Change Cipher, and then change the cipher in use */
+		if ( ( rc = tls_send_change_cipher ( tls ) ) != 0 ) {
+			DBGC ( tls, "TLS %p could not send Change Cipher: "
+			       "%s\n", tls, strerror ( rc ) );
+			goto err;
+		}
+		if ( ( rc = tls_change_cipher ( tls,
+						&tls->tx_cipherspec_pending,
+						&tls->tx_cipherspec )) != 0 ){
+			DBGC ( tls, "TLS %p could not activate TX cipher: "
+			       "%s\n", tls, strerror ( rc ) );
+			goto err;
+		}
+		/* Sequence numbers restart with the new cipher */
+		tls->tx_seq = 0;
+		tls->tx_state = TLS_TX_FINISHED;
+		break;
+	case TLS_TX_FINISHED:
+		/* Send Finished */
+		if ( ( rc = tls_send_finished ( tls ) ) != 0 ) {
+			DBGC ( tls, "TLS %p could not send Finished: %s\n",
+			       tls, strerror ( rc ) );
+			goto err;
+		}
+		tls->tx_state = TLS_TX_NONE;
+		break;
+	case TLS_TX_DATA:
+		/* Nothing to do */
+		break;
+	default:
+		assert ( 0 );
+	}
+
+	return;
+
+ err:
+	tls_close ( tls, rc );
+}
+
+/******************************************************************************
+ *
+ * Instantiator
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Create a TLS session and attach it to a data transfer interface
+ *
+ * @v xfer Data transfer interface to plug into the plaintext stream
+ * @v next Filled in with the session's ciphertext stream interface
+ * @ret rc Return status code (0, or -ENOMEM on allocation failure)
+ *
+ * The new session starts with all four cipher specifications cleared,
+ * freshly generated client random and pre-master secret values, empty
+ * MD5 and SHA-1 handshake digests, and the TX state machine set to
+ * send the Client Hello.
+ */
+int add_tls ( struct xfer_interface *xfer, struct xfer_interface **next ) {
+ struct tls_session *tls;
+
+ /* Allocate and initialise TLS structure */
+ tls = malloc ( sizeof ( *tls ) );
+ if ( ! tls )
+ return -ENOMEM;
+ memset ( tls, 0, sizeof ( *tls ) );
+ tls->refcnt.free = free_tls;
+ filter_init ( &tls->plainstream, &tls_plainstream_operations,
+ &tls->cipherstream, &tls_cipherstream_operations,
+ &tls->refcnt );
+ tls_clear_cipher ( tls, &tls->tx_cipherspec );
+ tls_clear_cipher ( tls, &tls->tx_cipherspec_pending );
+ tls_clear_cipher ( tls, &tls->rx_cipherspec );
+ tls_clear_cipher ( tls, &tls->rx_cipherspec_pending );
+ /* Client random: a 4-byte time field (left as zero) followed by
+ * random bytes
+ */
+ *( ( uint32_t * ) tls->client_random ) = 0; /* GMT Unix time */
+ tls_generate_random ( ( tls->client_random + 4 ),
+ ( sizeof ( tls->client_random ) - 4 ) );
+ /* Pre-master secret: a 2-byte protocol version followed by
+ * random bytes
+ */
+ *( ( uint16_t * ) tls->pre_master_secret )
+ = htons ( TLS_VERSION_TLS_1_0 );
+ tls_generate_random ( ( tls->pre_master_secret + 2 ),
+ ( sizeof ( tls->pre_master_secret ) - 2 ) );
+ digest_init ( &md5_algorithm, tls->handshake_md5_ctx );
+ digest_init ( &sha1_algorithm, tls->handshake_sha1_ctx );
+ tls->tx_state = TLS_TX_CLIENT_HELLO;
+ process_init ( &tls->process, tls_step, &tls->refcnt );
+
+ /* Attach to parent interface, mortalise self, and return */
+ xfer_plug_plug ( &tls->plainstream.xfer, xfer );
+ *next = &tls->cipherstream.xfer;
+ ref_put ( &tls->refcnt );
+ return 0;
+}
+
diff --git a/gpxe/src/net/udp.c b/gpxe/src/net/udp.c
new file mode 100644
index 00000000..89a5b868
--- /dev/null
+++ b/gpxe/src/net/udp.c
@@ -0,0 +1,465 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <byteswap.h>
+#include <errno.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/iobuf.h>
+#include <gpxe/xfer.h>
+#include <gpxe/open.h>
+#include <gpxe/uri.h>
+#include <gpxe/udp.h>
+
+/** @file
+ *
+ * UDP protocol
+ */
+
+/**
+ * A UDP connection
+ *
+ */
+struct udp_connection {
+ /** Reference counter */
+ struct refcnt refcnt;
+ /** List of UDP connections */
+ struct list_head list;
+
+ /** Data transfer interface */
+ struct xfer_interface xfer;
+
+ /** Remote socket address */
+ struct sockaddr_tcpip peer;
+ /** Local port on which the connection receives packets
+ *
+ * Stored in network byte order.  A value of zero matches any
+ * destination port when demultiplexing received packets.
+ */
+ unsigned int local_port;
+};
+
+/**
+ * List of registered UDP connections
+ */
+static LIST_HEAD ( udp_conns );
+
+/* Forward declarations */
+static struct xfer_interface_operations udp_xfer_operations;
+struct tcpip_protocol udp_protocol;
+
+/**
+ * Bind UDP connection to local port
+ *
+ * @v udp		UDP connection
+ * @v port		Local port, in network byte order, or zero
+ * @ret rc		Return status code
+ *
+ * Records the local port for the UDP connection.  If no local port is
+ * specified, ports from 1024 upwards are tried in turn, continuing
+ * from wherever the previous automatic search stopped, and the first
+ * one not already in use is chosen.  Returns -EADDRINUSE if the
+ * requested port, or every candidate port, is already bound.
+ *
+ * The connection is not added to the connection list here; that is
+ * left to the caller.
+ */
+static int udp_bind ( struct udp_connection *udp, unsigned int port ) {
+	struct udp_connection *existing;
+	static uint16_t try_port = 1024;
+	unsigned int tries;
+
+	/* If no port specified, find the first available port.  Each
+	 * 16-bit value is visited at most once per call, so the search
+	 * wraps past 65535 back to 1024 rather than leaving try_port
+	 * stuck at zero, which would make every later search fail
+	 * without trying any port.
+	 */
+	if ( ! port ) {
+		for ( tries = 0 ; tries < 0x10000 ; tries++, try_port++ ) {
+			if ( try_port < 1024 )
+				continue;
+			if ( udp_bind ( udp, htons ( try_port ) ) == 0 )
+				return 0;
+		}
+		return -EADDRINUSE;
+	}
+
+	/* Attempt bind to local port */
+	list_for_each_entry ( existing, &udp_conns, list ) {
+		if ( existing->local_port == port ) {
+			DBGC ( udp, "UDP %p could not bind: port %d in use\n",
+			       udp, ntohs ( port ) );
+			return -EADDRINUSE;
+		}
+	}
+	udp->local_port = port;
+
+	/* Report the successful binding */
+	DBGC ( udp, "UDP %p bound to port %d\n", udp, ntohs ( port ) );
+
+	return 0;
+}
+
+/**
+ * Open a UDP connection
+ *
+ * @v xfer Data transfer interface
+ * @v peer Peer socket address, or NULL
+ * @v local Local socket address, or NULL
+ * @v promisc Socket is promiscuous
+ * @ret rc Return status code
+ *
+ * Allocates a zeroed connection, copies the peer address if one is
+ * given and, unless the connection is promiscuous, binds it to the
+ * port from @c local (or to an automatically chosen port when
+ * @c local is NULL).  On success the connection is plugged into
+ * @c xfer and added to the list of UDP connections.
+ */
+static int udp_open_common ( struct xfer_interface *xfer,
+ struct sockaddr *peer, struct sockaddr *local,
+ int promisc ) {
+ struct sockaddr_tcpip *st_peer = ( struct sockaddr_tcpip * ) peer;
+ struct sockaddr_tcpip *st_local = ( struct sockaddr_tcpip * ) local;
+ struct udp_connection *udp;
+ unsigned int bind_port;
+ int rc;
+
+ /* Allocate and initialise structure */
+ udp = zalloc ( sizeof ( *udp ) );
+ if ( ! udp )
+ return -ENOMEM;
+ DBGC ( udp, "UDP %p allocated\n", udp );
+ xfer_init ( &udp->xfer, &udp_xfer_operations, &udp->refcnt );
+ if ( st_peer )
+ memcpy ( &udp->peer, st_peer, sizeof ( udp->peer ) );
+
+ /* Bind to local port (promiscuous connections stay unbound) */
+ if ( ! promisc ) {
+ bind_port = ( st_local ? st_local->st_port : 0 );
+ if ( ( rc = udp_bind ( udp, bind_port ) ) != 0 )
+ goto err;
+ }
+
+ /* Attach parent interface, transfer reference to connection
+ * list and return
+ */
+ xfer_plug_plug ( &udp->xfer, xfer );
+ list_add ( &udp->list, &udp_conns );
+ return 0;
+
+ err:
+ /* Drop the reference on the unbound connection */
+ ref_put ( &udp->refcnt );
+ return rc;
+}
+
+/**
+ * Open a UDP connection
+ *
+ * @v xfer Data transfer interface
+ * @v peer Peer socket address
+ * @v local Local socket address, or NULL
+ * @ret rc Return status code
+ *
+ * Opens a non-promiscuous connection, which is bound to a local port.
+ */
+int udp_open ( struct xfer_interface *xfer, struct sockaddr *peer,
+ struct sockaddr *local ) {
+ return udp_open_common ( xfer, peer, local, 0 );
+}
+
+/**
+ * Open a promiscuous UDP connection
+ *
+ * @v xfer Data transfer interface
+ * @ret rc Return status code
+ *
+ * Promiscuous UDP connections are required in order to support the
+ * PXE API.  The connection is opened with no peer address and no
+ * local address, and is not bound to a local port.
+ */
+int udp_open_promisc ( struct xfer_interface *xfer ) {
+ return udp_open_common ( xfer, NULL, NULL, 1 );
+}
+
+/**
+ * Close a UDP connection
+ *
+ * @v udp UDP connection
+ * @v rc Reason for close
+ *
+ * Closes the data transfer interface, removes the connection from the
+ * list of UDP connections and drops the list's reference to it.
+ */
+static void udp_close ( struct udp_connection *udp, int rc ) {
+
+ /* Close data transfer interface */
+ xfer_nullify ( &udp->xfer );
+ xfer_close ( &udp->xfer, rc );
+
+ /* Remove from list of connections and drop list's reference */
+ list_del ( &udp->list );
+ ref_put ( &udp->refcnt );
+
+ DBGC ( udp, "UDP %p closed\n", udp );
+}
+
+/**
+ * Transmit data via a UDP connection to a specified address
+ *
+ * @v udp UDP connection
+ * @v iobuf I/O buffer
+ * @v src_port Source port, or 0 to use default
+ * @v dest Destination address, or NULL to use default
+ * @v netdev Network device, or NULL to use default
+ * @ret rc Return status code
+ *
+ * The default source port is the connection's local port and the
+ * default destination is the connection's peer address.  Port values
+ * are copied into the header unchanged, so they are expected to be in
+ * network byte order already.  If there is not enough headroom for
+ * the header, the I/O buffer is freed here and the error returned.
+ */
+static int udp_tx ( struct udp_connection *udp, struct io_buffer *iobuf,
+ unsigned int src_port, struct sockaddr_tcpip *dest,
+ struct net_device *netdev ) {
+ struct udp_header *udphdr;
+ size_t len;
+ int rc;
+
+ /* Check we can accommodate the header */
+ if ( ( rc = iob_ensure_headroom ( iobuf, UDP_MAX_HLEN ) ) != 0 ) {
+ free_iob ( iobuf );
+ return rc;
+ }
+
+ /* Fill in default values if not explicitly provided */
+ if ( ! src_port )
+ src_port = udp->local_port;
+ if ( ! dest )
+ dest = &udp->peer;
+
+ /* Add the UDP header.  The length and checksum cover the
+ * header plus payload; the checksum field is zeroed before the
+ * checksum is calculated.
+ */
+ udphdr = iob_push ( iobuf, sizeof ( *udphdr ) );
+ len = iob_len ( iobuf );
+ udphdr->dest = dest->st_port;
+ udphdr->src = src_port;
+ udphdr->len = htons ( len );
+ udphdr->chksum = 0;
+ udphdr->chksum = tcpip_chksum ( udphdr, len );
+
+ /* Dump debugging information */
+ DBGC ( udp, "UDP %p TX %d->%d len %d\n", udp,
+ ntohs ( udphdr->src ), ntohs ( udphdr->dest ),
+ ntohs ( udphdr->len ) );
+
+ /* Send it to the next layer for processing.
+ * NOTE(review): the checksum field is passed by address,
+ * presumably so the network layer can fold in its pseudo-header
+ * - confirm against tcpip_tx().
+ */
+ if ( ( rc = tcpip_tx ( iobuf, &udp_protocol, dest, netdev,
+ &udphdr->chksum ) ) != 0 ) {
+ DBGC ( udp, "UDP %p could not transmit packet: %s\n",
+ udp, strerror ( rc ) );
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * Identify UDP connection by local port number
+ *
+ * @v local_port	Local port (in network-endian order)
+ * @ret udp		UDP connection, or NULL
+ *
+ * Returns the first connection in the list whose local port either
+ * equals @c local_port or is zero (a zero local port matches any
+ * destination port).
+ */
+static struct udp_connection * udp_demux ( unsigned int local_port ) {
+	struct udp_connection *conn;
+
+	list_for_each_entry ( conn, &udp_conns, list ) {
+		/* Skip connections bound to some other specific port */
+		if ( ( conn->local_port != local_port ) &&
+		     ( conn->local_port != 0 ) )
+			continue;
+		return conn;
+	}
+
+	return NULL;
+}
+
+/**
+ * Process a received packet
+ *
+ * @v iobuf I/O buffer
+ * @v st_src Partially-filled source address
+ * @v st_dest Partially-filled destination address
+ * @v pshdr_csum Pseudo-header checksum
+ * @ret rc Return status code
+ *
+ * Validates the UDP header (packet length, length field and, when the
+ * checksum field is non-zero, the checksum), fills in the port fields
+ * of the source and destination addresses, strips the header and any
+ * bytes beyond the UDP length, and delivers the payload to the
+ * matching connection.  Returns -EINVAL for malformed packets and
+ * -ENOTCONN if no connection matches the destination port.
+ */
+static int udp_rx ( struct io_buffer *iobuf, struct sockaddr_tcpip *st_src,
+ struct sockaddr_tcpip *st_dest, uint16_t pshdr_csum ) {
+ struct udp_header *udphdr = iobuf->data;
+ struct udp_connection *udp;
+ struct xfer_metadata meta;
+ size_t ulen;
+ unsigned int csum;
+ int rc = 0;
+
+ /* Sanity check packet */
+ if ( iob_len ( iobuf ) < sizeof ( *udphdr ) ) {
+ DBG ( "UDP packet too short at %zd bytes (min %zd bytes)\n",
+ iob_len ( iobuf ), sizeof ( *udphdr ) );
+
+ rc = -EINVAL;
+ goto done;
+ }
+ ulen = ntohs ( udphdr->len );
+ if ( ulen < sizeof ( *udphdr ) ) {
+ DBG ( "UDP length too short at %zd bytes "
+ "(header is %zd bytes)\n", ulen, sizeof ( *udphdr ) );
+ rc = -EINVAL;
+ goto done;
+ }
+ if ( ulen > iob_len ( iobuf ) ) {
+ DBG ( "UDP length too long at %zd bytes (packet is %zd "
+ "bytes)\n", ulen, iob_len ( iobuf ) );
+ rc = -EINVAL;
+ goto done;
+ }
+ /* A zero checksum field skips checksum verification */
+ if ( udphdr->chksum ) {
+ csum = tcpip_continue_chksum ( pshdr_csum, iobuf->data, ulen );
+ if ( csum != 0 ) {
+ DBG ( "UDP checksum incorrect (is %04x including "
+ "checksum field, should be 0000)\n", csum );
+ rc = -EINVAL;
+ goto done;
+ }
+ }
+
+ /* Parse parameters from header and strip header (and any
+ * trailing bytes beyond the UDP length)
+ */
+ st_src->st_port = udphdr->src;
+ st_dest->st_port = udphdr->dest;
+ udp = udp_demux ( udphdr->dest );
+ iob_unput ( iobuf, ( iob_len ( iobuf ) - ulen ) );
+ iob_pull ( iobuf, sizeof ( *udphdr ) );
+
+ /* Dump debugging information */
+ DBGC ( udp, "UDP %p RX %d<-%d len %zd\n", udp,
+ ntohs ( udphdr->dest ), ntohs ( udphdr->src ), ulen );
+
+ /* Ignore if no matching connection found */
+ if ( ! udp ) {
+ DBG ( "No UDP connection listening on port %d\n",
+ ntohs ( udphdr->dest ) );
+ rc = -ENOTCONN;
+ goto done;
+ }
+
+ /* Pass data to application.  The local pointer is cleared
+ * afterwards so the cleanup below does not free the buffer.
+ * NOTE(review): presumably xfer_deliver_iob_meta() takes
+ * ownership of the I/O buffer - confirm.
+ */
+ memset ( &meta, 0, sizeof ( meta ) );
+ meta.src = ( struct sockaddr * ) st_src;
+ meta.dest = ( struct sockaddr * ) st_dest;
+ rc = xfer_deliver_iob_meta ( &udp->xfer, iobuf, &meta );
+ iobuf = NULL;
+
+ done:
+ free_iob ( iobuf );
+ return rc;
+}
+
+/** UDP protocol (received packets are handled by udp_rx()) */
+struct tcpip_protocol udp_protocol __tcpip_protocol = {
+ .name = "UDP",
+ .rx = udp_rx,
+ .tcpip_proto = IP_UDP,
+};
+
+/***************************************************************************
+ *
+ * Data transfer interface
+ *
+ ***************************************************************************
+ */
+
+/**
+ * Close interface
+ *
+ * @v xfer Data transfer interface
+ * @v rc Reason for close
+ *
+ * Closes the UDP connection that contains this interface.
+ */
+static void udp_xfer_close ( struct xfer_interface *xfer, int rc ) {
+ /* Recover the connection from its embedded interface */
+ struct udp_connection *udp =
+ container_of ( xfer, struct udp_connection, xfer );
+
+ /* Close connection */
+ udp_close ( udp, rc );
+}
+
+/**
+ * Allocate I/O buffer for UDP
+ *
+ * @v xfer		Data transfer interface
+ * @v len		Payload size
+ * @ret iobuf		I/O buffer, or NULL
+ *
+ * The buffer is allocated with UDP_MAX_HLEN extra bytes, which are
+ * reserved at the start so that headers can be prepended later.
+ */
+static struct io_buffer * udp_alloc_iob ( struct xfer_interface *xfer,
+					  size_t len ) {
+	struct udp_connection *udp =
+		container_of ( xfer, struct udp_connection, xfer );
+	struct io_buffer *buffer = alloc_iob ( UDP_MAX_HLEN + len );
+
+	if ( buffer ) {
+		/* Leave room in front of the payload for the headers */
+		iob_reserve ( buffer, UDP_MAX_HLEN );
+	} else {
+		DBGC ( udp, "UDP %p cannot allocate buffer of length %zd\n",
+		       udp, len );
+	}
+
+	return buffer;
+}
+
+/**
+ * Deliver datagram as I/O buffer
+ *
+ * @v xfer Data transfer interface
+ * @v iobuf Datagram I/O buffer
+ * @v meta Data transfer metadata, or NULL
+ * @ret rc Return status code (always 0)
+ *
+ * When metadata is supplied, it may override the source port, the
+ * destination address and the network device; otherwise udp_tx()
+ * applies the connection's defaults.  Transmission is best-effort:
+ * the result of udp_tx() is ignored.
+ */
+static int udp_xfer_deliver_iob ( struct xfer_interface *xfer,
+ struct io_buffer *iobuf,
+ struct xfer_metadata *meta ) {
+ struct udp_connection *udp =
+ container_of ( xfer, struct udp_connection, xfer );
+ struct sockaddr_tcpip *src;
+ struct sockaddr_tcpip *dest = NULL;
+ struct net_device *netdev = NULL;
+ unsigned int src_port = 0;
+
+ /* Apply xfer metadata */
+ if ( meta ) {
+ src = ( struct sockaddr_tcpip * ) meta->src;
+ if ( src )
+ src_port = src->st_port;
+ dest = ( struct sockaddr_tcpip * ) meta->dest;
+ netdev = meta->netdev;
+ }
+
+ /* Transmit data, if possible */
+ udp_tx ( udp, iobuf, src_port, dest, netdev );
+
+ return 0;
+}
+
+/** UDP data transfer interface operations
+ *
+ * Raw deliveries are routed through xfer_deliver_as_iob, so data
+ * handling is implemented in udp_xfer_deliver_iob().
+ */
+static struct xfer_interface_operations udp_xfer_operations = {
+ .close = udp_xfer_close,
+ .vredirect = ignore_xfer_vredirect,
+ .window = unlimited_xfer_window,
+ .alloc_iob = udp_alloc_iob,
+ .deliver_iob = udp_xfer_deliver_iob,
+ .deliver_raw = xfer_deliver_as_iob,
+};
+
+/***************************************************************************
+ *
+ * Openers
+ *
+ ***************************************************************************
+ */
+
+/** UDP socket opener (datagram sockets in the AF_INET family) */
+struct socket_opener udp_socket_opener __socket_opener = {
+ .semantics = SOCK_DGRAM,
+ .family = AF_INET,
+ .open = udp_open,
+};
+
+/* NOTE(review): this one-byte object is not referenced anywhere in
+ * this file; presumably it exists only to provide a symbol that the
+ * SOCK_DGRAM definition refers to - confirm against the header that
+ * defines SOCK_DGRAM.
+ */
+char UDP_SOCK_DGRAM[1];
+
+/**
+ * Open UDP URI
+ *
+ * @v xfer		Data transfer interface
+ * @v uri		URI
+ * @ret rc		Return status code
+ *
+ * The URI must name a host (-EINVAL otherwise).  The peer port is
+ * taken from the URI, with zero as the default, and the connection is
+ * opened as a named datagram socket to the URI's host.
+ */
+static int udp_open_uri ( struct xfer_interface *xfer, struct uri *uri ) {
+	struct sockaddr_tcpip remote;
+	const char *hostname = uri->host;
+
+	/* Sanity check */
+	if ( hostname == NULL )
+		return -EINVAL;
+
+	/* Only the port is filled in here; the rest is left zeroed */
+	memset ( &remote, 0, sizeof ( remote ) );
+	remote.st_port = htons ( uri_port ( uri, 0 ) );
+
+	return xfer_open_named_socket ( xfer, SOCK_DGRAM,
+					( struct sockaddr * ) &remote,
+					hostname, NULL );
+}
+
+/** UDP URI opener (handles the "udp" URI scheme via udp_open_uri()) */
+struct uri_opener udp_uri_opener __uri_opener = {
+ .scheme = "udp",
+ .open = udp_open_uri,
+};
diff --git a/gpxe/src/net/udp/dhcp.c b/gpxe/src/net/udp/dhcp.c
new file mode 100644
index 00000000..3961c61d
--- /dev/null
+++ b/gpxe/src/net/udp/dhcp.c
@@ -0,0 +1,825 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <assert.h>
+#include <byteswap.h>
+#include <gpxe/if_ether.h>
+#include <gpxe/netdevice.h>
+#include <gpxe/device.h>
+#include <gpxe/xfer.h>
+#include <gpxe/open.h>
+#include <gpxe/job.h>
+#include <gpxe/retry.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/ip.h>
+#include <gpxe/uuid.h>
+#include <gpxe/dhcp.h>
+#include <gpxe/timer.h>
+#include <gpxe/settings.h>
+#include <gpxe/dhcp.h>
+#include <gpxe/dhcpopts.h>
+#include <gpxe/dhcppkt.h>
+
+/** @file
+ *
+ * Dynamic Host Configuration Protocol
+ *
+ */
+
+/**
+ * DHCP operation types
+ *
+ * This table maps from DHCP message types (i.e. values of the @c
+ * DHCP_MESSAGE_TYPE option) to values of the "op" field within a DHCP
+ * packet.
+ *
+ * The table is indexed directly by message type. Its size is fixed
+ * by the highest message type listed here, so it must not be indexed
+ * with a larger value; indices that are not listed are implicitly
+ * zero.
+ */
+static const uint8_t dhcp_op[] = {
+ [DHCPDISCOVER] = BOOTP_REQUEST,
+ [DHCPOFFER] = BOOTP_REPLY,
+ [DHCPREQUEST] = BOOTP_REQUEST,
+ [DHCPDECLINE] = BOOTP_REQUEST,
+ [DHCPACK] = BOOTP_REPLY,
+ [DHCPNAK] = BOOTP_REPLY,
+ [DHCPRELEASE] = BOOTP_REQUEST,
+ [DHCPINFORM] = BOOTP_REQUEST,
+};
+
+/** Raw option data for options common to all DHCP requests
+ *
+ * In order: the maximum DHCP message size (ETH_MAX_MTU), a
+ * "PXEClient:Arch:00000:UNDI:002001" vendor class identifier, a
+ * client architecture of 0, a client network device interface of
+ * UNDI v2.1, and the parameter request list naming the options we
+ * would like the server to return. The block is terminated with
+ * DHCP_END.
+ */
+static uint8_t dhcp_request_options_data[] = {
+ DHCP_MAX_MESSAGE_SIZE, DHCP_WORD ( ETH_MAX_MTU ),
+ DHCP_VENDOR_CLASS_ID,
+ DHCP_STRING ( 'P', 'X', 'E', 'C', 'l', 'i', 'e', 'n', 't', ':',
+ 'A', 'r', 'c', 'h', ':', '0', '0', '0', '0', '0', ':',
+ 'U', 'N', 'D', 'I', ':', '0', '0', '2', '0', '0', '1' ),
+ DHCP_CLIENT_ARCHITECTURE, DHCP_WORD ( 0 ),
+ DHCP_CLIENT_NDI, DHCP_OPTION ( 1 /* UNDI */ , 2, 1 /* v2.1 */ ),
+ DHCP_PARAMETER_REQUEST_LIST,
+ DHCP_OPTION ( DHCP_SUBNET_MASK, DHCP_ROUTERS, DHCP_DNS_SERVERS,
+ DHCP_LOG_SERVERS, DHCP_HOST_NAME, DHCP_DOMAIN_NAME,
+ DHCP_ROOT_PATH, DHCP_VENDOR_ENCAP, DHCP_VENDOR_CLASS_ID,
+ DHCP_TFTP_SERVER_NAME, DHCP_BOOTFILE_NAME,
+ DHCP_EB_ENCAP, DHCP_ISCSI_INITIATOR_IQN ),
+ DHCP_END
+};
+
+/** Options common to all DHCP requests
+ *
+ * Option block descriptor wrapping dhcp_request_options_data. Both
+ * the current and the maximum length are set to the size of that
+ * array.
+ */
+static struct dhcp_options dhcp_request_options = {
+ .data = dhcp_request_options_data,
+ .max_len = sizeof ( dhcp_request_options_data ),
+ .len = sizeof ( dhcp_request_options_data ),
+};
+
+/** DHCP feature codes
+ *
+ * Zero-length start and end markers of the dhcp_features table. The
+ * feature list sent in requests is the run of bytes between these two
+ * symbols; its length is computed as
+ * ( dhcp_features_end - dhcp_features ).
+ */
+static uint8_t dhcp_features[0] __table_start ( uint8_t, dhcp_features );
+static uint8_t dhcp_features_end[0] __table_end ( uint8_t, dhcp_features );
+
+/**
+ * Name a DHCP packet type
+ *
+ * @v msgtype DHCP message type
+ * @ret string DHCP message type name
+ *
+ * Returns a string literal. A message type of zero is reported as
+ * "BOOTP"; any value not listed below yields "DHCP<invalid>".
+ */
+static inline const char * dhcp_msgtype_name ( unsigned int msgtype ) {
+ switch ( msgtype ) {
+ case 0: return "BOOTP"; /* Non-DHCP packet */
+ case DHCPDISCOVER: return "DHCPDISCOVER";
+ case DHCPOFFER: return "DHCPOFFER";
+ case DHCPREQUEST: return "DHCPREQUEST";
+ case DHCPDECLINE: return "DHCPDECLINE";
+ case DHCPACK: return "DHCPACK";
+ case DHCPNAK: return "DHCPNAK";
+ case DHCPRELEASE: return "DHCPRELEASE";
+ case DHCPINFORM: return "DHCPINFORM";
+ default: return "DHCP<invalid>";
+ }
+}
+
+/**
+ * Calculate DHCP transaction ID for a network device
+ *
+ * @v netdev Network device
+ * @ret xid DHCP XID
+ *
+ * Extract the least significant bits of the hardware address for use
+ * as the transaction ID.
+ *
+ * The final sizeof ( xid ) bytes of the link-layer address are copied
+ * verbatim, with no byte-order conversion. The same value is
+ * therefore produced for every packet sent on a given device.
+ *
+ * NOTE(review): assumes ll_addr_len >= sizeof ( xid ); with a shorter
+ * address the copy would start before ll_addr -- confirm for all
+ * link layers.
+ */
+static uint32_t dhcp_xid ( struct net_device *netdev ) {
+ uint32_t xid;
+
+ memcpy ( &xid, ( netdev->ll_addr + netdev->ll_protocol->ll_addr_len
+ - sizeof ( xid ) ), sizeof ( xid ) );
+ return xid;
+}
+
+/**
+ * Create a DHCP packet
+ *
+ * @v dhcppkt		DHCP packet structure to fill in
+ * @v netdev		Network device
+ * @v msgtype		DHCP message type
+ * @v options		Initial options to include (or NULL)
+ * @v data		Buffer for DHCP packet
+ * @v max_len		Size of DHCP packet buffer
+ * @ret rc		Return status code
+ *
+ * Creates a DHCP packet in the specified buffer, and fills out a @c
+ * dhcp_packet structure that can be passed to
+ * set_dhcp_packet_option() or copy_dhcp_packet_options().
+ *
+ * Returns -EINVAL if @c msgtype has no entry in the dhcp_op table,
+ * and -ENOSPC if the buffer cannot hold the fixed header plus the
+ * initial options.  The whole buffer is zeroed before being filled
+ * in.
+ */
+int create_dhcp_packet ( struct dhcp_packet *dhcppkt,
+			 struct net_device *netdev, uint8_t msgtype,
+			 struct dhcp_options *options,
+			 void *data, size_t max_len ) {
+	struct dhcphdr *dhcphdr = data;
+	size_t options_len;
+	unsigned int hlen;
+	int rc;
+
+	/* Sanity checks.  The message type indexes dhcp_op[], so it
+	 * must lie within that table.
+	 */
+	if ( msgtype >= ( sizeof ( dhcp_op ) / sizeof ( dhcp_op[0] ) ) )
+		return -EINVAL;
+	options_len = ( options ? options->len : 0 );
+	if ( max_len < ( sizeof ( *dhcphdr ) + options_len ) )
+		return -ENOSPC;
+
+	/* Initialise DHCP packet content */
+	memset ( dhcphdr, 0, max_len );
+	dhcphdr->xid = dhcp_xid ( netdev );
+	dhcphdr->magic = htonl ( DHCP_MAGIC_COOKIE );
+	dhcphdr->htype = ntohs ( netdev->ll_protocol->ll_proto );
+	dhcphdr->op = dhcp_op[msgtype];
+	/* If hardware length exceeds the chaddr field length, don't
+	 * use the chaddr field.  This is as per RFC4390.
+	 */
+	hlen = netdev->ll_protocol->ll_addr_len;
+	if ( hlen > sizeof ( dhcphdr->chaddr ) ) {
+		hlen = 0;
+		dhcphdr->flags = htons ( BOOTP_FL_BROADCAST );
+	}
+	dhcphdr->hlen = hlen;
+	memcpy ( dhcphdr->chaddr, netdev->ll_addr, hlen );
+	/* The initial options are optional; only dereference the
+	 * option block if one was actually supplied.
+	 */
+	if ( options )
+		memcpy ( dhcphdr->options, options->data, options_len );
+
+	/* Initialise DHCP packet structure */
+	memset ( dhcppkt, 0, sizeof ( *dhcppkt ) );
+	dhcppkt_init ( dhcppkt, data, max_len );
+
+	/* Set DHCP_MESSAGE_TYPE option */
+	if ( ( rc = dhcppkt_store ( dhcppkt, DHCP_MESSAGE_TYPE,
+				    &msgtype, sizeof ( msgtype ) ) ) != 0 )
+		return rc;
+
+	return 0;
+}
+
+/** DHCP network device descriptor
+ *
+ * Payload of the DHCP_EB_BUS_ID option built in
+ * create_dhcp_request(); the vendor and device IDs are stored after
+ * conversion with htons().
+ */
+struct dhcp_netdev_desc {
+ /** Bus type ID */
+ uint8_t type;
+ /** Vendor ID */
+ uint16_t vendor;
+ /** Device ID */
+ uint16_t device;
+} __attribute__ (( packed ));
+
+/** DHCP client identifier
+ *
+ * Payload of the DHCP_CLIENT_ID option built in
+ * create_dhcp_request(). Only the protocol byte plus the first
+ * ll_addr_len bytes of the address are actually stored in the packet.
+ */
+struct dhcp_client_id {
+ /** Link-layer protocol */
+ uint8_t ll_proto;
+ /** Link-layer address */
+ uint8_t ll_addr[MAX_LL_ADDR_LEN];
+} __attribute__ (( packed ));
+
+/** DHCP client UUID
+ *
+ * Payload of the DHCP_CLIENT_UUID option built in
+ * create_dhcp_request(); the type byte is always set to
+ * DHCP_CLIENT_UUID_TYPE there.
+ */
+struct dhcp_client_uuid {
+ /** Identifier type */
+ uint8_t type;
+ /** UUID */
+ union uuid uuid;
+} __attribute__ (( packed ));
+
+#define DHCP_CLIENT_UUID_TYPE 0
+
+/**
+ * Create DHCP request packet
+ *
+ * @v dhcppkt DHCP packet structure to fill in
+ * @v netdev Network device
+ * @v dhcpoffer DHCPOFFER packet received from server
+ * @v data Buffer for DHCP packet
+ * @v max_len Size of DHCP packet buffer
+ * @ret rc Return status code
+ *
+ * If @c dhcpoffer is NULL a DHCPDISCOVER is built; otherwise a
+ * DHCPREQUEST is built and the server identifier and offered address
+ * are copied across from the offer (-EINVAL if the offer lacks
+ * either). In both cases the common request options, the feature
+ * list, the bus ID, the client identifier and (if get_uuid()
+ * succeeds) the client UUID are added.
+ */
+int create_dhcp_request ( struct dhcp_packet *dhcppkt,
+ struct net_device *netdev,
+ struct dhcp_packet *dhcpoffer,
+ void *data, size_t max_len ) {
+ struct device_description *desc = &netdev->dev->desc;
+ struct dhcp_netdev_desc dhcp_desc;
+ struct dhcp_client_id client_id;
+ struct dhcp_client_uuid client_uuid;
+ unsigned int msgtype;
+ size_t dhcp_features_len;
+ size_t ll_addr_len;
+ int rc;
+
+ /* Create DHCP packet */
+ msgtype = ( dhcpoffer ? DHCPREQUEST : DHCPDISCOVER );
+ if ( ( rc = create_dhcp_packet ( dhcppkt, netdev, msgtype,
+ &dhcp_request_options, data,
+ max_len ) ) != 0 ) {
+ DBG ( "DHCP could not create DHCP packet: %s\n",
+ strerror ( rc ) );
+ return rc;
+ }
+
+ /* Copy any required options from previous server response */
+ if ( dhcpoffer ) {
+ struct in_addr server_id;
+ struct in_addr requested_ip;
+
+ if ( dhcppkt_fetch ( dhcpoffer, DHCP_SERVER_IDENTIFIER,
+ &server_id, sizeof ( server_id ) )
+ != sizeof ( server_id ) ) {
+ DBG ( "DHCP offer missing server identifier\n" );
+ return -EINVAL;
+ }
+ if ( dhcppkt_fetch ( dhcpoffer, DHCP_EB_YIADDR,
+ &requested_ip, sizeof ( requested_ip ) )
+ != sizeof ( requested_ip ) ) {
+ DBG ( "DHCP offer missing IP address\n" );
+ return -EINVAL;
+ }
+ if ( ( rc = dhcppkt_store ( dhcppkt, DHCP_SERVER_IDENTIFIER,
+ &server_id,
+ sizeof ( server_id ) ) ) != 0 ) {
+ DBG ( "DHCP could not set server identifier: %s\n ",
+ strerror ( rc ) );
+ return rc;
+ }
+ if ( ( rc = dhcppkt_store ( dhcppkt, DHCP_REQUESTED_ADDRESS,
+ &requested_ip,
+ sizeof ( requested_ip ) ) ) != 0 ){
+ DBG ( "DHCP could not set requested address: %s\n",
+ strerror ( rc ) );
+ return rc;
+ }
+ }
+
+ /* Add options to identify the feature list */
+ dhcp_features_len = ( dhcp_features_end - dhcp_features );
+ if ( ( rc = dhcppkt_store ( dhcppkt, DHCP_EB_ENCAP, dhcp_features,
+ dhcp_features_len ) ) != 0 ) {
+ DBG ( "DHCP could not set features list option: %s\n",
+ strerror ( rc ) );
+ return rc;
+ }
+
+ /* Add options to identify the network device */
+ dhcp_desc.type = desc->bus_type;
+ dhcp_desc.vendor = htons ( desc->vendor );
+ dhcp_desc.device = htons ( desc->device );
+ if ( ( rc = dhcppkt_store ( dhcppkt, DHCP_EB_BUS_ID, &dhcp_desc,
+ sizeof ( dhcp_desc ) ) ) != 0 ) {
+ DBG ( "DHCP could not set bus ID option: %s\n",
+ strerror ( rc ) );
+ return rc;
+ }
+
+ /* Add DHCP client identifier. Required for Infiniband, and
+ * doesn't hurt other link layers. The option length is the
+ * one-byte protocol field plus the link-layer address itself.
+ */
+ client_id.ll_proto = ntohs ( netdev->ll_protocol->ll_proto );
+ ll_addr_len = netdev->ll_protocol->ll_addr_len;
+ assert ( ll_addr_len <= sizeof ( client_id.ll_addr ) );
+ memcpy ( client_id.ll_addr, netdev->ll_addr, ll_addr_len );
+ if ( ( rc = dhcppkt_store ( dhcppkt, DHCP_CLIENT_ID, &client_id,
+ ( ll_addr_len + 1 ) ) ) != 0 ) {
+ DBG ( "DHCP could not set client ID: %s\n",
+ strerror ( rc ) );
+ return rc;
+ }
+
+ /* Add client UUID, if we have one. Required for PXE. A
+ * failure of get_uuid() is not treated as an error; the
+ * option is simply omitted.
+ */
+ client_uuid.type = DHCP_CLIENT_UUID_TYPE;
+ if ( ( rc = get_uuid ( &client_uuid.uuid ) ) == 0 ) {
+ if ( ( rc = dhcppkt_store ( dhcppkt, DHCP_CLIENT_UUID,
+ &client_uuid,
+ sizeof ( client_uuid ) ) ) != 0 ) {
+ DBG ( "DHCP could not set client UUID: %s\n",
+ strerror ( rc ) );
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+/****************************************************************************
+ *
+ * DHCP settings
+ *
+ */
+
+/** A DHCP settings block
+ *
+ * Wraps a received DHCP packet so that its options can be accessed
+ * through the settings interface. dhcpset_create() places the copy
+ * of the raw packet immediately after this structure, within the
+ * same allocation.
+ */
+struct dhcp_settings {
+ /** Reference counter */
+ struct refcnt refcnt;
+ /** DHCP packet */
+ struct dhcp_packet dhcppkt;
+ /** Setting interface */
+ struct settings settings;
+};
+
+/**
+ * Decrement reference count on DHCP settings block
+ *
+ * @v dhcpset DHCP settings block
+ *
+ * NOTE(review): callers in this file pass NULL (e.g. dhcp_free() when
+ * no response was ever stored). That relies on refcnt being the
+ * first member and on ref_put() tolerating a NULL pointer -- confirm.
+ */
+static inline void dhcpset_put ( struct dhcp_settings *dhcpset ) {
+ ref_put ( &dhcpset->refcnt );
+}
+
+/**
+ * Store value of DHCP setting
+ *
+ * @v settings Settings block
+ * @v setting Setting to store
+ * @v data Setting data, or NULL to clear setting
+ * @v len Length of setting data
+ * @ret rc Return status code
+ *
+ * Stores the value as the DHCP option identified by the setting's
+ * tag, within the DHCP packet held by the enclosing dhcp_settings
+ * block.
+ */
+static int dhcpset_store ( struct settings *settings, struct setting *setting,
+ const void *data, size_t len ) {
+ struct dhcp_settings *dhcpset =
+ container_of ( settings, struct dhcp_settings, settings );
+
+ return dhcppkt_store ( &dhcpset->dhcppkt, setting->tag, data, len );
+}
+
+/**
+ * Fetch value of DHCP setting
+ *
+ * @v settings Settings block
+ * @v setting Setting to fetch
+ * @v data Buffer to fill with setting data
+ * @v len Length of buffer
+ * @ret len Length of setting data, or negative error
+ *
+ * Fetches the DHCP option identified by the setting's tag from the
+ * DHCP packet held by the enclosing dhcp_settings block. The
+ * settings pointer is converted with container_of() and used
+ * unconditionally, so it must not be NULL.
+ */
+static int dhcpset_fetch ( struct settings *settings, struct setting *setting,
+ void *data, size_t len ) {
+ struct dhcp_settings *dhcpset =
+ container_of ( settings, struct dhcp_settings, settings );
+
+ return dhcppkt_fetch ( &dhcpset->dhcppkt, setting->tag, data, len );
+}
+
+/** DHCP settings operations
+ *
+ * Routes settings store/fetch requests to dhcpset_store() and
+ * dhcpset_fetch().
+ */
+static struct settings_operations dhcpset_settings_operations = {
+ .store = dhcpset_store,
+ .fetch = dhcpset_fetch,
+};
+
+/**
+ * Create DHCP setting block
+ *
+ * @v dhcphdr DHCP packet
+ * @v len Length of DHCP packet
+ * @ret dhcpset DHCP settings block, or NULL on allocation failure
+ *
+ * Allocates the settings block and a private copy of the packet in a
+ * single zeroed allocation, with the copy placed directly after the
+ * structure. The block is initialised under the name
+ * DHCP_SETTINGS_NAME.
+ */
+static struct dhcp_settings * dhcpset_create ( const struct dhcphdr *dhcphdr,
+ size_t len ) {
+ struct dhcp_settings *dhcpset;
+ void *data;
+
+ dhcpset = zalloc ( sizeof ( *dhcpset ) + len );
+ if ( dhcpset ) {
+ data = ( ( ( void * ) dhcpset ) + sizeof ( *dhcpset ) );
+ memcpy ( data, dhcphdr, len );
+ dhcppkt_init ( &dhcpset->dhcppkt, data, len );
+ settings_init ( &dhcpset->settings,
+ &dhcpset_settings_operations, &dhcpset->refcnt,
+ DHCP_SETTINGS_NAME );
+ }
+ return dhcpset;
+}
+
+/****************************************************************************
+ *
+ * DHCP to UDP interface
+ *
+ */
+
+/** A DHCP session
+ *
+ * Tracks one DHCP exchange on one network device, from the first
+ * DHCPDISCOVER until the settings are registered or the session
+ * fails.
+ */
+struct dhcp_session {
+ /** Reference counter */
+ struct refcnt refcnt;
+ /** Job control interface */
+ struct job_interface job;
+ /** Data transfer interface */
+ struct xfer_interface xfer;
+
+ /** Network device being configured */
+ struct net_device *netdev;
+
+ /** State of the session
+ *
+ * This is a value for the @c DHCP_MESSAGE_TYPE option
+ * (e.g. @c DHCPDISCOVER). Within this file it is only ever
+ * set to @c DHCPDISCOVER or @c DHCPREQUEST.
+ */
+ int state;
+ /** Response obtained from DHCP server */
+ struct dhcp_settings *response;
+ /** Response obtained from ProxyDHCP server */
+ struct dhcp_settings *proxy_response;
+ /** Retransmission timer */
+ struct retry_timer timer;
+ /** Session start time (in ticks) */
+ unsigned long start;
+};
+
+/**
+ * Free DHCP session
+ *
+ * @v refcnt Reference counter
+ *
+ * Installed as the session's refcnt.free handler by start_dhcp().
+ * Drops the references held on the network device and on any stored
+ * DHCP and ProxyDHCP responses, then frees the session itself.
+ */
+static void dhcp_free ( struct refcnt *refcnt ) {
+ struct dhcp_session *dhcp =
+ container_of ( refcnt, struct dhcp_session, refcnt );
+
+ netdev_put ( dhcp->netdev );
+ dhcpset_put ( dhcp->response );
+ dhcpset_put ( dhcp->proxy_response );
+ free ( dhcp );
+}
+
+/**
+ * Mark DHCP session as complete
+ *
+ * @v dhcp DHCP session
+ * @v rc Return status code
+ *
+ * Both interfaces are nullified before anything else is done, the
+ * retry timer is stopped, and only then is the data transfer
+ * interface closed and the job reported as done with @c rc.
+ */
+static void dhcp_finished ( struct dhcp_session *dhcp, int rc ) {
+
+ /* Block further incoming messages */
+ job_nullify ( &dhcp->job );
+ xfer_nullify ( &dhcp->xfer );
+
+ /* Stop retry timer */
+ stop_timer ( &dhcp->timer );
+
+ /* Free resources and close interfaces */
+ xfer_close ( &dhcp->xfer, rc );
+ job_done ( &dhcp->job, rc );
+}
+
+/**
+ * Register options received via DHCP
+ *
+ * @v dhcp DHCP session
+ * @ret rc Return status code
+ *
+ * Any ProxyDHCP response is renamed to PROXYDHCP_SETTINGS_NAME and
+ * registered with a NULL parent; the DHCP response is registered as
+ * a child of the network device's settings block. In both cases an
+ * existing block of the same name is unregistered first.
+ *
+ * NOTE(review): if registering the DHCP settings fails, ProxyDHCP
+ * settings registered just before are left registered.
+ */
+static int dhcp_register_settings ( struct dhcp_session *dhcp ) {
+ struct settings *old_settings;
+ struct settings *settings;
+ struct settings *parent;
+ int rc;
+
+ /* Register ProxyDHCP settings, if present */
+ if ( dhcp->proxy_response ) {
+ settings = &dhcp->proxy_response->settings;
+ settings->name = PROXYDHCP_SETTINGS_NAME;
+ old_settings = find_settings ( settings->name );
+ if ( old_settings )
+ unregister_settings ( old_settings );
+ if ( ( rc = register_settings ( settings, NULL ) ) != 0 )
+ return rc;
+ }
+
+ /* Register DHCP settings */
+ parent = netdev_settings ( dhcp->netdev );
+ settings = &dhcp->response->settings;
+ old_settings = find_child_settings ( parent, settings->name );
+ if ( old_settings )
+ unregister_settings ( old_settings );
+ if ( ( rc = register_settings ( settings, parent ) ) != 0 )
+ return rc;
+
+ return 0;
+}
+
+/****************************************************************************
+ *
+ * Data transfer interface
+ *
+ */
+
+/**
+ * Transmit DHCP request
+ *
+ * @v dhcp DHCP session
+ * @ret rc Return status code
+ *
+ * Sends a DHCPDISCOVER or DHCPREQUEST according to the session
+ * state. In the DHCPREQUEST state the stored DHCP response is used
+ * as the offer from which the request is derived. The retry timer
+ * is (re)started before anything that can fail.
+ */
+static int dhcp_send_request ( struct dhcp_session *dhcp ) {
+ struct xfer_metadata meta = {
+ .netdev = dhcp->netdev,
+ };
+ struct io_buffer *iobuf;
+ struct dhcp_packet *dhcpoffer = NULL;
+ struct dhcp_packet dhcppkt;
+ int rc;
+
+ DBGC ( dhcp, "DHCP %p transmitting %s\n",
+ dhcp, dhcp_msgtype_name ( dhcp->state ) );
+
+ assert ( ( dhcp->state == DHCPDISCOVER ) ||
+ ( dhcp->state == DHCPREQUEST ) );
+
+ /* Start retry timer. Do this first so that failures to
+ * transmit will be retried.
+ */
+ start_timer ( &dhcp->timer );
+
+ /* Allocate buffer for packet */
+ iobuf = xfer_alloc_iob ( &dhcp->xfer, DHCP_MIN_LEN );
+ if ( ! iobuf )
+ return -ENOMEM;
+
+ /* Create DHCP packet directly within the I/O buffer */
+ if ( dhcp->state == DHCPREQUEST ) {
+ assert ( dhcp->response );
+ dhcpoffer = &dhcp->response->dhcppkt;
+ }
+ if ( ( rc = create_dhcp_request ( &dhcppkt, dhcp->netdev,
+ dhcpoffer, iobuf->data,
+ iob_tailroom ( iobuf ) ) ) != 0 ) {
+ DBGC ( dhcp, "DHCP %p could not construct DHCP request: %s\n",
+ dhcp, strerror ( rc ) );
+ goto done;
+ }
+
+ /* Transmit the packet. The local pointer is cleared after
+ * delivery so that the free_iob() below does not touch the
+ * buffer again (presumably ownership passes to the delivery
+ * call -- confirm).
+ */
+ iob_put ( iobuf, dhcppkt.len );
+ rc = xfer_deliver_iob_meta ( &dhcp->xfer, iobuf, &meta );
+ iobuf = NULL;
+ if ( rc != 0 ) {
+ DBGC ( dhcp, "DHCP %p could not transmit UDP packet: %s\n",
+ dhcp, strerror ( rc ) );
+ goto done;
+ }
+
+ done:
+ free_iob ( iobuf );
+ return rc;
+}
+
+/**
+ * Handle DHCP retry timer expiry
+ *
+ * @v timer DHCP retry timer
+ * @v fail Failure indicator
+ *
+ * If the timer reports failure the session is terminated with
+ * -ETIMEDOUT; otherwise the request for the current state is
+ * (re)transmitted. The return status of the transmission is
+ * ignored, since dhcp_send_request() has already restarted the
+ * timer.
+ */
+static void dhcp_timer_expired ( struct retry_timer *timer, int fail ) {
+ struct dhcp_session *dhcp =
+ container_of ( timer, struct dhcp_session, timer );
+
+ if ( fail ) {
+ dhcp_finished ( dhcp, -ETIMEDOUT );
+ } else {
+ dhcp_send_request ( dhcp );
+ }
+}
+
+/**
+ * Receive new data
+ *
+ * @v xfer		Data transfer interface
+ * @v data		Received data
+ * @v len		Length of received data
+ * @ret rc		Return status code
+ *
+ * Copies the received packet into a new DHCP settings block and
+ * discards it unless its transaction ID matches ours and its message
+ * type is the one expected in the current state (DHCPOFFER while
+ * discovering, DHCPACK while requesting).  A packet with a zero
+ * "yiaddr" is treated as a ProxyDHCP response and stored separately
+ * from the standard DHCP response.  A new response replaces a stored
+ * one only if its DHCP_EB_PRIORITY is equal or higher.
+ *
+ * Packets shorter than the fixed DHCP header are rejected with
+ * -EINVAL before any header field is examined.
+ */
+static int dhcp_deliver_raw ( struct xfer_interface *xfer,
+			      const void *data, size_t len ) {
+	struct dhcp_session *dhcp =
+		container_of ( xfer, struct dhcp_session, xfer );
+	struct dhcp_settings *response;
+	struct dhcp_settings **store_response;
+	struct dhcphdr *dhcphdr;
+	uint8_t msgtype = 0;
+	uint8_t priority = 0;
+	uint8_t existing_priority = 0;
+	unsigned long elapsed;
+	int is_proxy;
+	uint8_t ignore_proxy = 0;
+	int rc;
+
+	/* Sanity check: the header fields read below must actually
+	 * lie within the received data.
+	 */
+	if ( len < sizeof ( *dhcphdr ) ) {
+		DBGC ( dhcp, "DHCP %p received underlength packet length "
+		       "%zd\n", dhcp, len );
+		return -EINVAL;
+	}
+
+	/* Convert packet into a DHCP settings block */
+	response = dhcpset_create ( data, len );
+	if ( ! response ) {
+		DBGC ( dhcp, "DHCP %p could not store DHCP packet\n", dhcp );
+		return -ENOMEM;
+	}
+	dhcphdr = response->dhcppkt.dhcphdr;
+
+	/* Check for matching transaction ID */
+	if ( dhcphdr->xid != dhcp_xid ( dhcp->netdev ) ) {
+		DBGC ( dhcp, "DHCP %p wrong transaction ID (wanted %08lx, "
+		       "got %08lx)\n", dhcp,
+		       ntohl ( dhcp_xid ( dhcp->netdev ) ),
+		       ntohl ( dhcphdr->xid ) );
+		goto out_discard;
+	}
+
+	/* Determine and verify message type */
+	is_proxy = ( dhcphdr->yiaddr.s_addr == 0 );
+	dhcppkt_fetch ( &response->dhcppkt, DHCP_MESSAGE_TYPE, &msgtype,
+			sizeof ( msgtype ) );
+	DBGC ( dhcp, "DHCP %p received %s%s\n", dhcp,
+	       ( is_proxy ? "Proxy" : "" ), dhcp_msgtype_name ( msgtype ) );
+	if ( ( ( dhcp->state != DHCPDISCOVER ) || ( msgtype != DHCPOFFER ) ) &&
+	     ( ( dhcp->state != DHCPREQUEST ) || ( msgtype != DHCPACK ) ) ) {
+		DBGC ( dhcp, "DHCP %p discarding %s while in %s state\n",
+		       dhcp, dhcp_msgtype_name ( msgtype ),
+		       dhcp_msgtype_name ( dhcp->state ) );
+		goto out_discard;
+	}
+
+	/* Update stored standard/ProxyDHCP options, if the new
+	 * options have equal or higher priority than the
+	 * currently-stored options.
+	 */
+	store_response = ( is_proxy ? &dhcp->proxy_response : &dhcp->response);
+	if ( *store_response ) {
+		dhcppkt_fetch ( &(*store_response)->dhcppkt, DHCP_EB_PRIORITY,
+				&existing_priority,
+				sizeof ( existing_priority ) );
+	}
+	dhcppkt_fetch ( &response->dhcppkt, DHCP_EB_PRIORITY, &priority,
+			sizeof ( priority ) );
+	if ( priority >= existing_priority ) {
+		dhcpset_put ( *store_response );
+		*store_response = response;
+	} else {
+		dhcpset_put ( response );
+	}
+
+	/* If we don't yet have a standard DHCP response (i.e. one
+	 * with an IP address), then just leave the timer running.
+	 */
+	if ( ! dhcp->response )
+		goto out;
+
+	/* Handle DHCP response */
+	dhcppkt_fetch ( &dhcp->response->dhcppkt, DHCP_EB_NO_PROXYDHCP,
+			&ignore_proxy, sizeof ( ignore_proxy ) );
+	switch ( dhcp->state ) {
+	case DHCPDISCOVER:
+		/* If we have allowed sufficient time for ProxyDHCP
+		 * responses, then transition to making the DHCPREQUEST.
+		 */
+		elapsed = ( currticks() - dhcp->start );
+		if ( ignore_proxy || ( elapsed > PROXYDHCP_WAIT_TIME ) ) {
+			stop_timer ( &dhcp->timer );
+			dhcp->state = DHCPREQUEST;
+			dhcp_send_request ( dhcp );
+		}
+		break;
+	case DHCPREQUEST:
+		/* DHCP finished; register options and exit */
+		if ( ignore_proxy && dhcp->proxy_response ) {
+			dhcpset_put ( dhcp->proxy_response );
+			dhcp->proxy_response = NULL;
+		}
+		if ( ( rc = dhcp_register_settings ( dhcp ) ) != 0 ) {
+			dhcp_finished ( dhcp, rc );
+			break;
+		}
+		dhcp_finished ( dhcp, 0 );
+		break;
+	default:
+		assert ( 0 );
+	}
+
+ out:
+	return 0;
+
+ out_discard:
+	dhcpset_put ( response );
+	return 0;
+}
+
+/** DHCP data transfer interface operations
+ *
+ * Received data reaches dhcp_deliver_raw(); I/O buffer deliveries
+ * are routed there via xfer_deliver_as_raw. Close events are
+ * ignored and redirections are passed to xfer_vopen.
+ */
+static struct xfer_interface_operations dhcp_xfer_operations = {
+ .close = ignore_xfer_close,
+ .vredirect = xfer_vopen,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = xfer_deliver_as_raw,
+ .deliver_raw = dhcp_deliver_raw,
+};
+
+/****************************************************************************
+ *
+ * Job control interface
+ *
+ */
+
+/**
+ * Handle kill() event received via job control interface
+ *
+ * @v job DHCP job control interface
+ *
+ * Terminates the owning DHCP session with status -ECANCELED.
+ */
+static void dhcp_job_kill ( struct job_interface *job ) {
+ struct dhcp_session *dhcp =
+ container_of ( job, struct dhcp_session, job );
+
+ /* Terminate DHCP session */
+ dhcp_finished ( dhcp, -ECANCELED );
+}
+
+/** DHCP job control interface operations
+ *
+ * Only kill() is acted upon (see dhcp_job_kill()); done() and
+ * progress() events are ignored.
+ */
+static struct job_interface_operations dhcp_job_operations = {
+ .done = ignore_job_done,
+ .kill = dhcp_job_kill,
+ .progress = ignore_job_progress,
+};
+
+/****************************************************************************
+ *
+ * Instantiator
+ *
+ */
+
+/**
+ * Start DHCP on a network device
+ *
+ * @v job Job control interface
+ * @v netdev Network device
+ * @ret rc Return status code
+ *
+ * Starts DHCP on the specified network device. The session begins
+ * in the DHCPDISCOVER state; the first packet is sent from the retry
+ * timer, which is started with no delay. If the exchange completes
+ * successfully, the acquired options are registered as settings by
+ * dhcp_register_settings().
+ *
+ * Returns -ENOMEM if the session cannot be allocated, or the error
+ * from opening the UDP socket.
+ */
+int start_dhcp ( struct job_interface *job, struct net_device *netdev ) {
+ static struct sockaddr_in server = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = INADDR_BROADCAST,
+ .sin_port = htons ( BOOTPS_PORT ),
+ };
+ static struct sockaddr_in client = {
+ .sin_family = AF_INET,
+ .sin_port = htons ( BOOTPC_PORT ),
+ };
+ struct dhcp_session *dhcp;
+ int rc;
+
+ /* Allocate and initialise structure */
+ dhcp = zalloc ( sizeof ( *dhcp ) );
+ if ( ! dhcp )
+ return -ENOMEM;
+ dhcp->refcnt.free = dhcp_free;
+ job_init ( &dhcp->job, &dhcp_job_operations, &dhcp->refcnt );
+ xfer_init ( &dhcp->xfer, &dhcp_xfer_operations, &dhcp->refcnt );
+ dhcp->netdev = netdev_get ( netdev );
+ dhcp->timer.expired = dhcp_timer_expired;
+ dhcp->state = DHCPDISCOVER;
+ dhcp->start = currticks();
+
+ /* Instantiate child objects and attach to our interfaces */
+ if ( ( rc = xfer_open_socket ( &dhcp->xfer, SOCK_DGRAM,
+ ( struct sockaddr * ) &server,
+ ( struct sockaddr * ) &client ) ) != 0 )
+ goto err;
+
+ /* Start timer to initiate initial DHCPDISCOVER */
+ start_timer_nodelay ( &dhcp->timer );
+
+ /* Attach parent interface, mortalise self, and return */
+ job_plug_plug ( &dhcp->job, job );
+ ref_put ( &dhcp->refcnt );
+ return 0;
+
+ err:
+ dhcp_finished ( dhcp, rc );
+ ref_put ( &dhcp->refcnt );
+ return rc;
+}
diff --git a/gpxe/src/net/udp/dns.c b/gpxe/src/net/udp/dns.c
new file mode 100644
index 00000000..1bcdbc7e
--- /dev/null
+++ b/gpxe/src/net/udp/dns.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * Portions copyright (C) 2004 Anselm M. Hoffmeister
+ * <stockholm@users.sourceforge.net>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <byteswap.h>
+#include <gpxe/refcnt.h>
+#include <gpxe/xfer.h>
+#include <gpxe/open.h>
+#include <gpxe/resolv.h>
+#include <gpxe/retry.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/settings.h>
+#include <gpxe/features.h>
+#include <gpxe/dns.h>
+
+/** @file
+ *
+ * DNS protocol
+ *
+ */
+
+FEATURE ( FEATURE_PROTOCOL, "DNS", DHCP_EB_FEATURE_DNS, 1 );
+
+/** The DNS server
+ *
+ * Only the port (DNS_PORT, in network byte order) is set here.
+ * NOTE(review): the address is presumably filled in elsewhere in
+ * this file from the DNS server setting -- confirm.
+ */
+static struct sockaddr_tcpip nameserver = {
+ .st_port = htons ( DNS_PORT ),
+};
+
+/** A DNS request
+ *
+ * Tracks the resolution of a single name, including any CNAME
+ * indirections followed along the way.
+ */
+struct dns_request {
+ /** Reference counter */
+ struct refcnt refcnt;
+ /** Name resolution interface */
+ struct resolv_interface resolv;
+ /** Data transfer interface */
+ struct xfer_interface socket;
+ /** Retry timer */
+ struct retry_timer timer;
+
+ /** Socket address to fill in with resolved address */
+ struct sockaddr sa;
+ /** Current query packet */
+ struct dns_query query;
+ /** Location of query info structure within current packet
+ *
+ * The query info structure is located immediately after the
+ * compressed name. It moves whenever the query name is
+ * rewritten to follow a CNAME.
+ */
+ struct dns_query_info *qinfo;
+ /** Recursion counter */
+ unsigned int recursion;
+};
+
+/**
+ * Mark DNS request as complete
+ *
+ * @v dns DNS request
+ * @v rc Return status code
+ *
+ * Stops the retry timer, nullifies and closes the socket interface,
+ * then reports the result (the socket address in dns->sa together
+ * with @c rc) through the name resolution interface.
+ */
+static void dns_done ( struct dns_request *dns, int rc ) {
+
+ /* Stop the retry timer */
+ stop_timer ( &dns->timer );
+
+ /* Close data transfer interface */
+ xfer_nullify ( &dns->socket );
+ xfer_close ( &dns->socket, rc );
+
+ /* Mark name resolution as complete */
+ resolv_done ( &dns->resolv, &dns->sa, rc );
+}
+
+/**
+ * Compare DNS reply name against the query name from the original request
+ *
+ * @v dns DNS request
+ * @v reply DNS reply
+ * @v rname Reply name
+ * @ret zero Names match
+ * @ret non-zero Names do not match
+ *
+ * The query name is the uncompressed name at the start of the query
+ * payload. The reply name may contain compression pointers (a
+ * length byte with either of the top two bits set); these are
+ * followed to the offset, relative to the start of the reply, given
+ * by the low 14 bits of the 16-bit pointer. Labels are compared
+ * byte for byte, including the length byte.
+ *
+ * NOTE(review): nothing here checks offsets or label lengths against
+ * the length of the reply, and there is no limit on the number of
+ * pointers followed.
+ */
+static int dns_name_cmp ( struct dns_request *dns,
+ const struct dns_header *reply,
+ const char *rname ) {
+ const char *qname = dns->query.payload;
+ int i;
+
+ while ( 1 ) {
+ /* Obtain next section of rname */
+ while ( ( *rname ) & 0xc0 ) {
+ rname = ( ( ( char * ) reply ) +
+ ( ntohs( *((uint16_t *)rname) ) & ~0xc000 ));
+ }
+ /* Check that lengths match */
+ if ( *rname != *qname )
+ return -1;
+ /* If length is zero, we have reached the end */
+ if ( ! *qname )
+ return 0;
+ /* Check that data matches */
+ for ( i = *qname + 1; i > 0 ; i-- ) {
+ if ( *(rname++) != *(qname++) )
+ return -1;
+ }
+ }
+}
+
+/**
+ * Skip over a (possibly compressed) DNS name
+ *
+ * @v name DNS name
+ * @ret name Byte following the DNS name
+ *
+ * Walks the labels in place without following compression pointers.
+ * The name ends either at a zero length byte (one byte consumed) or
+ * at the first compression pointer (two bytes consumed).
+ */
+static const char * dns_skip_name ( const char *name ) {
+ while ( 1 ) {
+ if ( ! *name ) {
+ /* End of name */
+ return ( name + 1);
+ }
+ if ( *name & 0xc0 ) {
+ /* Start of a compressed name */
+ return ( name + 2 );
+ }
+ /* Uncompressed name portion */
+ name += *name + 1;
+ }
+}
+
+/**
+ * Find an RR in a reply packet corresponding to our query
+ *
+ * @v dns DNS request
+ * @v reply DNS reply
+ * @ret rr DNS RR, or NULL if not found
+ *
+ * Skips the question section, then returns the first record in the
+ * answer section whose owner name matches the current query name.
+ * The record type is not examined, and the search always starts
+ * again from the first answer.
+ */
+static union dns_rr_info * dns_find_rr ( struct dns_request *dns,
+ const struct dns_header *reply ) {
+ int i, cmp;
+ const char *p = ( ( char * ) reply ) + sizeof ( struct dns_header );
+ union dns_rr_info *rr_info;
+
+ /* Skip over the questions section */
+ for ( i = ntohs ( reply->qdcount ) ; i > 0 ; i-- ) {
+ p = dns_skip_name ( p ) + sizeof ( struct dns_query_info );
+ }
+
+ /* Process the answers section */
+ for ( i = ntohs ( reply->ancount ) ; i > 0 ; i-- ) {
+ cmp = dns_name_cmp ( dns, reply, p );
+ p = dns_skip_name ( p );
+ rr_info = ( ( union dns_rr_info * ) p );
+ if ( cmp == 0 )
+ return rr_info;
+ p += ( sizeof ( rr_info->common ) +
+ ntohs ( rr_info->common.rdlength ) );
+ }
+
+ return NULL;
+}
+
+/**
+ * Convert a standard NUL-terminated string to a DNS name
+ *
+ * @v string Name as a NUL-terminated string
+ * @v buf Buffer in which to place DNS name
+ * @ret next Byte following constructed DNS name
+ *
+ * DNS names consist of "<length>element" pairs.
+ *
+ * One byte is reserved up front for the first length. Each '.' in
+ * the input is copied into the output, where it marks the position
+ * of the next length byte and is overwritten once that element's
+ * length is known. The output therefore occupies
+ * strlen ( string ) + 2 bytes, including the terminating zero
+ * length; the buffer size is not checked here.
+ */
+static char * dns_make_name ( const char *string, char *buf ) {
+ char *length_byte = buf++;
+ char c;
+
+ while ( ( c = *(string++) ) ) {
+ if ( c == '.' ) {
+ *length_byte = buf - length_byte - 1;
+ length_byte = buf;
+ }
+ *(buf++) = c;
+ }
+ *length_byte = buf - length_byte - 1;
+ *(buf++) = '\0';
+ return buf;
+}
+
+/**
+ * Convert an uncompressed DNS name to a NUL-terminated string
+ *
+ * @v name DNS name
+ * @ret string NUL-terminated string
+ *
+ * Produce a printable version of a DNS name. Used only for debugging.
+ *
+ * The conversion is done in place: every length byte is overwritten
+ * with '.', so the caller's buffer no longer holds a valid DNS name
+ * afterwards. The returned pointer is one byte past the start of
+ * the buffer, skipping the leading '.'.
+ */
+static inline char * dns_unmake_name ( char *name ) {
+ char *p;
+ unsigned int len;
+
+ p = name;
+ while ( ( len = *p ) ) {
+ *(p++) = '.';
+ p += len;
+ }
+
+ return name + 1;
+}
+
+/**
+ * Decompress a DNS name
+ *
+ * @v reply DNS reply
+ * @v name DNS name
+ * @v buf Buffer into which to decompress DNS name
+ * @ret next Byte following decompressed DNS name
+ *
+ * Copies the name label by label, following any compression pointers
+ * relative to the start of the reply, up to and including the
+ * terminating zero length byte.
+ *
+ * NOTE(review): neither the size of @c buf nor the length of the
+ * reply is checked here.
+ */
+static char * dns_decompress_name ( const struct dns_header *reply,
+ const char *name, char *buf ) {
+ int i, len;
+
+ do {
+ /* Obtain next section of name */
+ while ( ( *name ) & 0xc0 ) {
+ name = ( ( char * ) reply +
+ ( ntohs ( *((uint16_t *)name) ) & ~0xc000 ) );
+ }
+ /* Copy data */
+ len = *name;
+ for ( i = len + 1 ; i > 0 ; i-- ) {
+ *(buf++) = *(name++);
+ }
+ } while ( len );
+ return buf;
+}
+
+/**
+ * Send next packet in DNS request
+ *
+ * @v dns		DNS request
+ * @ret rc		Return status code
+ *
+ * Assigns a fresh query ID (a counter shared by all requests),
+ * restarts the retransmission timer and transmits the current query.
+ * The timer is started before transmission, so a failed transmission
+ * is retried when the timer expires.
+ */
+static int dns_send_packet ( struct dns_request *dns ) {
+	static unsigned int qid = 0;
+	size_t qlen;
+
+	/* Increment query ID */
+	dns->query.dns.id = htons ( ++qid );
+
+	DBGC ( dns, "DNS %p sending query ID %d\n", dns, qid );
+
+	/* Start retransmission timer */
+	start_timer ( &dns->timer );
+
+	/* Send the data.  The packet runs from the start of the query
+	 * up to and including the query info structure, so it is the
+	 * size of that structure (not of the pointer to it) that must
+	 * be added.
+	 */
+	qlen = ( ( ( void * ) dns->qinfo ) - ( ( void * ) &dns->query )
+		 + sizeof ( *dns->qinfo ) );
+	return xfer_deliver_raw ( &dns->socket, &dns->query, qlen );
+}
+
+/**
+ * Handle DNS retransmission timer expiry
+ *
+ * @v timer Retry timer
+ * @v fail Failure indicator
+ *
+ * If the timer reports failure the request is completed with
+ * -ETIMEDOUT; otherwise the current query is retransmitted (with a
+ * new query ID, as assigned by dns_send_packet()).
+ */
+static void dns_timer_expired ( struct retry_timer *timer, int fail ) {
+ struct dns_request *dns =
+ container_of ( timer, struct dns_request, timer );
+
+ if ( fail ) {
+ dns_done ( dns, -ETIMEDOUT );
+ } else {
+ dns_send_packet ( dns );
+ }
+}
+
+/**
+ * Receive new data
+ *
+ * @v socket		UDP socket
+ * @v data		DNS reply
+ * @v len		Length of DNS reply
+ * @ret rc		Return status code
+ *
+ * Replies that are too short to hold a DNS header, or whose ID does
+ * not match the outstanding query, are rejected with -EINVAL and
+ * leave the retry timer running.  Otherwise the answer section is
+ * searched for a record matching the current query name:
+ *
+ *  - an A record completes the request successfully;
+ *  - a CNAME record rewrites the query to ask for the A record of
+ *    the canonical name, and the search is repeated in case the
+ *    same reply already contains that record;
+ *  - a record of any other type ends the search.
+ *
+ * If no address was found, the next query is chosen according to the
+ * type of query that this reply answers.
+ */
+static int dns_xfer_deliver_raw ( struct xfer_interface *socket,
+				  const void *data, size_t len ) {
+	struct dns_request *dns =
+		container_of ( socket, struct dns_request, socket );
+	const struct dns_header *reply = data;
+	union dns_rr_info *rr_info;
+	struct sockaddr_in *sin;
+	unsigned int qtype = dns->qinfo->qtype;
+
+	/* Sanity check */
+	if ( len < sizeof ( *reply ) ) {
+		DBGC ( dns, "DNS %p received underlength packet length %zd\n",
+		       dns, len );
+		return -EINVAL;
+	}
+
+	/* Check reply ID matches query ID */
+	if ( reply->id != dns->query.dns.id ) {
+		DBGC ( dns, "DNS %p received unexpected reply ID %d "
+		       "(wanted %d)\n", dns, ntohs ( reply->id ),
+		       ntohs ( dns->query.dns.id ) );
+		return -EINVAL;
+	}
+
+	DBGC ( dns, "DNS %p received reply ID %d\n", dns, ntohs ( reply->id ));
+
+	/* Stop the retry timer.  After this point, each code path
+	 * must either restart the timer by calling dns_send_packet(),
+	 * or mark the DNS operation as complete by calling
+	 * dns_done()
+	 */
+	stop_timer ( &dns->timer );
+
+	/* Search through response for useful answers.  Do this
+	 * multiple times, to take advantage of useful nameservers
+	 * which send us e.g. the CNAME *and* the A record for the
+	 * pointed-to name.
+	 */
+	while ( ( rr_info = dns_find_rr ( dns, reply ) ) ) {
+		switch ( rr_info->common.type ) {
+
+		case htons ( DNS_TYPE_A ):
+
+			/* Found the target A record */
+			DBGC ( dns, "DNS %p found address %s\n",
+			       dns, inet_ntoa ( rr_info->a.in_addr ) );
+			sin = ( struct sockaddr_in * ) &dns->sa;
+			sin->sin_family = AF_INET;
+			sin->sin_addr = rr_info->a.in_addr;
+
+			/* Mark operation as complete */
+			dns_done ( dns, 0 );
+			return 0;
+
+		case htons ( DNS_TYPE_CNAME ):
+
+			/* Found a CNAME record; update query and recurse */
+			DBGC ( dns, "DNS %p found CNAME\n", dns );
+			dns->qinfo = ( void * ) dns_decompress_name ( reply,
+							 rr_info->cname.cname,
+							 dns->query.payload );
+			dns->qinfo->qtype = htons ( DNS_TYPE_A );
+			dns->qinfo->qclass = htons ( DNS_CLASS_IN );
+
+			/* Terminate the operation if we recurse too far */
+			if ( ++dns->recursion > DNS_MAX_CNAME_RECURSION ) {
+				DBGC ( dns, "DNS %p recursion exceeded\n",
+				       dns );
+				dns_done ( dns, -ELOOP );
+				return 0;
+			}
+			break;
+
+		default:
+			DBGC ( dns, "DNS %p got unknown record type %d\n",
+			       dns, ntohs ( rr_info->common.type ) );
+			/* The query is unchanged, so dns_find_rr()
+			 * would hand back this very record again.
+			 * Stop searching rather than spin forever.
+			 */
+			goto search_done;
+		}
+	}
+
+ search_done:
+	/* Determine what to do next based on the type of query we
+	 * issued and the response we received
+	 */
+	switch ( qtype ) {
+
+	case htons ( DNS_TYPE_A ):
+		/* We asked for an A record and got nothing;
+		 * try the CNAME.
+		 */
+		DBGC ( dns, "DNS %p found no A record; trying CNAME\n", dns );
+		dns->qinfo->qtype = htons ( DNS_TYPE_CNAME );
+		dns_send_packet ( dns );
+		return 0;
+
+	case htons ( DNS_TYPE_CNAME ):
+		/* We asked for a CNAME record.  If we got a response
+		 * (i.e. if the next A query is already set up), then
+		 * issue it, otherwise abort.
+		 */
+		if ( dns->qinfo->qtype == htons ( DNS_TYPE_A ) ) {
+			dns_send_packet ( dns );
+			return 0;
+		} else {
+			DBGC ( dns, "DNS %p found no CNAME record\n", dns );
+			dns_done ( dns, -ENXIO );
+			return 0;
+		}
+
+	default:
+		assert ( 0 );
+		dns_done ( dns, -EINVAL );
+		return 0;
+	}
+}
+
+/**
+ * Handle closure of the DNS UDP socket
+ *
+ * @v socket		UDP socket
+ * @v rc		Reason for close
+ */
+static void dns_xfer_close ( struct xfer_interface *socket, int rc ) {
+	struct dns_request *request =
+		container_of ( socket, struct dns_request, socket );
+
+	/* A close with a zero status still ends the request without
+	 * a result, so report it as an aborted connection.
+	 */
+	dns_done ( request, ( rc ? rc : -ECONNABORTED ) );
+}
+
+/** DNS socket operations */
+/* Operations table installed on dns_request.socket by dns_resolv().
+ * Replies are handled by dns_xfer_deliver_raw() and closure by
+ * dns_xfer_close(); the remaining entries use handlers defined
+ * outside this file.
+ */
+static struct xfer_interface_operations dns_socket_operations = {
+ .close = dns_xfer_close,
+ .vredirect = xfer_vopen,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = xfer_deliver_as_raw,
+ .deliver_raw = dns_xfer_deliver_raw,
+};
+
+/**
+ * Resolve name using DNS
+ *
+ * @v resolv Name resolution interface
+ * @v name Name to resolve
+ * @v sa Socket address to fill in
+ * @ret rc Return status code
+ */
+static int dns_resolv ( struct resolv_interface *resolv,
+ const char *name, struct sockaddr *sa ) {
+ struct dns_request *dns;
+ int rc;
+
+ /* Fail immediately if no DNS servers */
+ if ( ! nameserver.st_family ) {
+ DBG ( "DNS not attempting to resolve \"%s\": "
+ "no DNS servers\n", name );
+ return -ENXIO;
+ }
+
+ /* Allocate DNS structure */
+ dns = zalloc ( sizeof ( *dns ) );
+ if ( ! dns )
+ return -ENOMEM;
+ resolv_init ( &dns->resolv, &null_resolv_ops, &dns->refcnt );
+ xfer_init ( &dns->socket, &dns_socket_operations, &dns->refcnt );
+ dns->timer.expired = dns_timer_expired;
+ /* Start from the caller's socket address; the A record handler
+ * later overwrites only the family and address fields.
+ */
+ memcpy ( &dns->sa, sa, sizeof ( dns->sa ) );
+
+ /* Create query */
+ dns->query.dns.flags = htons ( DNS_FLAG_QUERY | DNS_FLAG_OPCODE_QUERY |
+ DNS_FLAG_RD );
+ dns->query.dns.qdcount = htons ( 1 );
+ /* NOTE(review): no length check on name is visible here before
+ * dns_make_name() writes into dns->query.payload -- confirm that
+ * the payload buffer is large enough for any name a caller can
+ * pass.
+ */
+ dns->qinfo = ( void * ) dns_make_name ( name, dns->query.payload );
+ dns->qinfo->qtype = htons ( DNS_TYPE_A );
+ dns->qinfo->qclass = htons ( DNS_CLASS_IN );
+
+ /* Open UDP connection */
+ if ( ( rc = xfer_open_socket ( &dns->socket, SOCK_DGRAM,
+ ( struct sockaddr * ) &nameserver,
+ NULL ) ) != 0 ) {
+ DBGC ( dns, "DNS %p could not open socket: %s\n",
+ dns, strerror ( rc ) );
+ goto err;
+ }
+
+ /* Send first DNS packet */
+ /* The return status is not checked here; dns_send_packet()
+ * starts the retry timer, whose expiry handler retransmits.
+ */
+ dns_send_packet ( dns );
+
+ /* Attach parent interface, mortalise self, and return */
+ resolv_plug_plug ( &dns->resolv, resolv );
+ ref_put ( &dns->refcnt );
+ return 0;
+
+ err:
+ /* Drop the reference held since allocation */
+ ref_put ( &dns->refcnt );
+ return rc;
+}
+
+/** DNS name resolver */
+/* Registers dns_resolv() as a resolver named "DNS" at RESOLV_NORMAL */
+struct resolver dns_resolver __resolver ( RESOLV_NORMAL ) = {
+ .name = "DNS",
+ .resolv = dns_resolv,
+};
+
+/******************************************************************************
+ *
+ * Settings
+ *
+ ******************************************************************************
+ */
+
+/** DNS server setting */
+/* IPv4-typed setting "dns", tagged DHCP_DNS_SERVERS; read by
+ * apply_nameserver_setting() to choose the nameserver address.
+ */
+struct setting dns_setting __setting = {
+ .name = "dns",
+ .description = "DNS server",
+ .tag = DHCP_DNS_SERVERS,
+ .type = &setting_type_ipv4,
+};
+
+/**
+ * Apply nameserver setting
+ *
+ * @ret rc Return status code
+ */
+static int apply_nameserver_setting ( void ) {
+	struct sockaddr_in *sin = ( struct sockaddr_in * ) &nameserver;
+	int fetched;
+
+	/* Fetch the "dns" setting straight into the global nameserver
+	 * address.  If the fetch reports failure, the address family is
+	 * left untouched and success is still returned.
+	 */
+	fetched = fetch_ipv4_setting ( NULL, &dns_setting, &sin->sin_addr );
+	if ( fetched < 0 )
+		return 0;
+
+	/* Mark the nameserver as usable */
+	sin->sin_family = AF_INET;
+	DBG ( "DNS using nameserver %s\n",
+	      inet_ntoa ( sin->sin_addr ) );
+
+	return 0;
+}
+
+/** Nameserver setting applicator */
+/* Registers apply_nameserver_setting() as a settings applicator */
+struct settings_applicator nameserver_applicator __settings_applicator = {
+ .apply = apply_nameserver_setting,
+};
diff --git a/gpxe/src/net/udp/tftp.c b/gpxe/src/net/udp/tftp.c
new file mode 100644
index 00000000..e49bcf9f
--- /dev/null
+++ b/gpxe/src/net/udp/tftp.c
@@ -0,0 +1,1149 @@
+/*
+ * Copyright (C) 2006 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <byteswap.h>
+#include <errno.h>
+#include <assert.h>
+#include <gpxe/refcnt.h>
+#include <gpxe/xfer.h>
+#include <gpxe/open.h>
+#include <gpxe/uri.h>
+#include <gpxe/tcpip.h>
+#include <gpxe/retry.h>
+#include <gpxe/features.h>
+#include <gpxe/bitmap.h>
+#include <gpxe/settings.h>
+#include <gpxe/dhcp.h>
+#include <gpxe/uri.h>
+#include <gpxe/tftp.h>
+
+/** @file
+ *
+ * TFTP protocol
+ *
+ */
+
+FEATURE ( FEATURE_PROTOCOL, "TFTP", DHCP_EB_FEATURE_TFTP, 1 );
+
+/**
+ * A TFTP request
+ *
+ * This data structure holds the state for an ongoing TFTP transfer.
+ */
+struct tftp_request {
+ /** Reference count */
+ struct refcnt refcnt;
+ /** Data transfer interface */
+ struct xfer_interface xfer;
+
+ /** URI being fetched */
+ struct uri *uri;
+ /** Transport layer interface */
+ struct xfer_interface socket;
+ /** Multicast transport layer interface */
+ struct xfer_interface mc_socket;
+
+ /** Data block size
+ *
+ * This is the "blksize" option negotiated with the TFTP
+ * server. (If the TFTP server does not support TFTP options,
+ * this will default to 512).
+ *
+ * This value is used as a divisor when sizing the block
+ * bitmap, so it must never be zero.
+ */
+ unsigned int blksize;
+ /** File size
+ *
+ * This is the value returned in the "tsize" option from the
+ * TFTP server. If the TFTP server does not support the
+ * "tsize" option, this value will be zero.
+ */
+ unsigned long tsize;
+
+ /** Server port
+ *
+ * This is the port to which RRQ packets are sent.
+ */
+ unsigned int port;
+ /** Peer address
+ *
+ * The peer address is determined by the first response
+ * received to the TFTP RRQ.
+ *
+ * A zero address family means that no response has been
+ * received yet.
+ */
+ struct sockaddr_tcpip peer;
+ /** Request flags (bitmask of TFTP_FL_XXX values) */
+ unsigned int flags;
+ /** MTFTP timeout count */
+ unsigned int mtftp_timeouts;
+
+ /** Block bitmap */
+ struct bitmap bitmap;
+ /** Maximum known length
+ *
+ * We don't always know the file length in advance. In
+ * particular, if the TFTP server doesn't support the tsize
+ * option, or we are using MTFTP, then we don't know the file
+ * length until we see the end-of-file block (which, in the
+ * case of MTFTP, may not be the last block we see).
+ *
+ * This value is updated whenever we obtain information about
+ * the file length.
+ */
+ size_t filesize;
+ /** Retransmission timer */
+ struct retry_timer timer;
+};
+
+/** TFTP request flags */
+enum {
+ /** Send ACK packets
+ *
+ * Set whenever a packet arrives on the unicast socket;
+ * cleared when the socket is reopened and when the
+ * "multicast" option indicates a master client value of zero.
+ */
+ TFTP_FL_SEND_ACK = 0x0001,
+ /** Request blksize and tsize options */
+ TFTP_FL_RRQ_SIZES = 0x0002,
+ /** Request multicast option */
+ TFTP_FL_RRQ_MULTICAST = 0x0004,
+ /** Perform MTFTP recovery on timeout */
+ TFTP_FL_MTFTP_RECOVERY = 0x0008,
+};
+
+/** Maximum number of MTFTP open requests before falling back to TFTP
+ *
+ * The fallback happens once the timeout count exceeds this value.
+ */
+#define MTFTP_MAX_TIMEOUTS 3
+
+/**
+ * Free TFTP request
+ *
+ * @v refcnt Reference counter
+ */
+static void tftp_free ( struct refcnt *refcnt ) {
+	struct tftp_request *request;
+
+	request = container_of ( refcnt, struct tftp_request, refcnt );
+
+	/* Release the resources held by the request, then the request
+	 * structure itself.
+	 */
+	uri_put ( request->uri );
+	bitmap_free ( &request->bitmap );
+	free ( request );
+}
+
+/**
+ * Mark TFTP request as complete
+ *
+ * @v tftp TFTP connection
+ * @v rc Return status code
+ */
+static void tftp_done ( struct tftp_request *tftp, int rc ) {
+
+ DBGC ( tftp, "TFTP %p finished with status %d (%s)\n",
+ tftp, rc, strerror ( rc ) );
+
+ /* Stop the retry timer */
+ stop_timer ( &tftp->timer );
+
+ /* Close all data transfer interfaces */
+ /* Each interface is nullified before being closed: first the
+ * unicast socket, then the multicast socket, and finally the
+ * data transfer interface towards the consumer.
+ */
+ xfer_nullify ( &tftp->socket );
+ xfer_close ( &tftp->socket, rc );
+ xfer_nullify ( &tftp->mc_socket );
+ xfer_close ( &tftp->mc_socket, rc );
+ xfer_nullify ( &tftp->xfer );
+ xfer_close ( &tftp->xfer, rc );
+}
+
+/**
+ * Reopen TFTP socket
+ *
+ * @v tftp TFTP connection
+ * @ret rc Return status code
+ */
+static int tftp_reopen ( struct tftp_request *tftp ) {
+	struct sockaddr_tcpip remote;
+	int rc;
+
+	/* Drop any existing unicast socket */
+	xfer_close ( &tftp->socket, 0 );
+
+	/* ACKs stay disabled until re-enabled elsewhere */
+	tftp->flags &= ~TFTP_FL_SEND_ACK;
+
+	/* Forget the previous peer, so that the next packet sent is
+	 * an RRQ and the next response received selects the peer.
+	 */
+	memset ( &tftp->peer, 0, sizeof ( tftp->peer ) );
+
+	/* Only the port is filled in here; the host comes from the URI */
+	memset ( &remote, 0, sizeof ( remote ) );
+	remote.st_port = htons ( tftp->port );
+	rc = xfer_open_named_socket ( &tftp->socket, SOCK_DGRAM,
+				      ( struct sockaddr * ) &remote,
+				      tftp->uri->host, NULL );
+	if ( rc != 0 ) {
+		DBGC ( tftp, "TFTP %p could not open socket: %s\n",
+		       tftp, strerror ( rc ) );
+	}
+
+	return rc;
+}
+
+/**
+ * Reopen TFTP multicast socket
+ *
+ * @v tftp TFTP connection
+ * @v local Local socket address
+ * @ret rc Return status code
+ */
+static int tftp_reopen_mc ( struct tftp_request *tftp,
+			    struct sockaddr *local ) {
+	int rc;
+
+	/* Drop any existing multicast socket */
+	xfer_close ( &tftp->mc_socket, 0 );
+
+	/* Nothing is ever sent via this socket, but the peer address
+	 * cannot be NULL, so the local address doubles as the peer
+	 * address.
+	 */
+	rc = xfer_open_socket ( &tftp->mc_socket, SOCK_DGRAM, local, local );
+	if ( rc != 0 ) {
+		DBGC ( tftp, "TFTP %p could not open multicast "
+		       "socket: %s\n", tftp, strerror ( rc ) );
+	}
+
+	return rc;
+}
+
+/**
+ * Presize TFTP receive buffers and block bitmap
+ *
+ * @v tftp TFTP connection
+ * @v filesize Known minimum file size
+ * @ret rc Return status code
+ */
+static int tftp_presize ( struct tftp_request *tftp, size_t filesize ) {
+	unsigned int blocks;
+	int rc;
+
+	/* Only ever grow; a smaller or equal size changes nothing */
+	if ( filesize > tftp->filesize ) {
+
+		/* Remember the new size */
+		tftp->filesize = filesize;
+
+		/* Tell the recipient about the size by seeking to the
+		 * end and then back to the start.
+		 */
+		xfer_seek ( &tftp->xfer, filesize, SEEK_SET );
+		xfer_seek ( &tftp->xfer, 0, SEEK_SET );
+
+		/* A file whose length is an exact multiple of the
+		 * block size ends with a zero-length block, hence the
+		 * extra block.  tftp->blksize is the divisor here and
+		 * must be non-zero.
+		 */
+		blocks = ( ( filesize / tftp->blksize ) + 1 );
+		rc = bitmap_resize ( &tftp->bitmap, blocks );
+		if ( rc != 0 ) {
+			DBGC ( tftp, "TFTP %p could not resize bitmap to %d blocks: "
+			       "%s\n", tftp, blocks, strerror ( rc ) );
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * TFTP requested blocksize
+ *
+ * This is treated as a global configuration parameter.  It is the
+ * value sent in the "blksize" option of each RRQ, and is changed via
+ * tftp_set_request_blksize().
+ */
+static unsigned int tftp_request_blksize = TFTP_MAX_BLKSIZE;
+
+/**
+ * Set TFTP request blocksize
+ *
+ * @v blksize Requested block size
+ */
+void tftp_set_request_blksize ( unsigned int blksize ) {
+	/* Never request less than the default block size */
+	tftp_request_blksize = ( ( blksize < TFTP_DEFAULT_BLKSIZE ) ?
+				 TFTP_DEFAULT_BLKSIZE : blksize );
+}
+
+/**
+ * MTFTP multicast receive address
+ *
+ * This is treated as a global configuration parameter.
+ */
+static struct sockaddr_in tftp_mtftp_socket = {
+ .sin_family = AF_INET,
+ /* 0xefff0101 is 239.255.1.1 */
+ .sin_addr.s_addr = htonl ( 0xefff0101 ),
+ .sin_port = htons ( 3001 ),
+};
+
+/**
+ * Set MTFTP multicast address
+ *
+ * @v address Multicast IPv4 address
+ */
+void tftp_set_mtftp_address ( struct in_addr address ) {
+ /* Stored as given; no byte-order conversion is applied */
+ tftp_mtftp_socket.sin_addr = address;
+}
+
+/**
+ * Set MTFTP multicast port
+ *
+ * @v port Multicast port
+ */
+void tftp_set_mtftp_port ( unsigned int port ) {
+ /* Port is supplied in host byte order and stored via htons() */
+ tftp_mtftp_socket.sin_port = htons ( port );
+}
+
+/**
+ * Transmit RRQ
+ *
+ * @v tftp TFTP connection
+ * @ret rc Return status code
+ */
+static int tftp_send_rrq ( struct tftp_request *tftp ) {
+ struct tftp_rrq *rrq;
+ const char *path = tftp->uri->path;
+ /* Worst-case request length, with every option present */
+ size_t len = ( sizeof ( *rrq ) + strlen ( path ) + 1 /* NUL */
+ + 5 + 1 /* "octet" + NUL */
+ + 7 + 1 + 5 + 1 /* "blksize" + NUL + ddddd + NUL */
+ + 5 + 1 + 1 + 1 /* "tsize" + NUL + "0" + NUL */
+ + 9 + 1 + 1 /* "multicast" + NUL + NUL */ );
+ struct io_buffer *iobuf;
+
+ DBGC ( tftp, "TFTP %p requesting \"%s\"\n", tftp, path );
+
+ /* Allocate buffer */
+ iobuf = xfer_alloc_iob ( &tftp->socket, len );
+ if ( ! iobuf )
+ return -ENOMEM;
+
+ /* Build request */
+ /* Each "%c" with a zero argument embeds a NUL separator in the
+ * packet.  snprintf() does not count its terminating NUL in the
+ * returned length, so each iob_put() adds one to include it.
+ */
+ rrq = iob_put ( iobuf, sizeof ( *rrq ) );
+ rrq->opcode = htons ( TFTP_RRQ );
+ iob_put ( iobuf, snprintf ( iobuf->tail, iob_tailroom ( iobuf ),
+ "%s%coctet", path, 0 ) + 1 );
+ if ( tftp->flags & TFTP_FL_RRQ_SIZES ) {
+ /* NOTE(review): the length estimate above allows five
+ * digits for the block size -- confirm that
+ * tftp_request_blksize can never need more.
+ */
+ iob_put ( iobuf, snprintf ( iobuf->tail,
+ iob_tailroom ( iobuf ),
+ "blksize%c%d%ctsize%c0", 0,
+ tftp_request_blksize, 0, 0 ) + 1 );
+ }
+ if ( tftp->flags & TFTP_FL_RRQ_MULTICAST ) {
+ /* The multicast option is sent with an empty value */
+ iob_put ( iobuf, snprintf ( iobuf->tail,
+ iob_tailroom ( iobuf ),
+ "multicast%c", 0 ) + 1 );
+ }
+
+ /* RRQ always goes to the address specified in the initial
+ * xfer_open() call
+ */
+ return xfer_deliver_iob ( &tftp->socket, iobuf );
+}
+
+/**
+ * Transmit ACK
+ *
+ * @v tftp TFTP connection
+ * @ret rc Return status code
+ */
+static int tftp_send_ack ( struct tftp_request *tftp ) {
+	struct xfer_metadata meta = {
+		.dest = ( struct sockaddr * ) &tftp->peer,
+	};
+	struct io_buffer *iobuf;
+	struct tftp_ack *ack;
+	unsigned int next;
+
+	/* The block number sent is the first gap in the block bitmap */
+	next = bitmap_first_gap ( &tftp->bitmap );
+	DBGC2 ( tftp, "TFTP %p sending ACK for block %d\n", tftp, next );
+
+	/* Obtain a buffer just large enough for the ACK */
+	iobuf = xfer_alloc_iob ( &tftp->socket, sizeof ( *ack ) );
+	if ( iobuf == NULL )
+		return -ENOMEM;
+
+	/* Fill in the ACK in network byte order */
+	ack = iob_put ( iobuf, sizeof ( *ack ) );
+	ack->opcode = htons ( TFTP_ACK );
+	ack->block = htons ( next );
+
+	/* Unlike the RRQ, the ACK is addressed explicitly to the peer
+	 * recorded from the first response.
+	 */
+	return xfer_deliver_iob_meta ( &tftp->socket, iobuf, &meta );
+}
+
+/**
+ * Transmit next relevant packet
+ *
+ * @v tftp TFTP connection
+ * @ret rc Return status code
+ */
+static int tftp_send_packet ( struct tftp_request *tftp ) {
+
+	/* Restart the retransmission timer */
+	stop_timer ( &tftp->timer );
+	start_timer ( &tftp->timer );
+
+	/* No peer recorded yet: (re)send the RRQ */
+	if ( ! tftp->peer.st_family )
+		return tftp_send_rrq ( tftp );
+
+	/* Peer known: send an ACK, but only if ACKs are enabled */
+	if ( tftp->flags & TFTP_FL_SEND_ACK )
+		return tftp_send_ack ( tftp );
+
+	/* Nothing to transmit */
+	return 0;
+}
+
+/**
+ * Handle TFTP retransmission timer expiry
+ *
+ * @v timer Retry timer
+ * @v fail Failure indicator
+ */
+static void tftp_timer_expired ( struct retry_timer *timer, int fail ) {
+ struct tftp_request *tftp =
+ container_of ( timer, struct tftp_request, timer );
+ int rc;
+
+ /* Note that the failure indicator is examined only when MTFTP
+ * recovery is not enabled; while TFTP_FL_MTFTP_RECOVERY is set,
+ * the recovery strategies below are used instead.
+ */
+
+ /* If we are doing MTFTP, attempt the various recovery strategies */
+ if ( tftp->flags & TFTP_FL_MTFTP_RECOVERY ) {
+ if ( tftp->peer.st_family ) {
+ /* If we have received any response from the server,
+ * try resending the RRQ to restart the download.
+ */
+ DBGC ( tftp, "TFTP %p attempting reopen\n", tftp );
+ if ( ( rc = tftp_reopen ( tftp ) ) != 0 )
+ goto err;
+ } else {
+ /* Fall back to plain TFTP after several attempts */
+ tftp->mtftp_timeouts++;
+ DBGC ( tftp, "TFTP %p timeout %d waiting for MTFTP "
+ "open\n", tftp, tftp->mtftp_timeouts );
+
+ if ( tftp->mtftp_timeouts > MTFTP_MAX_TIMEOUTS ) {
+ DBGC ( tftp, "TFTP %p falling back to plain "
+ "TFTP\n", tftp );
+ /* Replaces all flags, which also
+ * clears TFTP_FL_MTFTP_RECOVERY
+ */
+ tftp->flags = TFTP_FL_RRQ_SIZES;
+
+ /* Close multicast socket */
+ xfer_close ( &tftp->mc_socket, 0 );
+
+ /* Reset retry timer */
+ start_timer_nodelay ( &tftp->timer );
+
+ /* The blocksize may change: discard
+ * the block bitmap
+ */
+ bitmap_free ( &tftp->bitmap );
+ memset ( &tftp->bitmap, 0,
+ sizeof ( tftp->bitmap ) );
+
+ /* Reopen on standard TFTP port */
+ tftp->port = TFTP_PORT;
+ if ( ( rc = tftp_reopen ( tftp ) ) != 0 )
+ goto err;
+ }
+ }
+ } else {
+ /* Not doing MTFTP (or have fallen back to plain
+ * TFTP); fail as per normal.
+ */
+ if ( fail ) {
+ rc = -ETIMEDOUT;
+ goto err;
+ }
+ }
+ /* Retransmit the RRQ or ACK; this also restarts the timer */
+ tftp_send_packet ( tftp );
+ return;
+
+ err:
+ tftp_done ( tftp, rc );
+}
+
+/**
+ * Process TFTP "blksize" option
+ *
+ * @v tftp TFTP connection
+ * @v value Option value
+ * @ret rc Return status code
+ */
+static int tftp_process_blksize ( struct tftp_request *tftp,
+				  const char *value ) {
+	unsigned long blksize;
+	char *end;
+
+	/* Parse into a local first, so that tftp->blksize is only
+	 * ever updated with a validated value.
+	 */
+	blksize = strtoul ( value, &end, 10 );
+
+	/* Reject an empty value or trailing garbage */
+	if ( ( end == value ) || *end ) {
+		DBGC ( tftp, "TFTP %p got invalid blksize \"%s\"\n",
+		       tftp, value );
+		return -EINVAL;
+	}
+
+	/* Reject a zero block size: the block size is used as a
+	 * divisor when sizing the block bitmap, so a server-supplied
+	 * zero would cause a division by zero.
+	 */
+	if ( blksize == 0 ) {
+		DBGC ( tftp, "TFTP %p got invalid blksize \"%s\"\n",
+		       tftp, value );
+		return -EINVAL;
+	}
+
+	tftp->blksize = blksize;
+	DBGC ( tftp, "TFTP %p blksize=%d\n", tftp, tftp->blksize );
+
+	return 0;
+}
+
+/**
+ * Process TFTP "tsize" option
+ *
+ * @v tftp TFTP connection
+ * @v value Option value
+ * @ret rc Return status code
+ */
+static int tftp_process_tsize ( struct tftp_request *tftp,
+				const char *value ) {
+	char *tail;
+
+	/* Parse the decimal file size; anything left over after the
+	 * digits makes the option invalid.
+	 */
+	tftp->tsize = strtoul ( value, &tail, 10 );
+	if ( *tail != '\0' ) {
+		DBGC ( tftp, "TFTP %p got invalid tsize \"%s\"\n",
+		       tftp, value );
+		return -EINVAL;
+	}
+
+	DBGC ( tftp, "TFTP %p tsize=%ld\n", tftp, tftp->tsize );
+	return 0;
+}
+
+/**
+ * Process TFTP "multicast" option
+ *
+ * @v tftp TFTP connection
+ * @v value Option value
+ * @ret rc Return status code
+ */
+static int tftp_process_multicast ( struct tftp_request *tftp,
+ const char *value ) {
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+ } socket;
+ /* Writable copy of the value, since the fields are split in
+ * place by overwriting the commas with NULs
+ */
+ char buf[ strlen ( value ) + 1 ];
+ char *addr;
+ char *port;
+ char *port_end;
+ char *mc;
+ char *mc_end;
+ int rc;
+
+ /* Split value into "addr,port,mc" fields */
+ memcpy ( buf, value, sizeof ( buf ) );
+ addr = buf;
+ port = strchr ( addr, ',' );
+ if ( ! port ) {
+ DBGC ( tftp, "TFTP %p multicast missing port,mc\n", tftp );
+ return -EINVAL;
+ }
+ *(port++) = '\0';
+ mc = strchr ( port, ',' );
+ if ( ! mc ) {
+ DBGC ( tftp, "TFTP %p multicast missing mc\n", tftp );
+ return -EINVAL;
+ }
+ *(mc++) = '\0';
+
+ /* Parse parameters */
+ /* A master client value of zero disables ACK sending; the flag
+ * is cleared before the trailing-character check below.
+ */
+ if ( strtoul ( mc, &mc_end, 0 ) == 0 )
+ tftp->flags &= ~TFTP_FL_SEND_ACK;
+ if ( *mc_end ) {
+ DBGC ( tftp, "TFTP %p multicast invalid mc %s\n", tftp, mc );
+ return -EINVAL;
+ }
+ DBGC ( tftp, "TFTP %p is%s the master client\n",
+ tftp, ( ( tftp->flags & TFTP_FL_SEND_ACK ) ? "" : " not" ) );
+ /* The multicast socket is (re)opened only when both the address
+ * and the port fields are non-empty
+ */
+ if ( *addr && *port ) {
+ socket.sin.sin_family = AF_INET;
+ if ( inet_aton ( addr, &socket.sin.sin_addr ) == 0 ) {
+ DBGC ( tftp, "TFTP %p multicast invalid IP address "
+ "%s\n", tftp, addr );
+ return -EINVAL;
+ }
+ DBGC ( tftp, "TFTP %p multicast IP address %s\n",
+ tftp, inet_ntoa ( socket.sin.sin_addr ) );
+ socket.sin.sin_port = htons ( strtoul ( port, &port_end, 0 ) );
+ if ( *port_end ) {
+ DBGC ( tftp, "TFTP %p multicast invalid port %s\n",
+ tftp, port );
+ return -EINVAL;
+ }
+ DBGC ( tftp, "TFTP %p multicast port %d\n",
+ tftp, ntohs ( socket.sin.sin_port ) );
+ if ( ( rc = tftp_reopen_mc ( tftp, &socket.sa ) ) != 0 )
+ return rc;
+ }
+
+ return 0;
+}
+
+/** A TFTP option */
+struct tftp_option {
+ /** Option name
+ *
+ * Matched case-insensitively against the name received in
+ * the OACK.
+ */
+ const char *name;
+ /** Option processor
+ *
+ * @v tftp TFTP connection
+ * @v value Option value
+ * @ret rc Return status code
+ */
+ int ( * process ) ( struct tftp_request *tftp, const char *value );
+};
+
+/** Recognised TFTP options */
+static struct tftp_option tftp_options[] = {
+ { "blksize", tftp_process_blksize },
+ { "tsize", tftp_process_tsize },
+ { "multicast", tftp_process_multicast },
+ /* Terminator: tftp_process_option() stops at a NULL name */
+ { NULL, NULL }
+};
+
+/**
+ * Process TFTP option
+ *
+ * @v tftp TFTP connection
+ * @v name Option name
+ * @v value Option value
+ * @ret rc Return status code
+ */
+static int tftp_process_option ( struct tftp_request *tftp,
+				 const char *name, const char *value ) {
+	struct tftp_option *opt = tftp_options;
+
+	/* Walk the table up to its NULL-name terminator, handing the
+	 * value to the first processor whose name matches
+	 * (case-insensitively).
+	 */
+	while ( opt->name ) {
+		if ( strcasecmp ( name, opt->name ) == 0 )
+			return opt->process ( tftp, value );
+		opt++;
+	}
+
+	/* No match: unknown options are ignored rather than treated
+	 * as an error.
+	 */
+	DBGC ( tftp, "TFTP %p received unknown option \"%s\" = \"%s\"\n",
+	       tftp, name, value );
+	return 0;
+}
+
+/**
+ * Receive OACK
+ *
+ * @v tftp TFTP connection
+ * @v buf Temporary data buffer
+ * @v len Length of temporary data buffer
+ * @ret rc Return status code
+ */
+static int tftp_rx_oack ( struct tftp_request *tftp, void *buf, size_t len ) {
+ struct tftp_oack *oack = buf;
+ char *end = buf + len;
+ char *name;
+ char *value;
+ int rc = 0;
+
+ /* Sanity check */
+ if ( len < sizeof ( *oack ) ) {
+ DBGC ( tftp, "TFTP %p received underlength OACK packet "
+ "length %zd\n", tftp, len );
+ rc = -EINVAL;
+ goto done;
+ }
+ /* Requiring a final NUL guarantees that every strlen() below
+ * terminates within the packet
+ */
+ if ( end[-1] != '\0' ) {
+ DBGC ( tftp, "TFTP %p received OACK missing final NUL\n",
+ tftp );
+ rc = -EINVAL;
+ goto done;
+ }
+
+ /* Process each option in turn */
+ /* Options are consecutive NUL-terminated name/value pairs */
+ name = oack->data;
+ while ( name < end ) {
+ value = ( name + strlen ( name ) + 1 );
+ if ( value == end ) {
+ DBGC ( tftp, "TFTP %p received OACK missing value "
+ "for option \"%s\"\n", tftp, name );
+ rc = -EINVAL;
+ goto done;
+ }
+ if ( ( rc = tftp_process_option ( tftp, name, value ) ) != 0 )
+ goto done;
+ name = ( value + strlen ( value ) + 1 );
+ }
+
+ /* Process tsize information, if available */
+ if ( tftp->tsize ) {
+ if ( ( rc = tftp_presize ( tftp, tftp->tsize ) ) != 0 )
+ goto done;
+ }
+
+ /* Request next data block */
+ tftp_send_packet ( tftp );
+
+ done:
+ /* Any failure terminates the whole request */
+ if ( rc )
+ tftp_done ( tftp, rc );
+ return rc;
+}
+
+/**
+ * Receive DATA
+ *
+ * @v tftp TFTP connection
+ * @v iobuf I/O buffer
+ * @ret rc Return status code
+ *
+ * Takes ownership of I/O buffer.
+ */
+static int tftp_rx_data ( struct tftp_request *tftp,
+			  struct io_buffer *iobuf ) {
+	struct tftp_data *data = iobuf->data;
+	struct xfer_metadata meta;
+	int block;
+	off_t offset;
+	size_t data_len;
+	int rc;
+
+	/* Sanity check */
+	if ( iob_len ( iobuf ) < sizeof ( *data ) ) {
+		DBGC ( tftp, "TFTP %p received underlength DATA packet "
+		       "length %zd\n", tftp, iob_len ( iobuf ) );
+		rc = -EINVAL;
+		goto done;
+	}
+
+	/* Block numbers on the wire start at one.  A block number of
+	 * zero would produce a zero-based index of -1, and hence a
+	 * wrapped file offset and an out-of-range bitmap bit, so
+	 * reject it up front.
+	 */
+	if ( data->block == 0 ) {
+		DBGC ( tftp, "TFTP %p received data block 0\n", tftp );
+		rc = -EINVAL;
+		goto done;
+	}
+
+	/* Extract data */
+	block = ( ntohs ( data->block ) - 1 );
+	offset = ( block * tftp->blksize );
+	iob_pull ( iobuf, sizeof ( *data ) );
+	data_len = iob_len ( iobuf );
+	if ( data_len > tftp->blksize ) {
+		DBGC ( tftp, "TFTP %p received overlength DATA packet "
+		       "length %zd\n", tftp, data_len );
+		rc = -EINVAL;
+		goto done;
+	}
+
+	/* Deliver data */
+	memset ( &meta, 0, sizeof ( meta ) );
+	meta.whence = SEEK_SET;
+	meta.offset = offset;
+	rc = xfer_deliver_iob_meta ( &tftp->xfer, iobuf, &meta );
+	/* The buffer has been handed on; clear the pointer so that
+	 * the free_iob() at the end is not applied to it.
+	 */
+	iobuf = NULL;
+	if ( rc != 0 ) {
+		DBGC ( tftp, "TFTP %p could not deliver data: %s\n",
+		       tftp, strerror ( rc ) );
+		goto done;
+	}
+
+	/* Ensure block bitmap is ready */
+	if ( ( rc = tftp_presize ( tftp, ( offset + data_len ) ) ) != 0 )
+		goto done;
+
+	/* Mark block as received */
+	bitmap_set ( &tftp->bitmap, block );
+
+	/* Acknowledge block */
+	tftp_send_packet ( tftp );
+
+	/* If all blocks have been received, finish. */
+	if ( bitmap_full ( &tftp->bitmap ) )
+		tftp_done ( tftp, 0 );
+
+ done:
+	free_iob ( iobuf );
+	/* Any failure terminates the whole request */
+	if ( rc )
+		tftp_done ( tftp, rc );
+	return rc;
+}
+
+/** Translation between TFTP errors and internal error numbers */
+/* Indexed by TFTP error code; values are positive errno constants.
+ * Codes without an entry read as zero, which tftp_rx_error() maps
+ * to -ENOTSUP.
+ */
+static const int tftp_errors[] = {
+ [TFTP_ERR_FILE_NOT_FOUND] = ENOENT,
+ [TFTP_ERR_ACCESS_DENIED] = EACCES,
+ [TFTP_ERR_ILLEGAL_OP] = ENOTSUP,
+};
+
+/**
+ * Receive ERROR
+ *
+ * @v tftp TFTP connection
+ * @v buf Temporary data buffer
+ * @v len Length of temporary data buffer
+ * @ret rc Return status code
+ */
+static int tftp_rx_error ( struct tftp_request *tftp, void *buf, size_t len ) {
+	struct tftp_error *error = buf;
+	unsigned int code;
+	int rc = 0;
+
+	/* The packet must at least contain the fixed error header */
+	if ( len < sizeof ( *error ) ) {
+		DBGC ( tftp, "TFTP %p received underlength ERROR packet "
+		       "length %zd\n", tftp, len );
+		return -EINVAL;
+	}
+
+	DBGC ( tftp, "TFTP %p received ERROR packet with code %d, message "
+	       "\"%s\"\n", tftp, ntohs ( error->errcode ), error->errmsg );
+
+	/* Translate the TFTP error code via the lookup table.  Codes
+	 * beyond the table, or with no table entry, become -ENOTSUP.
+	 */
+	code = ntohs ( error->errcode );
+	if ( code < ( sizeof ( tftp_errors ) / sizeof ( tftp_errors[0] ) ) )
+		rc = -tftp_errors[code];
+	if ( rc == 0 )
+		rc = -ENOTSUP;
+
+	/* Terminate the request with the translated status.  The
+	 * ERROR packet itself was handled successfully, so zero is
+	 * returned to the caller.
+	 */
+	tftp_done ( tftp, rc );
+
+	return 0;
+}
+
+/**
+ * Receive new data
+ *
+ * @v tftp TFTP connection
+ * @v iobuf I/O buffer
+ * @v meta Transfer metadata, or NULL
+ * @ret rc Return status code
+ */
+static int tftp_rx ( struct tftp_request *tftp,
+ struct io_buffer *iobuf,
+ struct xfer_metadata *meta ) {
+ struct sockaddr_tcpip *st_src;
+ struct tftp_common *common = iobuf->data;
+ size_t len = iob_len ( iobuf );
+ /* Every early exit below reports the packet as invalid */
+ int rc = -EINVAL;
+
+ /* Sanity checks */
+ if ( len < sizeof ( *common ) ) {
+ DBGC ( tftp, "TFTP %p received underlength packet length "
+ "%zd\n", tftp, len );
+ goto done;
+ }
+ if ( ! meta ) {
+ DBGC ( tftp, "TFTP %p received packet without metadata\n",
+ tftp );
+ goto done;
+ }
+ if ( ! meta->src ) {
+ DBGC ( tftp, "TFTP %p received packet without source port\n",
+ tftp );
+ goto done;
+ }
+
+ /* Filter by TID. Set TID on first response received */
+ /* The whole socket address is compared, not only the port */
+ st_src = ( struct sockaddr_tcpip * ) meta->src;
+ if ( ! tftp->peer.st_family ) {
+ memcpy ( &tftp->peer, st_src, sizeof ( tftp->peer ) );
+ DBGC ( tftp, "TFTP %p using remote port %d\n", tftp,
+ ntohs ( tftp->peer.st_port ) );
+ } else if ( memcmp ( &tftp->peer, st_src,
+ sizeof ( tftp->peer ) ) != 0 ) {
+ DBGC ( tftp, "TFTP %p received packet from wrong source (got "
+ "%d, wanted %d)\n", tftp, ntohs ( st_src->st_port ),
+ ntohs ( tftp->peer.st_port ) );
+ goto done;
+ }
+
+ /* Dispatch on the opcode, compared in network byte order */
+ switch ( common->opcode ) {
+ case htons ( TFTP_OACK ):
+ rc = tftp_rx_oack ( tftp, iobuf->data, len );
+ break;
+ case htons ( TFTP_DATA ):
+ /* tftp_rx_data() takes ownership of the buffer, so clear
+ * the pointer to keep the free_iob() below away from it
+ */
+ rc = tftp_rx_data ( tftp, iobuf );
+ iobuf = NULL;
+ break;
+ case htons ( TFTP_ERROR ):
+ rc = tftp_rx_error ( tftp, iobuf->data, len );
+ break;
+ default:
+ DBGC ( tftp, "TFTP %p received strange packet type %d\n",
+ tftp, ntohs ( common->opcode ) );
+ break;
+ };
+
+ done:
+ free_iob ( iobuf );
+ return rc;
+}
+
+/**
+ * Receive new data via socket
+ *
+ * @v socket Transport layer interface
+ * @v iobuf I/O buffer
+ * @v meta Transfer metadata, or NULL
+ * @ret rc Return status code
+ */
+static int tftp_socket_deliver_iob ( struct xfer_interface *socket,
+ struct io_buffer *iobuf,
+ struct xfer_metadata *meta ) {
+ struct tftp_request *tftp =
+ container_of ( socket, struct tftp_request, socket );
+
+ /* Enable sending ACKs when we receive a unicast packet. This
+ * covers three cases:
+ *
+ * 1. Standard TFTP; we should always send ACKs, and will
+ * always receive a unicast packet before we need to send the
+ * first ACK.
+ *
+ * 2. RFC2090 multicast TFTP; the only unicast packets we will
+ * receive are the OACKs; enable sending ACKs here (before
+ * processing the OACK) and disable it when processing the
+ * multicast option if we are not the master client.
+ *
+ * 3. MTFTP; receiving a unicast datagram indicates that we
+ * are the "master client" and should send ACKs.
+ */
+ tftp->flags |= TFTP_FL_SEND_ACK;
+
+ /* Hand over to the receive path shared with the multicast socket */
+ return tftp_rx ( tftp, iobuf, meta );
+}
+
+/** TFTP socket operations */
+/* Operations table for tftp_request.socket.  Received I/O buffers go
+ * to tftp_socket_deliver_iob(); the remaining entries use handlers
+ * defined outside this file.
+ */
+static struct xfer_interface_operations tftp_socket_operations = {
+ .close = ignore_xfer_close,
+ .vredirect = xfer_vopen,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = tftp_socket_deliver_iob,
+ .deliver_raw = xfer_deliver_as_iob,
+};
+
+/**
+ * Receive new data via multicast socket
+ *
+ * @v mc_socket Multicast transport layer interface
+ * @v iobuf I/O buffer
+ * @v meta Transfer metadata, or NULL
+ * @ret rc Return status code
+ */
+static int tftp_mc_socket_deliver_iob ( struct xfer_interface *mc_socket,
+ struct io_buffer *iobuf,
+ struct xfer_metadata *meta ) {
+ struct tftp_request *tftp =
+ container_of ( mc_socket, struct tftp_request, mc_socket );
+
+ /* Unlike the unicast handler, this does not set
+ * TFTP_FL_SEND_ACK before processing the packet
+ */
+ return tftp_rx ( tftp, iobuf, meta );
+}
+
+/** TFTP multicast socket operations */
+/* Operations table for tftp_request.mc_socket.  Received I/O buffers
+ * go to tftp_mc_socket_deliver_iob(); the remaining entries use
+ * handlers defined outside this file.
+ */
+static struct xfer_interface_operations tftp_mc_socket_operations = {
+ .close = ignore_xfer_close,
+ .vredirect = xfer_vopen,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = tftp_mc_socket_deliver_iob,
+ .deliver_raw = xfer_deliver_as_iob,
+};
+
+/**
+ * Close TFTP data transfer interface
+ *
+ * @v xfer Data transfer interface
+ * @v rc Reason for close
+ */
+static void tftp_xfer_close ( struct xfer_interface *xfer, int rc ) {
+ struct tftp_request *tftp =
+ container_of ( xfer, struct tftp_request, xfer );
+
+ DBGC ( tftp, "TFTP %p interface closed: %s\n",
+ tftp, strerror ( rc ) );
+
+ /* Closing the data transfer interface ends the whole request,
+ * with the status passed through unchanged
+ */
+ tftp_done ( tftp, rc );
+}
+
+/** TFTP data transfer interface operations */
+/* Operations table for tftp_request.xfer.  Closure is handled by
+ * tftp_xfer_close(); the remaining entries use handlers defined
+ * outside this file.
+ */
+static struct xfer_interface_operations tftp_xfer_operations = {
+ .close = tftp_xfer_close,
+ .vredirect = ignore_xfer_vredirect,
+ .window = unlimited_xfer_window,
+ .alloc_iob = default_xfer_alloc_iob,
+ .deliver_iob = xfer_deliver_as_raw,
+ .deliver_raw = ignore_xfer_deliver_raw,
+};
+
+/**
+ * Initiate TFTP/TFTM/MTFTP download
+ *
+ * @v xfer Data transfer interface
+ * @v uri Uniform Resource Identifier
+ * @v default_port Port passed to uri_port() alongside the URI
+ * @v multicast Local address for multicast socket, or NULL for none
+ * @v flags Initial request flags (TFTP_FL_XXX)
+ * @ret rc Return status code
+ */
+static int tftp_core_open ( struct xfer_interface *xfer, struct uri *uri,
+ unsigned int default_port,
+ struct sockaddr *multicast,
+ unsigned int flags ) {
+ struct tftp_request *tftp;
+ int rc;
+
+ /* Sanity checks */
+ if ( ! uri->host )
+ return -EINVAL;
+ if ( ! uri->path )
+ return -EINVAL;
+
+ /* Allocate and populate TFTP structure */
+ tftp = zalloc ( sizeof ( *tftp ) );
+ if ( ! tftp )
+ return -ENOMEM;
+ tftp->refcnt.free = tftp_free;
+ xfer_init ( &tftp->xfer, &tftp_xfer_operations, &tftp->refcnt );
+ tftp->uri = uri_get ( uri );
+ xfer_init ( &tftp->socket, &tftp_socket_operations, &tftp->refcnt );
+ xfer_init ( &tftp->mc_socket, &tftp_mc_socket_operations,
+ &tftp->refcnt );
+ tftp->blksize = TFTP_DEFAULT_BLKSIZE;
+ tftp->flags = flags;
+ tftp->timer.expired = tftp_timer_expired;
+
+ /* Open socket */
+ tftp->port = uri_port ( tftp->uri, default_port );
+ if ( ( rc = tftp_reopen ( tftp ) ) != 0 )
+ goto err;
+
+ /* Open multicast socket */
+ if ( multicast ) {
+ if ( ( rc = tftp_reopen_mc ( tftp, multicast ) ) != 0 )
+ goto err;
+ }
+
+ /* Start timer to initiate RRQ */
+ /* Nothing is sent from here directly; the first RRQ is sent
+ * from tftp_timer_expired()
+ */
+ start_timer_nodelay ( &tftp->timer );
+
+ /* Attach to parent interface, mortalise self, and return */
+ xfer_plug_plug ( &tftp->xfer, xfer );
+ ref_put ( &tftp->refcnt );
+ return 0;
+
+ err:
+ DBGC ( tftp, "TFTP %p could not create request: %s\n",
+ tftp, strerror ( rc ) );
+ /* Shut down whatever was opened, then drop the reference held
+ * since allocation
+ */
+ tftp_done ( tftp, rc );
+ ref_put ( &tftp->refcnt );
+ return rc;
+}
+
+/**
+ * Open a plain TFTP download
+ *
+ * Thin wrapper around tftp_core_open(): passes TFTP_PORT as the
+ * default port, supplies no multicast address, and requests only the
+ * TFTP_FL_RRQ_SIZES flag.
+ *
+ * @v xfer		Data transfer interface
+ * @v uri		Uniform Resource Identifier
+ * @ret rc		Return status code
+ */
+static int tftp_open ( struct xfer_interface *xfer, struct uri *uri ) {
+	const unsigned int rrq_flags = TFTP_FL_RRQ_SIZES;
+
+	return tftp_core_open ( xfer, uri, TFTP_PORT, NULL, rrq_flags );
+}
+
+/**
+ * TFTP URI opener
+ *
+ * Pairs the "tftp" scheme string with tftp_open() as its open handler.
+ */
+struct uri_opener tftp_uri_opener __uri_opener = {
+ .scheme = "tftp",
+ .open = tftp_open,
+};
+
+/**
+ * Open a TFTM download
+ *
+ * Thin wrapper around tftp_core_open(): passes TFTP_PORT as the
+ * default port and supplies no multicast address up front, but sets
+ * TFTP_FL_RRQ_MULTICAST in addition to TFTP_FL_RRQ_SIZES.
+ *
+ * @v xfer		Data transfer interface
+ * @v uri		Uniform Resource Identifier
+ * @ret rc		Return status code
+ */
+static int tftm_open ( struct xfer_interface *xfer, struct uri *uri ) {
+	const unsigned int rrq_flags =
+		( TFTP_FL_RRQ_SIZES | TFTP_FL_RRQ_MULTICAST );
+
+	return tftp_core_open ( xfer, uri, TFTP_PORT, NULL, rrq_flags );
+}
+
+/**
+ * TFTM URI opener
+ *
+ * Pairs the "tftm" scheme string with tftm_open() as its open handler.
+ */
+struct uri_opener tftm_uri_opener __uri_opener = {
+ .scheme = "tftm",
+ .open = tftm_open,
+};
+
+/**
+ * Open an MTFTP download
+ *
+ * Thin wrapper around tftp_core_open(): passes MTFTP_PORT as the
+ * default port, the address of tftp_mtftp_socket as the multicast
+ * address, and the TFTP_FL_MTFTP_RECOVERY flag.
+ *
+ * @v xfer		Data transfer interface
+ * @v uri		Uniform Resource Identifier
+ * @ret rc		Return status code
+ */
+static int mtftp_open ( struct xfer_interface *xfer, struct uri *uri ) {
+	struct sockaddr *mc_addr = ( struct sockaddr * ) &tftp_mtftp_socket;
+
+	return tftp_core_open ( xfer, uri, MTFTP_PORT, mc_addr,
+				TFTP_FL_MTFTP_RECOVERY );
+}
+
+/**
+ * MTFTP URI opener
+ *
+ * Pairs the "mtftp" scheme string with mtftp_open() as its open handler.
+ */
+struct uri_opener mtftp_uri_opener __uri_opener = {
+ .scheme = "mtftp",
+ .open = mtftp_open,
+};
+
+/******************************************************************************
+ *
+ * Settings
+ *
+ ******************************************************************************
+ */
+
+/**
+ * TFTP server setting
+ *
+ * Named "next-server", tagged DHCP_EB_SIADDR and typed as
+ * setting_type_ipv4. tftp_apply_settings() fetches this setting to
+ * detect a change of TFTP server.
+ */
+struct setting next_server_setting __setting = {
+ .name = "next-server",
+ .description = "TFTP server",
+ .tag = DHCP_EB_SIADDR,
+ .type = &setting_type_ipv4,
+};
+
+/**
+ * Apply TFTP configuration settings
+ *
+ * Fetches the "next-server" setting and, if its value differs from
+ * the one seen on the previous call, points the current working URI
+ * at "tftp://<server>/".
+ *
+ * @ret rc		Return status code
+ */
+static int tftp_apply_settings ( void ) {
+	static struct in_addr tftp_server = { 0 };
+	const struct in_addr previous = tftp_server;
+	char uri_string[32];
+	struct uri *cwuri;
+
+	/* Refresh the remembered TFTP server from the settings */
+	fetch_ipv4_setting ( NULL, &next_server_setting, &tftp_server );
+
+	/* Leave the current working URI alone unless the TFTP server
+	 * itself has changed.  Every settings applicator is triggered
+	 * whenever any setting is updated, and the user probably
+	 * would not expect the CWURI to move just because an
+	 * unrelated setting was touched.
+	 */
+	if ( tftp_server.s_addr == previous.s_addr )
+		return 0;
+
+	/* Build the new working URI and switch to it */
+	snprintf ( uri_string, sizeof ( uri_string ),
+		   "tftp://%s/", inet_ntoa ( tftp_server ) );
+	cwuri = parse_uri ( uri_string );
+	if ( ! cwuri )
+		return -ENOMEM;
+	churi ( cwuri );
+	uri_put ( cwuri );
+
+	return 0;
+}
+
+/**
+ * TFTP settings applicator
+ *
+ * Names tftp_apply_settings() as the .apply handler. The
+ * __settings_applicator marker is defined outside this part of the
+ * file (presumably it places this entry in a table of applicators -
+ * confirm against the settings header).
+ */
+struct settings_applicator tftp_settings_applicator __settings_applicator = {
+ .apply = tftp_apply_settings,
+};