diff mbox

[RFC,1/3] linux-user: add rtnetlink(7) support

Message ID 1454192820-5095-2-git-send-email-laurent@vivier.eu (mailing list archive)
State New, archived
Headers show

Commit Message

Laurent Vivier Jan. 30, 2016, 10:26 p.m. UTC
rtnetlink is needed to use iproute package (ip addr, ip route)
and dhcp client.

Examples:

Without this patch:
    # ip link
    Cannot open netlink socket: Address family not supported by protocol
    # ip addr
    Cannot open netlink socket: Address family not supported by protocol
    # ip route
    Cannot open netlink socket: Address family not supported by protocol
    # dhclient eth0
    Cannot open netlink socket: Address family not supported by protocol
    Cannot open netlink socket: Address family not supported by protocol

With this patch:
    # ip link
    1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT
        link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    51: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP mode DEFAULT qlen 1000
        link/ether 00:16:3e:89:6b:d7 brd ff:ff:ff:ff:ff:ff
    # ip addr show eth0
    51: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP qlen 1000
        link/ether 00:16:3e:89:6b:d7 brd ff:ff:ff:ff:ff:ff
        inet 192.168.122.197/24 brd 192.168.122.255 scope global eth0
           valid_lft forever preferred_lft forever
        inet6 fe80::216:3eff:fe89:6bd7/64 scope link
           valid_lft forever preferred_lft forever
    # ip route
    default via 192.168.122.1 dev eth0
    192.168.122.0/24 dev eth0  proto kernel  scope link  src 192.168.122.197
    # ip addr flush eth0
    # ip addr add 192.168.122.10 dev eth0
    # ip addr show eth0
    51: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP qlen 1000
        link/ether 00:16:3e:89:6b:d7 brd ff:ff:ff:ff:ff:ff
        inet 192.168.122.10/32 scope global eth0
           valid_lft forever preferred_lft forever
    # ip route add 192.168.122.0/24 via 192.168.122.10
    # ip route
        192.168.122.0/24 via 192.168.122.10 dev eth0

Signed-off-by: Laurent Vivier <laurent@vivier.eu>
---
 linux-user/syscall.c | 477 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 471 insertions(+), 6 deletions(-)

Comments

Peter Maydell May 13, 2016, 4:40 p.m. UTC | #1
On 30 January 2016 at 22:26, Laurent Vivier <laurent@vivier.eu> wrote:
> rtnetlink is needed to use iproute package (ip addr, ip route)
> and dhcp client.
>
> Examples:
>
> Without this patch:
>     # ip link
>     Cannot open netlink socket: Address family not supported by protocol
>     # ip addr
>     Cannot open netlink socket: Address family not supported by protocol
>     # ip route
>     Cannot open netlink socket: Address family not supported by protocol
>     # dhclient eth0
>     Cannot open netlink socket: Address family not supported by protocol
>     Cannot open netlink socket: Address family not supported by protocol
>
> With this patch:
>     # ip link
>     1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT
>         link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
>     51: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP mode DEFAULT qlen 1000
>         link/ether 00:16:3e:89:6b:d7 brd ff:ff:ff:ff:ff:ff
>     # ip addr show eth0
>     51: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP qlen 1000
>         link/ether 00:16:3e:89:6b:d7 brd ff:ff:ff:ff:ff:ff
>         inet 192.168.122.197/24 brd 192.168.122.255 scope global eth0
>            valid_lft forever preferred_lft forever
>         inet6 fe80::216:3eff:fe89:6bd7/64 scope link
>            valid_lft forever preferred_lft forever
>     # ip route
>     default via 192.168.122.1 dev eth0
>     192.168.122.0/24 dev eth0  proto kernel  scope link  src 192.168.122.197
>     # ip addr flush eth0
>     # ip addr add 192.168.122.10 dev eth0
>     # ip addr show eth0
>     51: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP qlen 1000
>         link/ether 00:16:3e:89:6b:d7 brd ff:ff:ff:ff:ff:ff
>         inet 192.168.122.10/32 scope global eth0
>            valid_lft forever preferred_lft forever
>     # ip route add 192.168.122.0/24 via 192.168.122.10
>     # ip route
>         192.168.122.0/24 via 192.168.122.10 dev eth0
>
> Signed-off-by: Laurent Vivier <laurent@vivier.eu>
> ---
>  linux-user/syscall.c | 477 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 471 insertions(+), 6 deletions(-)

Apologies for not getting to this patch earlier. Mostly it looks
OK but I have a few questions/suggestions below.

> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index dac5518..a1ed2f5 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -100,6 +100,8 @@ int __clone2(int (*fn)(void *), void *child_stack_base,
>  #include <linux/filter.h>
>  #include <linux/blkpg.h>
>  #include "linux_loop.h"
> +#include <linux/netlink.h>
> +#include <linux/rtnetlink.h>
>  #include "uname.h"
>
>  #include "qemu.h"
> @@ -295,6 +297,14 @@ static TargetFdTrans **target_fd_trans;
>
>  static unsigned int target_fd_max;
>
> +static TargetFdDataFunc fd_trans_target_to_host_data(int fd)
> +{
> +    if (fd >= 0 && fd < target_fd_max && target_fd_trans[fd]) {
> +        return target_fd_trans[fd]->target_to_host_data;
> +    }
> +    return NULL;
> +}
> +
>  static TargetFdDataFunc fd_trans_host_to_target_data(int fd)
>  {
>      if (fd >= 0 && fd < target_fd_max && target_fd_trans[fd]) {
> @@ -1222,6 +1232,11 @@ static inline abi_long host_to_target_sockaddr(abi_ulong target_addr,
>          return -TARGET_EFAULT;
>      memcpy(target_saddr, addr, len);
>      target_saddr->sa_family = tswap16(addr->sa_family);
> +    if (addr->sa_family == AF_NETLINK) {
> +        struct sockaddr_nl *target_nl = (struct sockaddr_nl *)target_saddr;
> +        target_nl->nl_pid = tswap32(target_nl->nl_pid);
> +        target_nl->nl_groups = tswap32(target_nl->nl_groups);
> +    }
>      unlock_user(target_saddr, target_addr, len);
>
>      return 0;
> @@ -1453,6 +1468,416 @@ static inline abi_long host_to_target_cmsg(struct target_msghdr *target_msgh,
>      return 0;
>  }
>
> +static void tswap_nlmsghdr(struct nlmsghdr *nlh)
> +{
> +    nlh->nlmsg_len = tswap32(nlh->nlmsg_len);
> +    nlh->nlmsg_type = tswap16(nlh->nlmsg_type);
> +    nlh->nlmsg_flags = tswap16(nlh->nlmsg_flags);
> +    nlh->nlmsg_seq = tswap32(nlh->nlmsg_seq);
> +    nlh->nlmsg_pid = tswap32(nlh->nlmsg_pid);
> +}
> +
> +static abi_long host_to_target_for_each_nlmsg(struct nlmsghdr *nlh,
> +                                              size_t len,
> +                                              abi_long (*host_to_target_nlmsg)
> +                                                       (struct nlmsghdr *))
> +{
> +    uint32_t nlmsg_len;
> +    abi_long ret;
> +
> +    while (len > (int)sizeof(struct nlmsghdr)) {

What is this cast to int for ?

> +
> +        nlmsg_len = nlh->nlmsg_len;
> +        if (nlmsg_len < sizeof(struct nlmsghdr) ||
> +            nlmsg_len > len) {
> +            break;
> +        }
> +
> +        if (nlh->nlmsg_type == NLMSG_DONE) {
> +            tswap_nlmsghdr(nlh);
> +            break;
> +        }

Why not put this one inside the switch too? (you'd just need to
change the 'break' to 'return 0'.)

> +        switch (nlh->nlmsg_type) {
> +        case NLMSG_NOOP:
> +            break;
> +        case NLMSG_ERROR: {

Odd brace placement.

> +                struct nlmsgerr *e = NLMSG_DATA(nlh);
> +                e->error = tswap32(e->error);
> +                tswap_nlmsghdr(&e->msg);
> +                tswap_nlmsghdr(nlh);
> +            }
> +            return 0;
> +        default:
> +            ret = host_to_target_nlmsg(nlh);
> +            if (ret < 0) {
> +                tswap_nlmsghdr(nlh);
> +                return ret;
> +            }
> +            break;
> +        }
> +        tswap_nlmsghdr(nlh);
> +        len -= NLMSG_ALIGN(nlmsg_len);
> +        nlh = (struct nlmsghdr *)(((char*)nlh) + NLMSG_ALIGN(nlmsg_len));
> +    }
> +    return 0;
> +}
> +
> +static abi_long target_to_host_for_each_nlmsg(struct nlmsghdr *nlh,
> +                                              size_t len,
> +                                              abi_long (*target_to_host_nlmsg)
> +                                                       (struct nlmsghdr *))
> +{
> +    int ret;
> +
> +    while (len > (int)sizeof(struct nlmsghdr)) {
> +        if (tswap32(nlh->nlmsg_len) < sizeof(struct nlmsghdr) ||
> +            tswap32(nlh->nlmsg_len) > len) {
> +            break;
> +        }
> +        tswap_nlmsghdr(nlh);
> +        if (nlh->nlmsg_type == NLMSG_DONE) {
> +            break;
> +        }
> +        switch (nlh->nlmsg_type) {
> +        case NLMSG_NOOP:
> +            break;
> +        case NLMSG_ERROR: {
> +                struct nlmsgerr *e = NLMSG_DATA(nlh);
> +                e->error = tswap32(e->error);
> +                tswap_nlmsghdr(&e->msg);
> +            }
> +        default:
> +            ret = target_to_host_nlmsg(nlh);
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }
> +        len -= NLMSG_ALIGN(nlh->nlmsg_len);
> +        nlh = (struct nlmsghdr *)(((char *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));

You could use 'nlh = NLMSG_NEXT(nlh, len);' here rather than
manually adjusting len and nlh (though maybe you prefer the
parallel with the host_to_target function, and there we can't
use NLMSG_NEXT).

> +    }
> +    return 0;
> +}
> +
> +static abi_long host_to_target_for_each_rtattr(struct rtattr *rtattr,
> +                                               size_t len,
> +                                               abi_long (*host_to_target_rtattr)
> +                                                        (struct rtattr *))
> +{
> +    unsigned short rta_len;
> +    abi_long ret;
> +
> +    while (len > (int)sizeof(struct rtattr)) {
> +        rta_len = rtattr->rta_len;
> +        if (rta_len < sizeof(struct rtattr) ||
> +            rta_len > len) {
> +            rtattr->rta_len = tswap16(rtattr->rta_len);
> +            rtattr->rta_type = tswap16(rtattr->rta_type);

In the code above for iterating nlmsgs, we don't try to
touch the fields if the length claims to be out of bounds;
here we do. Why the difference ?

> +            break;
> +        }
> +        ret = host_to_target_rtattr(rtattr);
> +        rtattr->rta_len = tswap16(rtattr->rta_len);
> +        rtattr->rta_type = tswap16(rtattr->rta_type);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        len -= RTA_ALIGN(rta_len);
> +        rtattr = (struct rtattr *)(((char *)rtattr) + RTA_ALIGN(rta_len));
> +    }
> +    return 0;
> +}
> +
> +static abi_long host_to_target_data_link_rtattr(struct rtattr *rtattr)
> +{
> +    uint32_t *u32;
> +
> +    switch (rtattr->rta_type) {
> +    case IFLA_ADDRESS:
> +    case IFLA_BROADCAST:
> +    case IFLA_IFNAME:

Comments here about the types we are converting (or not) would be nice.

> +        break;
> +    case IFLA_MTU:
> +    case IFLA_LINK:
> +    case IFLA_WEIGHT:
> +    case IFLA_TXQLEN:
> +    case IFLA_CARRIER_CHANGES:
> +    case IFLA_NUM_RX_QUEUES:
> +    case IFLA_NUM_TX_QUEUES:
> +    case IFLA_PROMISCUITY:
> +    case IFLA_EXT_MASK:
> +    case IFLA_LINK_NETNSID:
> +    case IFLA_GROUP:
> +    case IFLA_MASTER:
> +    case IFLA_NUM_VF:
> +    case IFLA_STATS:
> +        u32 = RTA_DATA(rtattr);
> +        *u32 = tswap32(*u32);
> +        break;
> +    case IFLA_QDISC:
> +    case IFLA_COST:
> +    case IFLA_MAP:
> +    case IFLA_OPERSTATE:
> +    case IFLA_LINKMODE:
> +    case IFLA_CARRIER:
> +    case IFLA_STATS64:
> +    case IFLA_AF_SPEC:
> +    case IFLA_LINKINFO:

What order are these cases in (ie why is this set of "do nothing"
different from the ones at the top of the switch) ?

> +        break;
> +    default:

Should we log something about an unsupported IFLA type here?

> +        break;


Looking at
http://lxr.free-electrons.com/source/net/core/rtnetlink.c
I'm not sure all of these are right. I think IFLA_MAP means we have a
pointer to a struct rtnl_link_ifmap, for instance, which would need
some of its fields swapping. IFLA_STATS is an rtnl_link_stats
struct, and so on.

> +    }
> +    return 0;
> +}
> +
> +static abi_long host_to_target_data_addr_rtattr(struct rtattr *rtattr)
> +{
> +    uint32_t *u32;
> +    struct ifa_cacheinfo *ci;
> +
> +    switch (rtattr->rta_type) {
> +    case IFA_FLAGS:
> +        u32 = RTA_DATA(rtattr);
> +        *u32 = tswap32(*u32);
> +        break;
> +    case IFA_LOCAL:
> +    case IFA_ADDRESS:
> +    case IFA_BROADCAST:
> +    case IFA_LABEL:
> +        break;
> +    case IFA_CACHEINFO:
> +        ci = RTA_DATA(rtattr);
> +        ci->ifa_prefered = tswap32(ci->ifa_prefered);
> +        ci->ifa_valid = tswap32(ci->ifa_valid);
> +        ci->cstamp = tswap32(ci->cstamp);
> +        ci->tstamp = tswap32(ci->tstamp);
> +        break;
> +    default:
> +        break;

Log about unsupported values?

> +    }
> +    return 0;
> +}
> +
> +static abi_long host_to_target_data_route_rtattr(struct rtattr *rtattr)
> +{
> +    uint32_t *u32;
> +    switch (rtattr->rta_type) {
> +    case RTA_GATEWAY:
> +    case RTA_DST:
> +    case RTA_PREFSRC:
> +        break;
> +    case RTA_PRIORITY:
> +    case RTA_TABLE:
> +    case RTA_OIF:
> +        u32 = RTA_DATA(rtattr);
> +        *u32 = tswap32(*u32);
> +        break;

This doesn't seem to be the complete list of RTA_ values; what
decides which ones are listed here?

> +    default:
> +        break;
> +    }
> +    return 0;
> +}
> +
> +static abi_long host_to_target_link_rtattr(struct rtattr *rtattr,
> +                                         uint32_t rtattr_len)
> +{
> +    return host_to_target_for_each_rtattr(rtattr, rtattr_len,
> +                                          host_to_target_data_link_rtattr);
> +}
> +
> +static abi_long host_to_target_addr_rtattr(struct rtattr *rtattr,
> +                                         uint32_t rtattr_len)
> +{
> +    return host_to_target_for_each_rtattr(rtattr, rtattr_len,
> +                                          host_to_target_data_addr_rtattr);
> +}
> +
> +static abi_long host_to_target_route_rtattr(struct rtattr *rtattr,
> +                                         uint32_t rtattr_len)
> +{
> +    return host_to_target_for_each_rtattr(rtattr, rtattr_len,
> +                                          host_to_target_data_route_rtattr);
> +}
> +
> +static abi_long host_to_target_data_route(struct nlmsghdr *nlh)
> +{
> +    uint32_t nlmsg_len;
> +    struct ifinfomsg *ifi;
> +    struct ifaddrmsg *ifa;
> +    struct rtmsg *rtm;
> +
> +    nlmsg_len = nlh->nlmsg_len;
> +    switch (nlh->nlmsg_type) {
> +    case RTM_NEWLINK:
> +    case RTM_DELLINK:
> +    case RTM_GETLINK:
> +        ifi = NLMSG_DATA(nlh);
> +        ifi->ifi_type = tswap16(ifi->ifi_type);
> +        ifi->ifi_index = tswap32(ifi->ifi_index);
> +        ifi->ifi_flags = tswap32(ifi->ifi_flags);
> +        ifi->ifi_change = tswap32(ifi->ifi_change);
> +        host_to_target_link_rtattr(IFLA_RTA(ifi),
> +                                   nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)));
> +        break;
> +    case RTM_NEWADDR:
> +    case RTM_DELADDR:
> +    case RTM_GETADDR:
> +        ifa = NLMSG_DATA(nlh);
> +        ifa->ifa_index = tswap32(ifa->ifa_index);
> +        host_to_target_addr_rtattr(IFA_RTA(ifa),
> +                                   nlmsg_len - NLMSG_LENGTH(sizeof(*ifa)));
> +        break;
> +    case RTM_NEWROUTE:
> +    case RTM_DELROUTE:
> +    case RTM_GETROUTE:
> +        rtm = NLMSG_DATA(nlh);
> +        rtm->rtm_flags = tswap32(rtm->rtm_flags);
> +        host_to_target_route_rtattr(RTM_RTA(rtm),
> +                                    nlmsg_len - NLMSG_LENGTH(sizeof(*rtm)));
> +        break;
> +    default:
> +        return -TARGET_EINVAL;
> +    }
> +    return 0;
> +}
> +
> +static inline abi_long host_to_target_nlmsg_route(struct nlmsghdr *nlh,
> +                                                  size_t len)
> +{
> +    return host_to_target_for_each_nlmsg(nlh, len, host_to_target_data_route);
> +}
> +
> +static abi_long target_to_host_for_each_rtattr(struct rtattr *rtattr,
> +                                               size_t len,
> +                                               abi_long (*target_to_host_rtattr)
> +                                                        (struct rtattr *))
> +{
> +    abi_long ret;
> +
> +    while (len >= (int)sizeof(struct rtattr)) {
> +        rtattr->rta_len = tswap16(rtattr->rta_len);
> +        rtattr->rta_type = tswap16(rtattr->rta_type);
> +        if (rtattr->rta_len < sizeof(struct rtattr) ||
> +            rtattr->rta_len > len) {
> +            break;
> +        }

Length check after swap of len/type again.

> +        ret = target_to_host_rtattr(rtattr);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        len -= RTA_ALIGN(rtattr->rta_len);
> +        rtattr = (struct rtattr *)(((char *)rtattr) +
> +                 RTA_ALIGN(rtattr->rta_len));
> +    }
> +    return 0;
> +}
> +
> +static abi_long target_to_host_data_link_rtattr(struct rtattr *rtattr)
> +{
> +    switch (rtattr->rta_type) {
> +    default:
> +        break;
> +    }
> +    return 0;
> +}
> +
> +static abi_long target_to_host_data_addr_rtattr(struct rtattr *rtattr)
> +{
> +    switch (rtattr->rta_type) {
> +    case IFA_LOCAL:
> +    case IFA_ADDRESS:
> +        break;
> +    default:
> +        break;
> +    }
> +    return 0;
> +}
> +
> +static abi_long target_to_host_data_route_rtattr(struct rtattr *rtattr)
> +{
> +    uint32_t *u32;
> +    switch (rtattr->rta_type) {
> +    case RTA_DST:
> +    case RTA_SRC:
> +    case RTA_GATEWAY:
> +        /* nothing to do */
> +        break;
> +    case RTA_OIF:
> +        u32 = RTA_DATA(rtattr);
> +        *u32 = tswap32(*u32);
> +        break;
> +    default:
> +        break;
> +    }
> +    return 0;
> +}
> +
> +static void target_to_host_link_rtattr(struct rtattr *rtattr,
> +                                       uint32_t rtattr_len)
> +{
> +    target_to_host_for_each_rtattr(rtattr, rtattr_len,
> +                                   target_to_host_data_link_rtattr);
> +}
> +
> +static void target_to_host_addr_rtattr(struct rtattr *rtattr,
> +                                     uint32_t rtattr_len)
> +{
> +    target_to_host_for_each_rtattr(rtattr, rtattr_len,
> +                                   target_to_host_data_addr_rtattr);
> +}
> +
> +static void target_to_host_route_rtattr(struct rtattr *rtattr,
> +                                     uint32_t rtattr_len)
> +{
> +    target_to_host_for_each_rtattr(rtattr, rtattr_len,
> +                                   target_to_host_data_route_rtattr);
> +}
> +
> +static abi_long target_to_host_data_route(struct nlmsghdr *nlh)
> +{
> +    struct ifinfomsg *ifi;
> +    struct ifaddrmsg *ifa;
> +    struct rtmsg *rtm;
> +
> +    switch (nlh->nlmsg_type) {
> +    case RTM_GETLINK:
> +        break;
> +    case RTM_NEWLINK:
> +    case RTM_DELLINK:
> +        ifi = NLMSG_DATA(nlh);
> +        ifi->ifi_type = tswap16(ifi->ifi_type);
> +        ifi->ifi_index = tswap32(ifi->ifi_index);
> +        ifi->ifi_flags = tswap32(ifi->ifi_flags);
> +        ifi->ifi_change = tswap32(ifi->ifi_change);
> +        target_to_host_link_rtattr(IFLA_RTA(ifi), nlh->nlmsg_len -
> +                                   NLMSG_LENGTH(sizeof(*ifi)));
> +        break;
> +    case RTM_GETADDR:
> +    case RTM_NEWADDR:
> +    case RTM_DELADDR:
> +        ifa = NLMSG_DATA(nlh);
> +        ifa->ifa_index = tswap32(ifa->ifa_index);
> +        target_to_host_addr_rtattr(IFA_RTA(ifa), nlh->nlmsg_len -
> +                                   NLMSG_LENGTH(sizeof(*ifa)));
> +        break;
> +    case RTM_GETROUTE:
> +        break;
> +    case RTM_NEWROUTE:
> +    case RTM_DELROUTE:
> +        rtm = NLMSG_DATA(nlh);
> +        rtm->rtm_flags = tswap32(rtm->rtm_flags);
> +        target_to_host_route_rtattr(RTM_RTA(rtm), nlh->nlmsg_len -
> +                                    NLMSG_LENGTH(sizeof(*rtm)));
> +        break;
> +    default:
> +        return -TARGET_EOPNOTSUPP;
> +    }
> +    return 0;
> +}
> +
> +static abi_long target_to_host_nlmsg_route(struct nlmsghdr *nlh, size_t len)
> +{
> +    return target_to_host_for_each_nlmsg(nlh, len, target_to_host_data_route);
> +}
> +
>  /* do_setsockopt() Must return target values and target errnos. */
>  static abi_long do_setsockopt(int sockfd, int level, int optname,
>                                abi_ulong optval_addr, socklen_t optlen)
> @@ -2103,6 +2528,21 @@ static TargetFdTrans target_packet_trans = {
>      .target_to_host_addr = packet_target_to_host_sockaddr,
>  };
>
> +static abi_long netlink_route_target_to_host(void *buf, size_t len)
> +{
> +    return target_to_host_nlmsg_route(buf, len);
> +}
> +
> +static abi_long netlink_route_host_to_target(void *buf, size_t len)
> +{
> +    return host_to_target_nlmsg_route(buf, len);
> +}
> +
> +static TargetFdTrans target_netlink_route_trans = {
> +    .target_to_host_data = netlink_route_target_to_host,
> +    .host_to_target_data = netlink_route_host_to_target,
> +};
> +
>  /* do_socket() Must return target values and target errnos. */
>  static abi_long do_socket(int domain, int type, int protocol)
>  {
> @@ -2114,9 +2554,6 @@ static abi_long do_socket(int domain, int type, int protocol)
>          return ret;
>      }
>
> -    if (domain == PF_NETLINK)
> -        return -TARGET_EAFNOSUPPORT;
> -
>      if (domain == AF_PACKET ||
>          (domain == AF_INET && type == SOCK_PACKET)) {
>          protocol = tswap16(protocol);
> @@ -2130,6 +2567,16 @@ static abi_long do_socket(int domain, int type, int protocol)
>               * if socket type is SOCK_PACKET, bind by name
>               */
>              fd_trans_register(ret, &target_packet_trans);
> +        } else if (domain == PF_NETLINK) {
> +            switch (protocol) {
> +            case NETLINK_ROUTE:
> +                fd_trans_register(ret, &target_netlink_route_trans);
> +                break;
> +            default:
> +                close(ret);
> +                ret = -EPFNOSUPPORT;

Can we handle the "PF_NETLINK but unsupported protocol" check
before we try to open the host socket, rather than having to
back it out here?

> +                break;
> +            }
>          }
>      }
>      return ret;
> @@ -2214,14 +2661,25 @@ static abi_long do_sendrecvmsg_locked(int fd, struct target_msghdr *msgp,
>      msg.msg_iov = vec;
>
>      if (send) {
> -        ret = target_to_host_cmsg(&msg, msgp);
> -        if (ret == 0)
> +        if (fd_trans_target_to_host_data(fd)) {
> +            ret = fd_trans_target_to_host_data(fd)(msg.msg_iov->iov_base,
> +                                                   msg.msg_iov->iov_len);
> +        } else {
> +            ret = target_to_host_cmsg(&msg, msgp);
> +        }
> +        if (ret == 0) {
>              ret = get_errno(sendmsg(fd, &msg, flags));
> +        }
>      } else {
>          ret = get_errno(recvmsg(fd, &msg, flags));
>          if (!is_error(ret)) {
>              len = ret;
> -            ret = host_to_target_cmsg(msgp, &msg);
> +            if (fd_trans_host_to_target_data(fd)) {
> +                ret = fd_trans_host_to_target_data(fd)(msg.msg_iov->iov_base,
> +                                                       msg.msg_iov->iov_len);
> +            } else {
> +                ret = host_to_target_cmsg(msgp, &msg);
> +            }
>              if (!is_error(ret)) {
>                  msgp->msg_namelen = tswap32(msg.msg_namelen);
>                  if (msg.msg_name != NULL) {
> @@ -2448,6 +2906,13 @@ static abi_long do_sendto(int fd, abi_ulong msg, size_t len, int flags,
>      host_msg = lock_user(VERIFY_READ, msg, len, 1);
>      if (!host_msg)
>          return -TARGET_EFAULT;
> +    if (fd_trans_target_to_host_data(fd)) {
> +        ret = fd_trans_target_to_host_data(fd)(host_msg, len);
> +        if (ret < 0) {
> +            unlock_user(host_msg, msg, 0);
> +            return ret;
> +        }
> +    }
>      if (target_addr) {
>          addr = alloca(addrlen+1);
>          ret = target_to_host_sockaddr(fd, addr, target_addr, addrlen);
> --
> 2.5.0

This felt like a pretty long patch to wade through on review; if
there's an easy way to split it up that would be nice for v2,
but I don't insist on it if there doesn't seem to be a nice
splitting point.

thanks
-- PMM
Laurent Vivier May 14, 2016, 9:37 a.m. UTC | #2
Le 13/05/2016 à 18:40, Peter Maydell a écrit :
> On 30 January 2016 at 22:26, Laurent Vivier <laurent@vivier.eu> wrote:
>> rtnetlink is needed to use iproute package (ip addr, ip route)
>> and dhcp client.
>>
>> Examples:
>>
>> Without this patch:
>>     # ip link
>>     Cannot open netlink socket: Address family not supported by protocol
>>     # ip addr
>>     Cannot open netlink socket: Address family not supported by protocol
>>     # ip route
>>     Cannot open netlink socket: Address family not supported by protocol
>>     # dhclient eth0
>>     Cannot open netlink socket: Address family not supported by protocol
>>     Cannot open netlink socket: Address family not supported by protocol
>>
>> With this patch:
>>     # ip link
>>     1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT
>>         link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
>>     51: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP mode DEFAULT qlen 1000
>>         link/ether 00:16:3e:89:6b:d7 brd ff:ff:ff:ff:ff:ff
>>     # ip addr show eth0
>>     51: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP qlen 1000
>>         link/ether 00:16:3e:89:6b:d7 brd ff:ff:ff:ff:ff:ff
>>         inet 192.168.122.197/24 brd 192.168.122.255 scope global eth0
>>            valid_lft forever preferred_lft forever
>>         inet6 fe80::216:3eff:fe89:6bd7/64 scope link
>>            valid_lft forever preferred_lft forever
>>     # ip route
>>     default via 192.168.122.1 dev eth0
>>     192.168.122.0/24 dev eth0  proto kernel  scope link  src 192.168.122.197
>>     # ip addr flush eth0
>>     # ip addr add 192.168.122.10 dev eth0
>>     # ip addr show eth0
>>     51: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP qlen 1000
>>         link/ether 00:16:3e:89:6b:d7 brd ff:ff:ff:ff:ff:ff
>>         inet 192.168.122.10/32 scope global eth0
>>            valid_lft forever preferred_lft forever
>>     # ip route add 192.168.122.0/24 via 192.168.122.10
>>     # ip route
>>         192.168.122.0/24 via 192.168.122.10 dev eth0
>>
>> Signed-off-by: Laurent Vivier <laurent@vivier.eu>
>> ---
>>  linux-user/syscall.c | 477 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 471 insertions(+), 6 deletions(-)
> 
> Apologies for not getting to this patch earlier. Mostly it looks
> OK but I have a few questions/suggestions below.
> 
>> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
>> index dac5518..a1ed2f5 100644
>> --- a/linux-user/syscall.c
>> +++ b/linux-user/syscall.c
>> @@ -100,6 +100,8 @@ int __clone2(int (*fn)(void *), void *child_stack_base,
>>  #include <linux/filter.h>
>>  #include <linux/blkpg.h>
>>  #include "linux_loop.h"
>> +#include <linux/netlink.h>
>> +#include <linux/rtnetlink.h>
>>  #include "uname.h"
>>
>>  #include "qemu.h"
>> @@ -295,6 +297,14 @@ static TargetFdTrans **target_fd_trans;
>>
>>  static unsigned int target_fd_max;
>>
>> +static TargetFdDataFunc fd_trans_target_to_host_data(int fd)
>> +{
>> +    if (fd >= 0 && fd < target_fd_max && target_fd_trans[fd]) {
>> +        return target_fd_trans[fd]->target_to_host_data;
>> +    }
>> +    return NULL;
>> +}
>> +
>>  static TargetFdDataFunc fd_trans_host_to_target_data(int fd)
>>  {
>>      if (fd >= 0 && fd < target_fd_max && target_fd_trans[fd]) {
>> @@ -1222,6 +1232,11 @@ static inline abi_long host_to_target_sockaddr(abi_ulong target_addr,
>>          return -TARGET_EFAULT;
>>      memcpy(target_saddr, addr, len);
>>      target_saddr->sa_family = tswap16(addr->sa_family);
>> +    if (addr->sa_family == AF_NETLINK) {
>> +        struct sockaddr_nl *target_nl = (struct sockaddr_nl *)target_saddr;
>> +        target_nl->nl_pid = tswap32(target_nl->nl_pid);
>> +        target_nl->nl_groups = tswap32(target_nl->nl_groups);
>> +    }
>>      unlock_user(target_saddr, target_addr, len);
>>
>>      return 0;
>> @@ -1453,6 +1468,416 @@ static inline abi_long host_to_target_cmsg(struct target_msghdr *target_msgh,
>>      return 0;
>>  }
>>
>> +static void tswap_nlmsghdr(struct nlmsghdr *nlh)
>> +{
>> +    nlh->nlmsg_len = tswap32(nlh->nlmsg_len);
>> +    nlh->nlmsg_type = tswap16(nlh->nlmsg_type);
>> +    nlh->nlmsg_flags = tswap16(nlh->nlmsg_flags);
>> +    nlh->nlmsg_seq = tswap32(nlh->nlmsg_seq);
>> +    nlh->nlmsg_pid = tswap32(nlh->nlmsg_pid);
>> +}
>> +
>> +static abi_long host_to_target_for_each_nlmsg(struct nlmsghdr *nlh,
>> +                                              size_t len,
>> +                                              abi_long (*host_to_target_nlmsg)
>> +                                                       (struct nlmsghdr *))
>> +{
>> +    uint32_t nlmsg_len;
>> +    abi_long ret;
>> +
>> +    while (len > (int)sizeof(struct nlmsghdr)) {
> 
> What is this cast to int for ?
>

I agree it seems useless, I have copied some parts from the kernel :

/usr/include/linux/netlink.h

#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \
...

so the "(int)" comes from this.

>> +
>> +        nlmsg_len = nlh->nlmsg_len;
>> +        if (nlmsg_len < sizeof(struct nlmsghdr) ||
>> +            nlmsg_len > len) {
>> +            break;
>> +        }
>> +
>> +        if (nlh->nlmsg_type == NLMSG_DONE) {
>> +            tswap_nlmsghdr(nlh);
>> +            break;
>> +        }
> 
> Why not put this one inside the switch too? (you'd just need to
> change the 'break' to 'return 0'.)

You're right.
The idea was to break the while without exiting the function.

> 
>> +        switch (nlh->nlmsg_type) {
>> +        case NLMSG_NOOP:
>> +            break;
>> +        case NLMSG_ERROR: {
> 
> Odd brace placement.

OK

>> +                struct nlmsgerr *e = NLMSG_DATA(nlh);
>> +                e->error = tswap32(e->error);
>> +                tswap_nlmsghdr(&e->msg);
>> +                tswap_nlmsghdr(nlh);
>> +            }
>> +            return 0;
>> +        default:
>> +            ret = host_to_target_nlmsg(nlh);
>> +            if (ret < 0) {
>> +                tswap_nlmsghdr(nlh);
>> +                return ret;
>> +            }
>> +            break;
>> +        }
>> +        tswap_nlmsghdr(nlh);
>> +        len -= NLMSG_ALIGN(nlmsg_len);
>> +        nlh = (struct nlmsghdr *)(((char*)nlh) + NLMSG_ALIGN(nlmsg_len));
>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long target_to_host_for_each_nlmsg(struct nlmsghdr *nlh,
>> +                                              size_t len,
>> +                                              abi_long (*target_to_host_nlmsg)
>> +                                                       (struct nlmsghdr *))
>> +{
>> +    int ret;
>> +
>> +    while (len > (int)sizeof(struct nlmsghdr)) {
>> +        if (tswap32(nlh->nlmsg_len) < sizeof(struct nlmsghdr) ||
>> +            tswap32(nlh->nlmsg_len) > len) {
>> +            break;
>> +        }
>> +        tswap_nlmsghdr(nlh);
>> +        if (nlh->nlmsg_type == NLMSG_DONE) {
>> +            break;
>> +        }
>> +        switch (nlh->nlmsg_type) {
>> +        case NLMSG_NOOP:
>> +            break;
>> +        case NLMSG_ERROR: {
>> +                struct nlmsgerr *e = NLMSG_DATA(nlh);
>> +                e->error = tswap32(e->error);
>> +                tswap_nlmsghdr(&e->msg);
>> +            }
>> +        default:
>> +            ret = target_to_host_nlmsg(nlh);
>> +            if (ret < 0) {
>> +                return ret;
>> +            }
>> +        }
>> +        len -= NLMSG_ALIGN(nlh->nlmsg_len);
>> +        nlh = (struct nlmsghdr *)(((char *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
> 
> You could use 'nlh = NLMSG_NEXT(nlh, len);' here rather than
> manually adjusting len and nlh (though maybe you prefer the
> parallel with the host_to_target function, and there we can't
> use NLMSG_NEXT).

Yes, this is the reason.

>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long host_to_target_for_each_rtattr(struct rtattr *rtattr,
>> +                                               size_t len,
>> +                                               abi_long (*host_to_target_rtattr)
>> +                                                        (struct rtattr *))
>> +{
>> +    unsigned short rta_len;
>> +    abi_long ret;
>> +
>> +    while (len > (int)sizeof(struct rtattr)) {
>> +        rta_len = rtattr->rta_len;
>> +        if (rta_len < sizeof(struct rtattr) ||
>> +            rta_len > len) {
>> +            rtattr->rta_len = tswap16(rtattr->rta_len);
>> +            rtattr->rta_type = tswap16(rtattr->rta_type);
> 
> In the code above for iterating nlmsgs, we don't try to
> touch the fields if the length claims to be out of bounds;
> here we do. Why the difference ?

This series has the "RFC" tag because there are some issues with them,
and in some cases I've tried to add workaround when the guest client
crashes. And I think it was the case for the RTATTR scan and not for the
NLMSG case. I will remove this here and check what happens.

>> +            break;
>> +        }
>> +        ret = host_to_target_rtattr(rtattr);
>> +        rtattr->rta_len = tswap16(rtattr->rta_len);
>> +        rtattr->rta_type = tswap16(rtattr->rta_type);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +        len -= RTA_ALIGN(rta_len);
>> +        rtattr = (struct rtattr *)(((char *)rtattr) + RTA_ALIGN(rta_len));
>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long host_to_target_data_link_rtattr(struct rtattr *rtattr)
>> +{
>> +    uint32_t *u32;
>> +
>> +    switch (rtattr->rta_type) {
>> +    case IFLA_ADDRESS:
>> +    case IFLA_BROADCAST:
>> +    case IFLA_IFNAME:
> 
> Comments here about the types we are converting (or not) would be nice.

OK

> 
>> +        break;
>> +    case IFLA_MTU:
>> +    case IFLA_LINK:
>> +    case IFLA_WEIGHT:
>> +    case IFLA_TXQLEN:
>> +    case IFLA_CARRIER_CHANGES:
>> +    case IFLA_NUM_RX_QUEUES:
>> +    case IFLA_NUM_TX_QUEUES:
>> +    case IFLA_PROMISCUITY:
>> +    case IFLA_EXT_MASK:
>> +    case IFLA_LINK_NETNSID:
>> +    case IFLA_GROUP:
>> +    case IFLA_MASTER:
>> +    case IFLA_NUM_VF:
>> +    case IFLA_STATS:
>> +        u32 = RTA_DATA(rtattr);
>> +        *u32 = tswap32(*u32);
>> +        break;
>> +    case IFLA_QDISC:
>> +    case IFLA_COST:
>> +    case IFLA_MAP:
>> +    case IFLA_OPERSTATE:
>> +    case IFLA_LINKMODE:
>> +    case IFLA_CARRIER:
>> +    case IFLA_STATS64:
>> +    case IFLA_AF_SPEC:
>> +    case IFLA_LINKINFO:
> 
> What order are these cases in (ie why is this set of "do nothing"
> different from the ones at the top of the switch) ?

I've tried to keep the order of then enum in
/usr/include/linux/if_link.h. It is easier to debug because the assigned
numbers are not explicit.

> 
>> +        break;
>> +    default:
> 
> Should we log something about an unsupported IFLA type here?

Yes, I use that to debug too. But it is "fprintf()".
Is "gemu_log()" a good choice?

> 
>> +        break;
> 
> 
> Looking at
> http://lxr.free-electrons.com/source/net/core/rtnetlink.c
> I'm not sure all of these are right. I think IFLA_MAP means we have a
> pointer to a struct rtnl_link_ifmap, for instance, which would need
> some of its fields swapping. IFLA_STATS is an rtnl_link_stats
> struct, and so on.

Yes, you're right. I'll check that.

>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long host_to_target_data_addr_rtattr(struct rtattr *rtattr)
>> +{
>> +    uint32_t *u32;
>> +    struct ifa_cacheinfo *ci;
>> +
>> +    switch (rtattr->rta_type) {
>> +    case IFA_FLAGS:
>> +        u32 = RTA_DATA(rtattr);
>> +        *u32 = tswap32(*u32);
>> +        break;
>> +    case IFA_LOCAL:
>> +    case IFA_ADDRESS:
>> +    case IFA_BROADCAST:
>> +    case IFA_LABEL:
>> +        break;
>> +    case IFA_CACHEINFO:
>> +        ci = RTA_DATA(rtattr);
>> +        ci->ifa_prefered = tswap32(ci->ifa_prefered);
>> +        ci->ifa_valid = tswap32(ci->ifa_valid);
>> +        ci->cstamp = tswap32(ci->cstamp);
>> +        ci->tstamp = tswap32(ci->tstamp);
>> +        break;
>> +    default:
>> +        break;
> 
> Log about unsupported values?

OK

> 
>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long host_to_target_data_route_rtattr(struct rtattr *rtattr)
>> +{
>> +    uint32_t *u32;
>> +    switch (rtattr->rta_type) {
>> +    case RTA_GATEWAY:
>> +    case RTA_DST:
>> +    case RTA_PREFSRC:
>> +        break;
>> +    case RTA_PRIORITY:
>> +    case RTA_TABLE:
>> +    case RTA_OIF:
>> +        u32 = RTA_DATA(rtattr);
>> +        *u32 = tswap32(*u32);
>> +        break;
> 
> This doesn't seem to be the complete list of RTA_ values; what
> decides which ones are listed here?

Only testing. There are too many values (and the structure of the value
can depend on the interface type). I've in fact some logs in the default
case, to log undecoded entries and then I boot a container
(ppc64/ppc/m68k/sh4/...) and I try to implement missing entries.

> 
>> +    default:
>> +        break;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long host_to_target_link_rtattr(struct rtattr *rtattr,
>> +                                         uint32_t rtattr_len)
>> +{
>> +    return host_to_target_for_each_rtattr(rtattr, rtattr_len,
>> +                                          host_to_target_data_link_rtattr);
>> +}
>> +
>> +static abi_long host_to_target_addr_rtattr(struct rtattr *rtattr,
>> +                                         uint32_t rtattr_len)
>> +{
>> +    return host_to_target_for_each_rtattr(rtattr, rtattr_len,
>> +                                          host_to_target_data_addr_rtattr);
>> +}
>> +
>> +static abi_long host_to_target_route_rtattr(struct rtattr *rtattr,
>> +                                         uint32_t rtattr_len)
>> +{
>> +    return host_to_target_for_each_rtattr(rtattr, rtattr_len,
>> +                                          host_to_target_data_route_rtattr);
>> +}
>> +
>> +static abi_long host_to_target_data_route(struct nlmsghdr *nlh)
>> +{
>> +    uint32_t nlmsg_len;
>> +    struct ifinfomsg *ifi;
>> +    struct ifaddrmsg *ifa;
>> +    struct rtmsg *rtm;
>> +
>> +    nlmsg_len = nlh->nlmsg_len;
>> +    switch (nlh->nlmsg_type) {
>> +    case RTM_NEWLINK:
>> +    case RTM_DELLINK:
>> +    case RTM_GETLINK:
>> +        ifi = NLMSG_DATA(nlh);
>> +        ifi->ifi_type = tswap16(ifi->ifi_type);
>> +        ifi->ifi_index = tswap32(ifi->ifi_index);
>> +        ifi->ifi_flags = tswap32(ifi->ifi_flags);
>> +        ifi->ifi_change = tswap32(ifi->ifi_change);
>> +        host_to_target_link_rtattr(IFLA_RTA(ifi),
>> +                                   nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)));
>> +        break;
>> +    case RTM_NEWADDR:
>> +    case RTM_DELADDR:
>> +    case RTM_GETADDR:
>> +        ifa = NLMSG_DATA(nlh);
>> +        ifa->ifa_index = tswap32(ifa->ifa_index);
>> +        host_to_target_addr_rtattr(IFA_RTA(ifa),
>> +                                   nlmsg_len - NLMSG_LENGTH(sizeof(*ifa)));
>> +        break;
>> +    case RTM_NEWROUTE:
>> +    case RTM_DELROUTE:
>> +    case RTM_GETROUTE:
>> +        rtm = NLMSG_DATA(nlh);
>> +        rtm->rtm_flags = tswap32(rtm->rtm_flags);
>> +        host_to_target_route_rtattr(RTM_RTA(rtm),
>> +                                    nlmsg_len - NLMSG_LENGTH(sizeof(*rtm)));
>> +        break;
>> +    default:
>> +        return -TARGET_EINVAL;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static inline abi_long host_to_target_nlmsg_route(struct nlmsghdr *nlh,
>> +                                                  size_t len)
>> +{
>> +    return host_to_target_for_each_nlmsg(nlh, len, host_to_target_data_route);
>> +}
>> +
>> +static abi_long target_to_host_for_each_rtattr(struct rtattr *rtattr,
>> +                                               size_t len,
>> +                                               abi_long (*target_to_host_rtattr)
>> +                                                        (struct rtattr *))
>> +{
>> +    abi_long ret;
>> +
>> +    while (len >= (int)sizeof(struct rtattr)) {
>> +        rtattr->rta_len = tswap16(rtattr->rta_len);
>> +        rtattr->rta_type = tswap16(rtattr->rta_type);
>> +        if (rtattr->rta_len < sizeof(struct rtattr) ||
>> +            rtattr->rta_len > len) {
>> +            break;
>> +        }
> 
> Length check after swap of len/type again.

I don't understand: why "again"?

>> +        ret = target_to_host_rtattr(rtattr);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +        len -= RTA_ALIGN(rtattr->rta_len);
>> +        rtattr = (struct rtattr *)(((char *)rtattr) +
>> +                 RTA_ALIGN(rtattr->rta_len));
>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long target_to_host_data_link_rtattr(struct rtattr *rtattr)
>> +{
>> +    switch (rtattr->rta_type) {
>> +    default:
>> +        break;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long target_to_host_data_addr_rtattr(struct rtattr *rtattr)
>> +{
>> +    switch (rtattr->rta_type) {
>> +    case IFA_LOCAL:
>> +    case IFA_ADDRESS:
>> +        break;
>> +    default:
>> +        break;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long target_to_host_data_route_rtattr(struct rtattr *rtattr)
>> +{
>> +    uint32_t *u32;
>> +    switch (rtattr->rta_type) {
>> +    case RTA_DST:
>> +    case RTA_SRC:
>> +    case RTA_GATEWAY:
>> +        /* nothing to do */
>> +        break;
>> +    case RTA_OIF:
>> +        u32 = RTA_DATA(rtattr);
>> +        *u32 = tswap32(*u32);
>> +        break;
>> +    default:
>> +        break;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static void target_to_host_link_rtattr(struct rtattr *rtattr,
>> +                                       uint32_t rtattr_len)
>> +{
>> +    target_to_host_for_each_rtattr(rtattr, rtattr_len,
>> +                                   target_to_host_data_link_rtattr);
>> +}
>> +
>> +static void target_to_host_addr_rtattr(struct rtattr *rtattr,
>> +                                     uint32_t rtattr_len)
>> +{
>> +    target_to_host_for_each_rtattr(rtattr, rtattr_len,
>> +                                   target_to_host_data_addr_rtattr);
>> +}
>> +
>> +static void target_to_host_route_rtattr(struct rtattr *rtattr,
>> +                                     uint32_t rtattr_len)
>> +{
>> +    target_to_host_for_each_rtattr(rtattr, rtattr_len,
>> +                                   target_to_host_data_route_rtattr);
>> +}
>> +
>> +static abi_long target_to_host_data_route(struct nlmsghdr *nlh)
>> +{
>> +    struct ifinfomsg *ifi;
>> +    struct ifaddrmsg *ifa;
>> +    struct rtmsg *rtm;
>> +
>> +    switch (nlh->nlmsg_type) {
>> +    case RTM_GETLINK:
>> +        break;
>> +    case RTM_NEWLINK:
>> +    case RTM_DELLINK:
>> +        ifi = NLMSG_DATA(nlh);
>> +        ifi->ifi_type = tswap16(ifi->ifi_type);
>> +        ifi->ifi_index = tswap32(ifi->ifi_index);
>> +        ifi->ifi_flags = tswap32(ifi->ifi_flags);
>> +        ifi->ifi_change = tswap32(ifi->ifi_change);
>> +        target_to_host_link_rtattr(IFLA_RTA(ifi), nlh->nlmsg_len -
>> +                                   NLMSG_LENGTH(sizeof(*ifi)));
>> +        break;
>> +    case RTM_GETADDR:
>> +    case RTM_NEWADDR:
>> +    case RTM_DELADDR:
>> +        ifa = NLMSG_DATA(nlh);
>> +        ifa->ifa_index = tswap32(ifa->ifa_index);
>> +        target_to_host_addr_rtattr(IFA_RTA(ifa), nlh->nlmsg_len -
>> +                                   NLMSG_LENGTH(sizeof(*ifa)));
>> +        break;
>> +    case RTM_GETROUTE:
>> +        break;
>> +    case RTM_NEWROUTE:
>> +    case RTM_DELROUTE:
>> +        rtm = NLMSG_DATA(nlh);
>> +        rtm->rtm_flags = tswap32(rtm->rtm_flags);
>> +        target_to_host_route_rtattr(RTM_RTA(rtm), nlh->nlmsg_len -
>> +                                    NLMSG_LENGTH(sizeof(*rtm)));
>> +        break;
>> +    default:
>> +        return -TARGET_EOPNOTSUPP;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static abi_long target_to_host_nlmsg_route(struct nlmsghdr *nlh, size_t len)
>> +{
>> +    return target_to_host_for_each_nlmsg(nlh, len, target_to_host_data_route);
>> +}
>> +
>>  /* do_setsockopt() Must return target values and target errnos. */
>>  static abi_long do_setsockopt(int sockfd, int level, int optname,
>>                                abi_ulong optval_addr, socklen_t optlen)
>> @@ -2103,6 +2528,21 @@ static TargetFdTrans target_packet_trans = {
>>      .target_to_host_addr = packet_target_to_host_sockaddr,
>>  };
>>
>> +static abi_long netlink_route_target_to_host(void *buf, size_t len)
>> +{
>> +    return target_to_host_nlmsg_route(buf, len);
>> +}
>> +
>> +static abi_long netlink_route_host_to_target(void *buf, size_t len)
>> +{
>> +    return host_to_target_nlmsg_route(buf, len);
>> +}
>> +
>> +static TargetFdTrans target_netlink_route_trans = {
>> +    .target_to_host_data = netlink_route_target_to_host,
>> +    .host_to_target_data = netlink_route_host_to_target,
>> +};
>> +
>>  /* do_socket() Must return target values and target errnos. */
>>  static abi_long do_socket(int domain, int type, int protocol)
>>  {
>> @@ -2114,9 +2554,6 @@ static abi_long do_socket(int domain, int type, int protocol)
>>          return ret;
>>      }
>>
>> -    if (domain == PF_NETLINK)
>> -        return -TARGET_EAFNOSUPPORT;
>> -
>>      if (domain == AF_PACKET ||
>>          (domain == AF_INET && type == SOCK_PACKET)) {
>>          protocol = tswap16(protocol);
>> @@ -2130,6 +2567,16 @@ static abi_long do_socket(int domain, int type, int protocol)
>>               * if socket type is SOCK_PACKET, bind by name
>>               */
>>              fd_trans_register(ret, &target_packet_trans);
>> +        } else if (domain == PF_NETLINK) {
>> +            switch (protocol) {
>> +            case NETLINK_ROUTE:
>> +                fd_trans_register(ret, &target_netlink_route_trans);
>> +                break;
>> +            default:
>> +                close(ret);
>> +                ret = -EPFNOSUPPORT;
> 
> Can we handle the "PF_NETLINK but unsupported protocol" check
> before we try to open the host socket, rather than having to
> back it out here?

Yes

>> +                break;
>> +            }
>>          }
>>      }
>>      return ret;
>> @@ -2214,14 +2661,25 @@ static abi_long do_sendrecvmsg_locked(int fd, struct target_msghdr *msgp,
>>      msg.msg_iov = vec;
>>
>>      if (send) {
>> -        ret = target_to_host_cmsg(&msg, msgp);
>> -        if (ret == 0)
>> +        if (fd_trans_target_to_host_data(fd)) {
>> +            ret = fd_trans_target_to_host_data(fd)(msg.msg_iov->iov_base,
>> +                                                   msg.msg_iov->iov_len);
>> +        } else {
>> +            ret = target_to_host_cmsg(&msg, msgp);
>> +        }
>> +        if (ret == 0) {
>>              ret = get_errno(sendmsg(fd, &msg, flags));
>> +        }
>>      } else {
>>          ret = get_errno(recvmsg(fd, &msg, flags));
>>          if (!is_error(ret)) {
>>              len = ret;
>> -            ret = host_to_target_cmsg(msgp, &msg);
>> +            if (fd_trans_host_to_target_data(fd)) {
>> +                ret = fd_trans_host_to_target_data(fd)(msg.msg_iov->iov_base,
>> +                                                       msg.msg_iov->iov_len);
>> +            } else {
>> +                ret = host_to_target_cmsg(msgp, &msg);
>> +            }
>>              if (!is_error(ret)) {
>>                  msgp->msg_namelen = tswap32(msg.msg_namelen);
>>                  if (msg.msg_name != NULL) {
>> @@ -2448,6 +2906,13 @@ static abi_long do_sendto(int fd, abi_ulong msg, size_t len, int flags,
>>      host_msg = lock_user(VERIFY_READ, msg, len, 1);
>>      if (!host_msg)
>>          return -TARGET_EFAULT;
>> +    if (fd_trans_target_to_host_data(fd)) {
>> +        ret = fd_trans_target_to_host_data(fd)(host_msg, len);
>> +        if (ret < 0) {
>> +            unlock_user(host_msg, msg, 0);
>> +            return ret;
>> +        }
>> +    }
>>      if (target_addr) {
>>          addr = alloca(addrlen+1);
>>          ret = target_to_host_sockaddr(fd, addr, target_addr, addrlen);
>> --
>> 2.5.0
> 
> This felt like a pretty long patch to wade through on review; if
> there's an easy way to split it up that would be nice for v2,
> but I don't insist on it if there doesn't seem to be a nice
> splitting point.

I like to have small patches, but this one is hard to split.
I will try, but I'm not sure I can.

> 
> thanks
> -- PMM
> 

Thanks for the review,
Laurent
Peter Maydell May 14, 2016, 10:13 a.m. UTC | #3
On 14 May 2016 at 10:37, Laurent Vivier <laurent@vivier.eu> wrote:
>
>
> Le 13/05/2016 à 18:40, Peter Maydell a écrit :
>> On 30 January 2016 at 22:26, Laurent Vivier <laurent@vivier.eu> wrote:
>>> +    while (len > (int)sizeof(struct nlmsghdr)) {
>>
>> What is this cast to int for ?
>>
>
> I agree it seems useless, I have copied some parts from the kernel :
>
> /usr/include/linux/netlink.h
>
> #define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \
> ...
>
> so the "(int)" comes from this.

Hmm. It may well be right, but I'd like to understand what
it's doing, really...

>>> +        break;
>>> +    default:
>>
>> Should we log something about an unsupported IFLA type here?
>
> Yes, I use that to debug too. But it is "fprintf()".
> Is "gemu_log()" a good choice?

It seems to be what we use elsewhere for similar logging.

>>> +    while (len >= (int)sizeof(struct rtattr)) {
>>> +        rtattr->rta_len = tswap16(rtattr->rta_len);
>>> +        rtattr->rta_type = tswap16(rtattr->rta_type);
>>> +        if (rtattr->rta_len < sizeof(struct rtattr) ||
>>> +            rtattr->rta_len > len) {
>>> +            break;
>>> +        }
>>
>> Length check after swap of len/type again.
>
> I don't understand: why "again"?

I tend to use "again" to mean "this is a similar kind of
code review comment to an issue I talked about in more
detail when it came up earlier in the patch".

thanks
-- PMM
diff mbox

Patch

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index dac5518..a1ed2f5 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -100,6 +100,8 @@  int __clone2(int (*fn)(void *), void *child_stack_base,
 #include <linux/filter.h>
 #include <linux/blkpg.h>
 #include "linux_loop.h"
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
 #include "uname.h"
 
 #include "qemu.h"
@@ -295,6 +297,14 @@  static TargetFdTrans **target_fd_trans;
 
 static unsigned int target_fd_max;
 
+static TargetFdDataFunc fd_trans_target_to_host_data(int fd)
+{
+    if (fd >= 0 && fd < target_fd_max && target_fd_trans[fd]) {
+        return target_fd_trans[fd]->target_to_host_data;
+    }
+    return NULL;
+}
+
 static TargetFdDataFunc fd_trans_host_to_target_data(int fd)
 {
     if (fd >= 0 && fd < target_fd_max && target_fd_trans[fd]) {
@@ -1222,6 +1232,11 @@  static inline abi_long host_to_target_sockaddr(abi_ulong target_addr,
         return -TARGET_EFAULT;
     memcpy(target_saddr, addr, len);
     target_saddr->sa_family = tswap16(addr->sa_family);
+    if (addr->sa_family == AF_NETLINK) {
+        struct sockaddr_nl *target_nl = (struct sockaddr_nl *)target_saddr;
+        target_nl->nl_pid = tswap32(target_nl->nl_pid);
+        target_nl->nl_groups = tswap32(target_nl->nl_groups);
+    }
     unlock_user(target_saddr, target_addr, len);
 
     return 0;
@@ -1453,6 +1468,416 @@  static inline abi_long host_to_target_cmsg(struct target_msghdr *target_msgh,
     return 0;
 }
 
+static void tswap_nlmsghdr(struct nlmsghdr *nlh)
+{
+    nlh->nlmsg_len = tswap32(nlh->nlmsg_len);
+    nlh->nlmsg_type = tswap16(nlh->nlmsg_type);
+    nlh->nlmsg_flags = tswap16(nlh->nlmsg_flags);
+    nlh->nlmsg_seq = tswap32(nlh->nlmsg_seq);
+    nlh->nlmsg_pid = tswap32(nlh->nlmsg_pid);
+}
+
+static abi_long host_to_target_for_each_nlmsg(struct nlmsghdr *nlh,
+                                              size_t len,
+                                              abi_long (*host_to_target_nlmsg)
+                                                       (struct nlmsghdr *))
+{
+    uint32_t nlmsg_len;
+    abi_long ret;
+
+    while (len > (int)sizeof(struct nlmsghdr)) {
+
+        nlmsg_len = nlh->nlmsg_len;
+        if (nlmsg_len < sizeof(struct nlmsghdr) ||
+            nlmsg_len > len) {
+            break;
+        }
+
+        if (nlh->nlmsg_type == NLMSG_DONE) {
+            tswap_nlmsghdr(nlh);
+            break;
+        }
+        switch (nlh->nlmsg_type) {
+        case NLMSG_NOOP:
+            break;
+        case NLMSG_ERROR: {
+                struct nlmsgerr *e = NLMSG_DATA(nlh);
+                e->error = tswap32(e->error);
+                tswap_nlmsghdr(&e->msg);
+                tswap_nlmsghdr(nlh);
+            }
+            return 0;
+        default:
+            ret = host_to_target_nlmsg(nlh);
+            if (ret < 0) {
+                tswap_nlmsghdr(nlh);
+                return ret;
+            }
+            break;
+        }
+        tswap_nlmsghdr(nlh);
+        len -= NLMSG_ALIGN(nlmsg_len);
+        nlh = (struct nlmsghdr *)(((char*)nlh) + NLMSG_ALIGN(nlmsg_len));
+    }
+    return 0;
+}
+
+static abi_long target_to_host_for_each_nlmsg(struct nlmsghdr *nlh,
+                                              size_t len,
+                                              abi_long (*target_to_host_nlmsg)
+                                                       (struct nlmsghdr *))
+{
+    int ret;
+
+    while (len > (int)sizeof(struct nlmsghdr)) {
+        if (tswap32(nlh->nlmsg_len) < sizeof(struct nlmsghdr) ||
+            tswap32(nlh->nlmsg_len) > len) {
+            break;
+        }
+        tswap_nlmsghdr(nlh);
+        if (nlh->nlmsg_type == NLMSG_DONE) {
+            break;
+        }
+        switch (nlh->nlmsg_type) {
+        case NLMSG_NOOP:
+            break;
+        case NLMSG_ERROR: {
+                struct nlmsgerr *e = NLMSG_DATA(nlh);
+                e->error = tswap32(e->error);
+                tswap_nlmsghdr(&e->msg);
+            }
+        default:
+            ret = target_to_host_nlmsg(nlh);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        len -= NLMSG_ALIGN(nlh->nlmsg_len);
+        nlh = (struct nlmsghdr *)(((char *)nlh) + NLMSG_ALIGN(nlh->nlmsg_len));
+    }
+    return 0;
+}
+
+static abi_long host_to_target_for_each_rtattr(struct rtattr *rtattr,
+                                               size_t len,
+                                               abi_long (*host_to_target_rtattr)
+                                                        (struct rtattr *))
+{
+    unsigned short rta_len;
+    abi_long ret;
+
+    while (len > (int)sizeof(struct rtattr)) {
+        rta_len = rtattr->rta_len;
+        if (rta_len < sizeof(struct rtattr) ||
+            rta_len > len) {
+            rtattr->rta_len = tswap16(rtattr->rta_len);
+            rtattr->rta_type = tswap16(rtattr->rta_type);
+            break;
+        }
+        ret = host_to_target_rtattr(rtattr);
+        rtattr->rta_len = tswap16(rtattr->rta_len);
+        rtattr->rta_type = tswap16(rtattr->rta_type);
+        if (ret < 0) {
+            return ret;
+        }
+        len -= RTA_ALIGN(rta_len);
+        rtattr = (struct rtattr *)(((char *)rtattr) + RTA_ALIGN(rta_len));
+    }
+    return 0;
+}
+
+static abi_long host_to_target_data_link_rtattr(struct rtattr *rtattr)
+{
+    uint32_t *u32;
+
+    switch (rtattr->rta_type) {
+    case IFLA_ADDRESS:
+    case IFLA_BROADCAST:
+    case IFLA_IFNAME:
+        break;
+    case IFLA_MTU:
+    case IFLA_LINK:
+    case IFLA_WEIGHT:
+    case IFLA_TXQLEN:
+    case IFLA_CARRIER_CHANGES:
+    case IFLA_NUM_RX_QUEUES:
+    case IFLA_NUM_TX_QUEUES:
+    case IFLA_PROMISCUITY:
+    case IFLA_EXT_MASK:
+    case IFLA_LINK_NETNSID:
+    case IFLA_GROUP:
+    case IFLA_MASTER:
+    case IFLA_NUM_VF:
+    case IFLA_STATS:
+        u32 = RTA_DATA(rtattr);
+        *u32 = tswap32(*u32);
+        break;
+    case IFLA_QDISC:
+    case IFLA_COST:
+    case IFLA_MAP:
+    case IFLA_OPERSTATE:
+    case IFLA_LINKMODE:
+    case IFLA_CARRIER:
+    case IFLA_STATS64:
+    case IFLA_AF_SPEC:
+    case IFLA_LINKINFO:
+        break;
+    default:
+        break;
+    }
+    return 0;
+}
+
+static abi_long host_to_target_data_addr_rtattr(struct rtattr *rtattr)
+{
+    uint32_t *u32;
+    struct ifa_cacheinfo *ci;
+
+    switch (rtattr->rta_type) {
+    case IFA_FLAGS:
+        u32 = RTA_DATA(rtattr);
+        *u32 = tswap32(*u32);
+        break;
+    case IFA_LOCAL:
+    case IFA_ADDRESS:
+    case IFA_BROADCAST:
+    case IFA_LABEL:
+        break;
+    case IFA_CACHEINFO:
+        ci = RTA_DATA(rtattr);
+        ci->ifa_prefered = tswap32(ci->ifa_prefered);
+        ci->ifa_valid = tswap32(ci->ifa_valid);
+        ci->cstamp = tswap32(ci->cstamp);
+        ci->tstamp = tswap32(ci->tstamp);
+        break;
+    default:
+        break;
+    }
+    return 0;
+}
+
+static abi_long host_to_target_data_route_rtattr(struct rtattr *rtattr)
+{
+    uint32_t *u32;
+    switch (rtattr->rta_type) {
+    case RTA_GATEWAY:
+    case RTA_DST:
+    case RTA_PREFSRC:
+        break;
+    case RTA_PRIORITY:
+    case RTA_TABLE:
+    case RTA_OIF:
+        u32 = RTA_DATA(rtattr);
+        *u32 = tswap32(*u32);
+        break;
+    default:
+        break;
+    }
+    return 0;
+}
+
+static abi_long host_to_target_link_rtattr(struct rtattr *rtattr,
+                                         uint32_t rtattr_len)
+{
+    return host_to_target_for_each_rtattr(rtattr, rtattr_len,
+                                          host_to_target_data_link_rtattr);
+}
+
+static abi_long host_to_target_addr_rtattr(struct rtattr *rtattr,
+                                         uint32_t rtattr_len)
+{
+    return host_to_target_for_each_rtattr(rtattr, rtattr_len,
+                                          host_to_target_data_addr_rtattr);
+}
+
+static abi_long host_to_target_route_rtattr(struct rtattr *rtattr,
+                                         uint32_t rtattr_len)
+{
+    return host_to_target_for_each_rtattr(rtattr, rtattr_len,
+                                          host_to_target_data_route_rtattr);
+}
+
+static abi_long host_to_target_data_route(struct nlmsghdr *nlh)
+{
+    uint32_t nlmsg_len;
+    struct ifinfomsg *ifi;
+    struct ifaddrmsg *ifa;
+    struct rtmsg *rtm;
+
+    nlmsg_len = nlh->nlmsg_len;
+    switch (nlh->nlmsg_type) {
+    case RTM_NEWLINK:
+    case RTM_DELLINK:
+    case RTM_GETLINK:
+        ifi = NLMSG_DATA(nlh);
+        ifi->ifi_type = tswap16(ifi->ifi_type);
+        ifi->ifi_index = tswap32(ifi->ifi_index);
+        ifi->ifi_flags = tswap32(ifi->ifi_flags);
+        ifi->ifi_change = tswap32(ifi->ifi_change);
+        host_to_target_link_rtattr(IFLA_RTA(ifi),
+                                   nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)));
+        break;
+    case RTM_NEWADDR:
+    case RTM_DELADDR:
+    case RTM_GETADDR:
+        ifa = NLMSG_DATA(nlh);
+        ifa->ifa_index = tswap32(ifa->ifa_index);
+        host_to_target_addr_rtattr(IFA_RTA(ifa),
+                                   nlmsg_len - NLMSG_LENGTH(sizeof(*ifa)));
+        break;
+    case RTM_NEWROUTE:
+    case RTM_DELROUTE:
+    case RTM_GETROUTE:
+        rtm = NLMSG_DATA(nlh);
+        rtm->rtm_flags = tswap32(rtm->rtm_flags);
+        host_to_target_route_rtattr(RTM_RTA(rtm),
+                                    nlmsg_len - NLMSG_LENGTH(sizeof(*rtm)));
+        break;
+    default:
+        return -TARGET_EINVAL;
+    }
+    return 0;
+}
+
+static inline abi_long host_to_target_nlmsg_route(struct nlmsghdr *nlh,
+                                                  size_t len)
+{
+    return host_to_target_for_each_nlmsg(nlh, len, host_to_target_data_route);
+}
+
+static abi_long target_to_host_for_each_rtattr(struct rtattr *rtattr,
+                                               size_t len,
+                                               abi_long (*target_to_host_rtattr)
+                                                        (struct rtattr *))
+{
+    abi_long ret;
+
+    while (len >= (int)sizeof(struct rtattr)) {
+        rtattr->rta_len = tswap16(rtattr->rta_len);
+        rtattr->rta_type = tswap16(rtattr->rta_type);
+        if (rtattr->rta_len < sizeof(struct rtattr) ||
+            rtattr->rta_len > len) {
+            break;
+        }
+        ret = target_to_host_rtattr(rtattr);
+        if (ret < 0) {
+            return ret;
+        }
+        len -= RTA_ALIGN(rtattr->rta_len);
+        rtattr = (struct rtattr *)(((char *)rtattr) +
+                 RTA_ALIGN(rtattr->rta_len));
+    }
+    return 0;
+}
+
+static abi_long target_to_host_data_link_rtattr(struct rtattr *rtattr)
+{
+    switch (rtattr->rta_type) {
+    default:
+        break;
+    }
+    return 0;
+}
+
+static abi_long target_to_host_data_addr_rtattr(struct rtattr *rtattr)
+{
+    switch (rtattr->rta_type) {
+    case IFA_LOCAL:
+    case IFA_ADDRESS:
+        break;
+    default:
+        break;
+    }
+    return 0;
+}
+
+static abi_long target_to_host_data_route_rtattr(struct rtattr *rtattr)
+{
+    uint32_t *u32;
+    switch (rtattr->rta_type) {
+    case RTA_DST:
+    case RTA_SRC:
+    case RTA_GATEWAY:
+        /* nothing to do */
+        break;
+    case RTA_OIF:
+        u32 = RTA_DATA(rtattr);
+        *u32 = tswap32(*u32);
+        break;
+    default:
+        break;
+    }
+    return 0;
+}
+
+static void target_to_host_link_rtattr(struct rtattr *rtattr,
+                                       uint32_t rtattr_len)
+{
+    target_to_host_for_each_rtattr(rtattr, rtattr_len,
+                                   target_to_host_data_link_rtattr);
+}
+
+static void target_to_host_addr_rtattr(struct rtattr *rtattr,
+                                     uint32_t rtattr_len)
+{
+    target_to_host_for_each_rtattr(rtattr, rtattr_len,
+                                   target_to_host_data_addr_rtattr);
+}
+
+static void target_to_host_route_rtattr(struct rtattr *rtattr,
+                                     uint32_t rtattr_len)
+{
+    target_to_host_for_each_rtattr(rtattr, rtattr_len,
+                                   target_to_host_data_route_rtattr);
+}
+
+static abi_long target_to_host_data_route(struct nlmsghdr *nlh)
+{
+    struct ifinfomsg *ifi;
+    struct ifaddrmsg *ifa;
+    struct rtmsg *rtm;
+
+    switch (nlh->nlmsg_type) {
+    case RTM_GETLINK:
+        break;
+    case RTM_NEWLINK:
+    case RTM_DELLINK:
+        ifi = NLMSG_DATA(nlh);
+        ifi->ifi_type = tswap16(ifi->ifi_type);
+        ifi->ifi_index = tswap32(ifi->ifi_index);
+        ifi->ifi_flags = tswap32(ifi->ifi_flags);
+        ifi->ifi_change = tswap32(ifi->ifi_change);
+        target_to_host_link_rtattr(IFLA_RTA(ifi), nlh->nlmsg_len -
+                                   NLMSG_LENGTH(sizeof(*ifi)));
+        break;
+    case RTM_GETADDR:
+    case RTM_NEWADDR:
+    case RTM_DELADDR:
+        ifa = NLMSG_DATA(nlh);
+        ifa->ifa_index = tswap32(ifa->ifa_index);
+        target_to_host_addr_rtattr(IFA_RTA(ifa), nlh->nlmsg_len -
+                                   NLMSG_LENGTH(sizeof(*ifa)));
+        break;
+    case RTM_GETROUTE:
+        break;
+    case RTM_NEWROUTE:
+    case RTM_DELROUTE:
+        rtm = NLMSG_DATA(nlh);
+        rtm->rtm_flags = tswap32(rtm->rtm_flags);
+        target_to_host_route_rtattr(RTM_RTA(rtm), nlh->nlmsg_len -
+                                    NLMSG_LENGTH(sizeof(*rtm)));
+        break;
+    default:
+        return -TARGET_EOPNOTSUPP;
+    }
+    return 0;
+}
+
+static abi_long target_to_host_nlmsg_route(struct nlmsghdr *nlh, size_t len)
+{
+    return target_to_host_for_each_nlmsg(nlh, len, target_to_host_data_route);
+}
+
 /* do_setsockopt() Must return target values and target errnos. */
 static abi_long do_setsockopt(int sockfd, int level, int optname,
                               abi_ulong optval_addr, socklen_t optlen)
@@ -2103,6 +2528,21 @@  static TargetFdTrans target_packet_trans = {
     .target_to_host_addr = packet_target_to_host_sockaddr,
 };
 
+static abi_long netlink_route_target_to_host(void *buf, size_t len)
+{
+    return target_to_host_nlmsg_route(buf, len);
+}
+
+static abi_long netlink_route_host_to_target(void *buf, size_t len)
+{
+    return host_to_target_nlmsg_route(buf, len);
+}
+
+static TargetFdTrans target_netlink_route_trans = {
+    .target_to_host_data = netlink_route_target_to_host,
+    .host_to_target_data = netlink_route_host_to_target,
+};
+
 /* do_socket() Must return target values and target errnos. */
 static abi_long do_socket(int domain, int type, int protocol)
 {
@@ -2114,9 +2554,6 @@  static abi_long do_socket(int domain, int type, int protocol)
         return ret;
     }
 
-    if (domain == PF_NETLINK)
-        return -TARGET_EAFNOSUPPORT;
-
     if (domain == AF_PACKET ||
         (domain == AF_INET && type == SOCK_PACKET)) {
         protocol = tswap16(protocol);
@@ -2130,6 +2567,16 @@  static abi_long do_socket(int domain, int type, int protocol)
              * if socket type is SOCK_PACKET, bind by name
              */
             fd_trans_register(ret, &target_packet_trans);
+        } else if (domain == PF_NETLINK) {
+            switch (protocol) {
+            case NETLINK_ROUTE:
+                fd_trans_register(ret, &target_netlink_route_trans);
+                break;
+            default:
+                close(ret);
+                ret = -EPFNOSUPPORT;
+                break;
+            }
         }
     }
     return ret;
@@ -2214,14 +2661,25 @@  static abi_long do_sendrecvmsg_locked(int fd, struct target_msghdr *msgp,
     msg.msg_iov = vec;
 
     if (send) {
-        ret = target_to_host_cmsg(&msg, msgp);
-        if (ret == 0)
+        if (fd_trans_target_to_host_data(fd)) {
+            ret = fd_trans_target_to_host_data(fd)(msg.msg_iov->iov_base,
+                                                   msg.msg_iov->iov_len);
+        } else {
+            ret = target_to_host_cmsg(&msg, msgp);
+        }
+        if (ret == 0) {
             ret = get_errno(sendmsg(fd, &msg, flags));
+        }
     } else {
         ret = get_errno(recvmsg(fd, &msg, flags));
         if (!is_error(ret)) {
             len = ret;
-            ret = host_to_target_cmsg(msgp, &msg);
+            if (fd_trans_host_to_target_data(fd)) {
+                ret = fd_trans_host_to_target_data(fd)(msg.msg_iov->iov_base,
+                                                       msg.msg_iov->iov_len);
+            } else {
+                ret = host_to_target_cmsg(msgp, &msg);
+            }
             if (!is_error(ret)) {
                 msgp->msg_namelen = tswap32(msg.msg_namelen);
                 if (msg.msg_name != NULL) {
@@ -2448,6 +2906,13 @@  static abi_long do_sendto(int fd, abi_ulong msg, size_t len, int flags,
     host_msg = lock_user(VERIFY_READ, msg, len, 1);
     if (!host_msg)
         return -TARGET_EFAULT;
+    if (fd_trans_target_to_host_data(fd)) {
+        ret = fd_trans_target_to_host_data(fd)(host_msg, len);
+        if (ret < 0) {
+            unlock_user(host_msg, msg, 0);
+            return ret;
+        }
+    }
     if (target_addr) {
         addr = alloca(addrlen+1);
         ret = target_to_host_sockaddr(fd, addr, target_addr, addrlen);