author     Michal Marek <mmarek@suse.com>  2016-05-06 16:09:47 +0200
committer  Michal Marek <mmarek@suse.com>  2016-05-06 16:09:47 +0200
commit     0a6fce8dcc63dcb1046ec9aafe5a2dc32012943d
tree       34c85e706205ed4b163b4d8815ae0218d3ffe3ce
parent     07062bca4027b5156040f3c2e4d343bc8eff34ff
parent     6bba4f6d5b05ca73d5f8a3b1a4020c0be7df3b98
Merge branch 'users/jjolly/SLE12-SP2/for-next' into SLE12-SP2 (tag: rpm-4.4.9-36)

Pull SMC (Shared Memory Communications) support from John Jolly
(fate#319593, bsc#978258).

Conflicts:
        config/arm64/default
        config/ppc64le/debug
        config/ppc64le/default
        config/s390x/default
        config/s390x/zfcpdump
        config/x86_64/debug
        config/x86_64/default
-rw-r--r--  config/arm64/default                                   |    1
-rw-r--r--  config/ppc64le/debug                                   |    1
-rw-r--r--  config/ppc64le/default                                 |    1
-rw-r--r--  config/s390x/default                                   |    1
-rw-r--r--  config/x86_64/debug                                    |    1
-rw-r--r--  config/x86_64/default                                  |    1
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-01.patch   |  101
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-02.patch   |  802
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-03.patch   |  636
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-04.patch   |  443
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-05.patch   | 1064
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-06.patch   |  819
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-07.patch   |  497
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-08.patch   |  929
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-09.patch   |  611
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-10.patch   |  477
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-11.patch   |  627
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-12.patch   |  601
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-13.patch   |  398
-rw-r--r--  patches.arch/s390-sles12sp2-00-05-net-smc-r-14.patch   |  903
-rw-r--r--  series.conf                                            |   15
-rw-r--r--  supported.conf                                         |    1
22 files changed, 8930 insertions(+), 0 deletions(-)
diff --git a/config/arm64/default b/config/arm64/default
index fd89e18cf1..ac2ad8b5dd 100644
--- a/config/arm64/default
+++ b/config/arm64/default
@@ -624,6 +624,7 @@ CONFIG_XFRM_MIGRATE=y
CONFIG_XFRM_IPCOMP=m
CONFIG_NET_KEY=m
CONFIG_NET_KEY_MIGRATE=y
+# CONFIG_SMC is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
diff --git a/config/ppc64le/debug b/config/ppc64le/debug
index 02092ffdd3..34536e218b 100644
--- a/config/ppc64le/debug
+++ b/config/ppc64le/debug
@@ -661,6 +661,7 @@ CONFIG_XFRM_MIGRATE=y
CONFIG_XFRM_IPCOMP=m
CONFIG_NET_KEY=y
CONFIG_NET_KEY_MIGRATE=y
+# CONFIG_SMC is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
diff --git a/config/ppc64le/default b/config/ppc64le/default
index c6dd453cfd..cbd525e45f 100644
--- a/config/ppc64le/default
+++ b/config/ppc64le/default
@@ -668,6 +668,7 @@ CONFIG_XFRM_MIGRATE=y
CONFIG_XFRM_IPCOMP=m
CONFIG_NET_KEY=y
CONFIG_NET_KEY_MIGRATE=y
+# CONFIG_SMC is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
diff --git a/config/s390x/default b/config/s390x/default
index c652a381d2..d538ecaedf 100644
--- a/config/s390x/default
+++ b/config/s390x/default
@@ -581,6 +581,7 @@ CONFIG_NET_KEY=m
CONFIG_NET_KEY_MIGRATE=y
CONFIG_IUCV=y
CONFIG_AFIUCV=m
+CONFIG_SMC=m
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
diff --git a/config/x86_64/debug b/config/x86_64/debug
index 67c613bd8c..fa0f103852 100644
--- a/config/x86_64/debug
+++ b/config/x86_64/debug
@@ -841,6 +841,7 @@ CONFIG_XFRM_MIGRATE=y
CONFIG_XFRM_IPCOMP=m
CONFIG_NET_KEY=m
CONFIG_NET_KEY_MIGRATE=y
+# CONFIG_SMC is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
diff --git a/config/x86_64/default b/config/x86_64/default
index 84751441d0..135349ef4f 100644
--- a/config/x86_64/default
+++ b/config/x86_64/default
@@ -846,6 +846,7 @@ CONFIG_XFRM_MIGRATE=y
CONFIG_XFRM_IPCOMP=m
CONFIG_NET_KEY=m
CONFIG_NET_KEY_MIGRATE=y
+# CONFIG_SMC is not set
CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-01.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-01.patch
new file mode 100644
index 0000000000..4765b15a9c
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-01.patch
@@ -0,0 +1,101 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: net: introduce keepalive function in struct proto
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ net: introduce keepalive function in struct proto
+
+ Direct call of tcp_set_keepalive() function from protocol-agnostic
+ sock_setsockopt() function in net/core/sock.c violates network
+ layering. And newly introduced protocol (SMC-R) will need its own
+ keepalive function. Therefore, add "keepalive" function pointer
+ to "struct proto", and call it from sock_setsockopt() via this pointer.
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+ Reviewed-by: Utz Bacher <utz.bacher@de.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ include/net/sock.h | 1 +
+ net/core/sock.c | 7 ++-----
+ net/ipv4/tcp_ipv4.c | 1 +
+ net/ipv4/tcp_timer.c | 1 +
+ net/ipv6/tcp_ipv6.c | 1 +
+ 5 files changed, 6 insertions(+), 5 deletions(-)
+
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -976,6 +976,7 @@ struct proto {
+ int (*getsockopt)(struct sock *sk, int level,
+ int optname, char __user *optval,
+ int __user *option);
++ void (*keepalive)(struct sock *sk, int valbool);
+ #ifdef CONFIG_COMPAT
+ int (*compat_setsockopt)(struct sock *sk,
+ int level,
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -792,11 +792,8 @@ set_rcvbuf:
+ goto set_rcvbuf;
+
+ case SO_KEEPALIVE:
+-#ifdef CONFIG_INET
+- if (sk->sk_protocol == IPPROTO_TCP &&
+- sk->sk_type == SOCK_STREAM)
+- tcp_set_keepalive(sk, valbool);
+-#endif
++ if (sk->sk_prot->keepalive)
++ sk->sk_prot->keepalive(sk, valbool);
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+ break;
+
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -2316,6 +2316,7 @@ struct proto tcp_prot = {
+ .shutdown = tcp_shutdown,
+ .setsockopt = tcp_setsockopt,
+ .getsockopt = tcp_getsockopt,
++ .keepalive = tcp_set_keepalive,
+ .recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
+ .sendpage = tcp_sendpage,
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -569,6 +569,7 @@ void tcp_set_keepalive(struct sock *sk,
+ else if (!val)
+ inet_csk_delete_keepalive_timer(sk);
+ }
++EXPORT_SYMBOL(tcp_set_keepalive);
+
+
+ static void tcp_keepalive_timer (unsigned long data)
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -1884,6 +1884,7 @@ struct proto tcpv6_prot = {
+ .rsk_prot = &tcp6_request_sock_ops,
+ .h.hashinfo = &tcp_hashinfo,
+ .no_autobind = true,
++ .keepalive = tcp_set_keepalive,
+ #ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_tcp_setsockopt,
+ .compat_getsockopt = compat_tcp_getsockopt,
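The hook above changes only the in-kernel dispatch path; the userspace API is untouched. As a minimal illustration (a hedged sketch, not part of the patch), enabling SO_KEEPALIVE on an ordinary TCP socket now reaches tcp_set_keepalive() through the new sk->sk_prot->keepalive pointer rather than the removed hard-coded CONFIG_INET branch in sock_setsockopt():

#include <stdio.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int one = 1;
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

        if (fd < 0)
                return 1;
        /* kernel path after this patch:
         * sock_setsockopt(SO_KEEPALIVE) -> sk->sk_prot->keepalive(sk, 1),
         * which is tcp_set_keepalive() for tcp_prot/tcpv6_prot
         */
        if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one)) < 0)
                perror("setsockopt");
        close(fd);
        return 0;
}

This indirection is what lets the SMC code in the next patch supply its own smc_set_keepalive() without touching net/core again.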
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-02.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-02.patch
new file mode 100644
index 0000000000..b4f0fb8315
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-02.patch
@@ -0,0 +1,802 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: establish new socket family
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: establish new socket family
+
+ * enable smc module loading and unloading
+ * register new socket family
+ * basic smc socket creation and deletion
+ * use backing TCP socket to run CLC (Connection Layer Control)
+ handshake of SMC protocol
+ * Setup for infiniband traffic is implemented in follow-on patches.
+ For now fallback to TCP socket is always used.
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+ Reviewed-by: Utz Bacher <utz.bacher@de.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ MAINTAINERS | 7
+ include/linux/socket.h | 7
+ net/Kconfig | 1
+ net/Makefile | 1
+ net/smc/Kconfig | 11
+ net/smc/Makefile | 2
+ net/smc/af_smc.c | 631 +++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc.h | 37 ++
+ 8 files changed, 696 insertions(+), 1 deletion(-)
+
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -9368,6 +9368,13 @@ L: linux-serial@vger.kernel.org
+ S: Maintained
+ F: drivers/tty/serial/
+
++SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS
++M: Ursula Braun <ursula.braun@linux.vnet.ibm.com>
++L: linux-s390@vger.kernel.org
++W: http://www.ibm.com/developerworks/linux/linux390/
++S: Supported
++F: net/smc/
++
+ SYNOPSYS DESIGNWARE DMAC DRIVER
+ M: Viresh Kumar <vireshk@kernel.org>
+ M: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+--- a/include/linux/socket.h
++++ b/include/linux/socket.h
+@@ -200,7 +200,11 @@ struct ucred {
+ #define AF_ALG 38 /* Algorithm sockets */
+ #define AF_NFC 39 /* NFC sockets */
+ #define AF_VSOCK 40 /* vSockets */
+-#define AF_MAX 41 /* For now.. */
++#define AF_SMC 41 /* smc sockets: reserve number for
++ * PF_SMC protocol family that
++ * reuses AF_INET address family
++ */
++#define AF_MAX 42 /* For now.. */
+
+ /* Protocol families, same as address families. */
+ #define PF_UNSPEC AF_UNSPEC
+@@ -246,6 +250,7 @@ struct ucred {
+ #define PF_ALG AF_ALG
+ #define PF_NFC AF_NFC
+ #define PF_VSOCK AF_VSOCK
++#define PF_SMC AF_SMC
+ #define PF_MAX AF_MAX
+
+ /* Maximum queue length specifiable by listen. */
+--- a/net/Kconfig
++++ b/net/Kconfig
+@@ -54,6 +54,7 @@ source "net/packet/Kconfig"
+ source "net/unix/Kconfig"
+ source "net/xfrm/Kconfig"
+ source "net/iucv/Kconfig"
++source "net/smc/Kconfig"
+
+ config INET
+ bool "TCP/IP networking"
+--- a/net/Makefile
++++ b/net/Makefile
+@@ -49,6 +49,7 @@ obj-$(CONFIG_MAC80211) += mac80211/
+ obj-$(CONFIG_TIPC) += tipc/
+ obj-$(CONFIG_NETLABEL) += netlabel/
+ obj-$(CONFIG_IUCV) += iucv/
++obj-$(CONFIG_SMC) += smc/
+ obj-$(CONFIG_RFKILL) += rfkill/
+ obj-$(CONFIG_NET_9P) += 9p/
+ obj-$(CONFIG_CAIF) += caif/
+--- /dev/null
++++ b/net/smc/Kconfig
+@@ -0,0 +1,11 @@
++config SMC
++ tristate "SMC socket protocol family"
++ depends on INET && INFINIBAND
++ ---help---
++ SMC-R provides a "sockets over RDMA" solution making use of
++ RDMA over Converged Ethernet (RoCE) technology to upgrade
++ AF_INET TCP connections transparently.
++ The Linux implementation of the SMC-R solution is designed as
++ a separate socket family SMC.
++
++ Select this option if you want to run SMC socket applications
+--- /dev/null
++++ b/net/smc/Makefile
+@@ -0,0 +1,2 @@
++obj-$(CONFIG_SMC) += smc.o
++smc-y := af_smc.o
+--- /dev/null
++++ b/net/smc/af_smc.c
+@@ -0,0 +1,631 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * AF_SMC protocol family socket handler keeping the AF_INET sock address type
++ * applies to SOCK_STREAM sockets only
++ * offers an alternative communication option for TCP-protocol sockets
++ * applicable with RoCE-cards only
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
++ * based on prototype from Frank Blaschka
++ */
++
++#define KMSG_COMPONENT "smc"
++#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
++
++#include <linux/module.h>
++#include <linux/socket.h>
++#include <net/sock.h>
++
++#include "smc.h"
++
++static void smc_set_keepalive(struct sock *sk, int val)
++{
++ struct smc_sock *smc = smc_sk(sk);
++
++ smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
++}
++
++static struct proto smc_proto = {
++ .name = "SMC",
++ .owner = THIS_MODULE,
++ .keepalive = smc_set_keepalive,
++ .obj_size = sizeof(struct smc_sock),
++ .slab_flags = SLAB_DESTROY_BY_RCU,
++};
++
++static int smc_release(struct socket *sock)
++{
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++
++ if (!sk || (sk->sk_state == SMC_DESTRUCT))
++ goto out;
++
++ smc = smc_sk(sk);
++ lock_sock(sk);
++
++ sk->sk_state = SMC_CLOSED;
++ if (smc->clcsock) {
++ sock_release(smc->clcsock);
++ smc->clcsock = NULL;
++ }
++
++ /* detach socket */
++ sock_set_flag(sk, SOCK_ZAPPED);
++ sock_orphan(sk);
++ sock->sk = NULL;
++ release_sock(sk);
++
++ sock_put(sk);
++out:
++ return 0;
++}
++
++static void smc_destruct(struct sock *sk)
++{
++ if (sk->sk_state != SMC_CLOSED) {
++ pr_err("Attempt to release SMC socket in state %d %p\n",
++ sk->sk_state, sk);
++ return;
++ }
++ if (!sock_flag(sk, SOCK_DEAD)) {
++ pr_err("Attempt to release alive smc socket %p\n", sk);
++ return;
++ }
++
++ sk->sk_state = SMC_DESTRUCT;
++
++ sk_refcnt_debug_dec(sk);
++}
++
++static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
++{
++ struct smc_sock *smc;
++ struct sock *sk;
++
++ sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
++ if (!sk)
++ return NULL;
++
++ sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
++ sk->sk_state = SMC_INIT;
++ sk->sk_destruct = smc_destruct;
++ sk->sk_protocol = SMCPROTO_SMC;
++ sk_refcnt_debug_inc(sk);
++
++ smc = smc_sk(sk);
++ smc->clcsock = NULL;
++ smc->use_fallback = 0;
++
++ return sk;
++}
++
++static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
++ int addr_len)
++{
++ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++ int rc;
++
++ smc = smc_sk(sk);
++
++ /* replicate tests from inet_bind(), to be safe wrt. future changes */
++ rc = -EINVAL;
++ if (addr_len < sizeof(struct sockaddr_in))
++ goto out;
++
++ rc = -EAFNOSUPPORT;
++ /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
++ if ((addr->sin_family != AF_INET) &&
++ ((addr->sin_family != AF_UNSPEC) ||
++ (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
++ goto out;
++
++ lock_sock(sk);
++
++ /* Check if socket is already active */
++ rc = -EINVAL;
++ if (sk->sk_state != SMC_INIT)
++ goto out_rel;
++
++ smc->clcsock->sk->sk_reuse = sk->sk_reuse;
++ rc = kernel_bind(smc->clcsock, uaddr, addr_len);
++
++out_rel:
++ release_sock(sk);
++out:
++ return rc;
++}
++
++static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
++ unsigned long mask)
++{
++ /* options we don't get control via setsockopt for */
++ nsk->sk_type = osk->sk_type;
++ nsk->sk_sndbuf = osk->sk_sndbuf;
++ nsk->sk_rcvbuf = osk->sk_rcvbuf;
++ nsk->sk_sndtimeo = osk->sk_sndtimeo;
++ nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
++ nsk->sk_mark = osk->sk_mark;
++ nsk->sk_priority = osk->sk_priority;
++ nsk->sk_rcvlowat = osk->sk_rcvlowat;
++ nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
++ nsk->sk_err = osk->sk_err;
++
++ nsk->sk_flags &= ~mask;
++ nsk->sk_flags |= osk->sk_flags & mask;
++}
++
++#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
++ (1UL << SOCK_KEEPOPEN) | \
++ (1UL << SOCK_LINGER) | \
++ (1UL << SOCK_BROADCAST) | \
++ (1UL << SOCK_TIMESTAMP) | \
++ (1UL << SOCK_DBG) | \
++ (1UL << SOCK_RCVTSTAMP) | \
++ (1UL << SOCK_RCVTSTAMPNS) | \
++ (1UL << SOCK_LOCALROUTE) | \
++ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
++ (1UL << SOCK_RXQ_OVFL) | \
++ (1UL << SOCK_WIFI_STATUS) | \
++ (1UL << SOCK_NOFCS) | \
++ (1UL << SOCK_FILTER_LOCKED))
++/* copy only relevant settings and flags of SOL_SOCKET level from smc to
++ * clc socket (since smc is not called for these options from net/core)
++ */
++static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
++{
++ smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
++}
++
++#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
++ (1UL << SOCK_KEEPOPEN) | \
++ (1UL << SOCK_LINGER) | \
++ (1UL << SOCK_DBG))
++/* copy only settings and flags relevant for smc from clc to smc socket */
++static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
++{
++ smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
++}
++
++static int smc_connect(struct socket *sock, struct sockaddr *addr,
++ int alen, int flags)
++{
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++ int rc = -EINVAL;
++
++ smc = smc_sk(sk);
++
++ /* separate smc parameter checking to be safe */
++ if (alen < sizeof(addr->sa_family))
++ goto out_err;
++ if (addr->sa_family != AF_INET)
++ goto out_err;
++
++ lock_sock(sk);
++ switch (sk->sk_state) {
++ default:
++ goto out;
++ case SMC_ACTIVE:
++ rc = -EISCONN;
++ goto out;
++ case SMC_INIT:
++ rc = 0;
++ break;
++ }
++
++ smc_copy_sock_settings_to_clc(smc);
++ rc = kernel_connect(smc->clcsock, addr, alen, flags);
++ if (rc)
++ goto out;
++
++ sk->sk_state = SMC_ACTIVE;
++
++ /* always use TCP fallback as transport mechanism for now;
++ * This will change once RDMA transport is implemented
++ */
++ smc->use_fallback = 1;
++
++out:
++ release_sock(sk);
++out_err:
++ return rc;
++}
++
++static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
++{
++ struct sock *sk = &lsmc->sk;
++ struct socket *new_clcsock;
++ struct sock *new_sk;
++ int rc;
++
++ new_sk = smc_sock_alloc(sock_net(sk), NULL);
++ if (!new_sk) {
++ rc = -ENOMEM;
++ lsmc->sk.sk_err = ENOMEM;
++ lsmc->sk.sk_state = SMC_CLOSED;
++ *new_smc = NULL;
++ goto out;
++ }
++ *new_smc = smc_sk(new_sk);
++
++ rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
++ if (rc) {
++ sock_put(new_sk);
++ *new_smc = NULL;
++ goto out;
++ }
++
++ (*new_smc)->clcsock = new_clcsock;
++out:
++ return rc;
++}
++
++static int smc_listen(struct socket *sock, int backlog)
++{
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++ int rc;
++
++ smc = smc_sk(sk);
++ lock_sock(sk);
++
++ rc = -EINVAL;
++ if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
++ goto out;
++
++ rc = 0;
++ if (sk->sk_state == SMC_LISTEN) {
++ sk->sk_max_ack_backlog = backlog;
++ goto out;
++ }
++	/* some socket options are handled in core, so we cannot apply
++	 * them to the clc socket -- copy smc socket options to clc socket
++ */
++ smc_copy_sock_settings_to_clc(smc);
++
++ rc = kernel_listen(smc->clcsock, backlog);
++ if (rc)
++ goto out;
++ sk->sk_max_ack_backlog = backlog;
++ sk->sk_ack_backlog = 0;
++ sk->sk_state = SMC_LISTEN;
++
++out:
++ release_sock(sk);
++ return rc;
++}
++
++static int smc_accept(struct socket *sock, struct socket *new_sock,
++ int flags)
++{
++ struct smc_sock *new_smc;
++ struct sock *sk = sock->sk;
++ struct smc_sock *lsmc;
++ int rc;
++
++ lsmc = smc_sk(sk);
++ lock_sock(sk);
++
++ if (lsmc->sk.sk_state != SMC_LISTEN) {
++ rc = -EINVAL;
++ goto out;
++ }
++
++ rc = smc_clcsock_accept(lsmc, &new_smc);
++ if (rc)
++ goto out;
++ sock_graft(&new_smc->sk, new_sock);
++ new_smc->sk.sk_state = SMC_ACTIVE;
++
++ smc_copy_sock_settings_to_smc(new_smc);
++
++ /* always use TCP fallback as transport mechanism for now;
++ * This will change once RDMA transport is implemented
++ */
++ new_smc->use_fallback = 1;
++
++out:
++ release_sock(sk);
++ return rc;
++}
++
++static int smc_getname(struct socket *sock, struct sockaddr *addr,
++ int *len, int peer)
++{
++ struct smc_sock *smc;
++
++ if (peer && (sock->sk->sk_state != SMC_ACTIVE))
++ return -ENOTCONN;
++
++ smc = smc_sk(sock->sk);
++
++ return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
++}
++
++static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
++{
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++ int rc = -EPIPE;
++
++ smc = smc_sk(sk);
++ lock_sock(sk);
++ if (sk->sk_state != SMC_ACTIVE)
++ goto out;
++ if (smc->use_fallback)
++ rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
++ else
++ rc = sock_no_sendmsg(sock, msg, len);
++out:
++ release_sock(sk);
++ return rc;
++}
++
++static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
++ int flags)
++{
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++ int rc = -ENOTCONN;
++
++ smc = smc_sk(sk);
++ lock_sock(sk);
++ if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
++ goto out;
++
++ if (smc->use_fallback)
++ rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
++ else
++ rc = sock_no_recvmsg(sock, msg, len, flags);
++out:
++ release_sock(sk);
++ return rc;
++}
++
++static unsigned int smc_poll(struct file *file, struct socket *sock,
++ poll_table *wait)
++{
++ struct sock *sk = sock->sk;
++ unsigned int mask = 0;
++ struct smc_sock *smc;
++
++ smc = smc_sk(sock->sk);
++ if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) ||
++ smc->use_fallback) {
++ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
++ /* if non-blocking connect finished ... */
++ lock_sock(sk);
++ if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
++ sk->sk_state = SMC_ACTIVE;
++ /* always use TCP fallback as transport mechanism;
++ * This will change once RDMA transport is implemented
++ */
++ smc->use_fallback = 1;
++ }
++ release_sock(sk);
++ } else {
++ mask = sock_no_poll(file, sock, wait);
++ }
++
++ return mask;
++}
++
++static int smc_shutdown(struct socket *sock, int how)
++{
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++ int rc = -EINVAL;
++
++ smc = smc_sk(sk);
++
++ if ((how < SHUT_RD) || (how > SHUT_RDWR))
++ goto out_err;
++
++ lock_sock(sk);
++
++ rc = -ENOTCONN;
++ if (sk->sk_state == SMC_CLOSED)
++ goto out;
++ if (smc->use_fallback) {
++ rc = kernel_sock_shutdown(smc->clcsock, how);
++ sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
++ if (sk->sk_shutdown == SHUTDOWN_MASK)
++ sk->sk_state = SMC_CLOSED;
++ } else {
++ rc = sock_no_shutdown(sock, how);
++ }
++
++out:
++ release_sock(sk);
++
++out_err:
++ return rc;
++}
++
++static int smc_setsockopt(struct socket *sock, int level, int optname,
++ char __user *optval, unsigned int optlen)
++{
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++
++ smc = smc_sk(sk);
++
++ /* generic setsockopts reaching us here always apply to the
++ * CLC socket
++ */
++ return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
++ optval, optlen);
++}
++
++static int smc_getsockopt(struct socket *sock, int level, int optname,
++ char __user *optval, int __user *optlen)
++{
++ struct smc_sock *smc;
++
++ smc = smc_sk(sock->sk);
++ /* socket options apply to the CLC socket */
++ return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
++ optval, optlen);
++}
++
++static int smc_ioctl(struct socket *sock, unsigned int cmd,
++ unsigned long arg)
++{
++ struct smc_sock *smc;
++
++ smc = smc_sk(sock->sk);
++ if (smc->use_fallback)
++ return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
++ else
++ return sock_no_ioctl(sock, cmd, arg);
++}
++
++static ssize_t smc_sendpage(struct socket *sock, struct page *page,
++ int offset, size_t size, int flags)
++{
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++ int rc = -EPIPE;
++
++ smc = smc_sk(sk);
++ lock_sock(sk);
++ if (sk->sk_state != SMC_ACTIVE)
++ goto out;
++ if (smc->use_fallback)
++ rc = kernel_sendpage(smc->clcsock, page, offset,
++ size, flags);
++ else
++ rc = sock_no_sendpage(sock, page, offset, size, flags);
++
++out:
++ release_sock(sk);
++ return rc;
++}
++
++static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
++ struct pipe_inode_info *pipe, size_t len,
++ unsigned int flags)
++{
++ struct sock *sk = sock->sk;
++ struct smc_sock *smc;
++ int rc = -ENOTCONN;
++
++ smc = smc_sk(sk);
++ lock_sock(sk);
++ if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
++ goto out;
++ if (smc->use_fallback) {
++ rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
++ pipe, len, flags);
++ } else {
++ rc = -EOPNOTSUPP;
++ }
++out:
++ release_sock(sk);
++ return rc;
++}
++
++/* must look like tcp */
++static const struct proto_ops smc_sock_ops = {
++ .family = PF_SMC,
++ .owner = THIS_MODULE,
++ .release = smc_release,
++ .bind = smc_bind,
++ .connect = smc_connect,
++ .socketpair = sock_no_socketpair,
++ .accept = smc_accept,
++ .getname = smc_getname,
++ .poll = smc_poll,
++ .ioctl = smc_ioctl,
++ .listen = smc_listen,
++ .shutdown = smc_shutdown,
++ .setsockopt = smc_setsockopt,
++ .getsockopt = smc_getsockopt,
++ .sendmsg = smc_sendmsg,
++ .recvmsg = smc_recvmsg,
++ .mmap = sock_no_mmap,
++ .sendpage = smc_sendpage,
++ .splice_read = smc_splice_read,
++};
++
++static int smc_create(struct net *net, struct socket *sock, int protocol,
++ int kern)
++{
++ struct smc_sock *smc;
++ struct sock *sk;
++ int rc;
++
++ rc = -ESOCKTNOSUPPORT;
++ if (sock->type != SOCK_STREAM)
++ goto out;
++
++ rc = -EPROTONOSUPPORT;
++ if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
++ goto out;
++
++ rc = -ENOBUFS;
++ sock->ops = &smc_sock_ops;
++ sk = smc_sock_alloc(net, sock);
++ if (!sk)
++ goto out;
++
++ /* create internal TCP socket for CLC handshake and fallback */
++ smc = smc_sk(sk);
++ rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
++ IPPROTO_TCP, &smc->clcsock);
++ if (rc)
++ sk_common_release(sk);
++
++out:
++ return rc;
++}
++
++static const struct net_proto_family smc_sock_family_ops = {
++ .family = PF_SMC,
++ .owner = THIS_MODULE,
++ .create = smc_create,
++};
++
++static int __init smc_init(void)
++{
++ int rc;
++
++ rc = proto_register(&smc_proto, 1);
++ if (rc) {
++ pr_err("%s: proto_register fails with %d\n", __func__, rc);
++ goto out;
++ }
++
++ rc = sock_register(&smc_sock_family_ops);
++ if (rc) {
++ pr_err("%s: sock_register fails with %d\n", __func__, rc);
++ goto out_proto;
++ }
++
++ return 0;
++
++out_proto:
++ proto_unregister(&smc_proto);
++out:
++ return rc;
++}
++
++static void __exit smc_exit(void)
++{
++ sock_unregister(PF_SMC);
++ proto_unregister(&smc_proto);
++}
++
++module_init(smc_init);
++module_exit(smc_exit);
++
++MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
++MODULE_DESCRIPTION("smc socket address family");
++MODULE_LICENSE("GPL");
++MODULE_ALIAS_NETPROTO(PF_SMC);
+--- /dev/null
++++ b/net/smc/smc.h
+@@ -0,0 +1,37 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Definitions for the SMC module (socket related)
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ursula.braun@linux.vnet.ibm.com>
++ */
++#ifndef _SMC_H
++#define _SMC_H
++
++#include <linux/socket.h>
++#include <linux/types.h>
++#include <net/sock.h>
++
++#define SMCPROTO_SMC 0 /* SMC protocol */
++
++enum smc_state { /* possible states of an SMC socket */
++ SMC_ACTIVE = 1,
++ SMC_INIT = 2,
++ SMC_CLOSED = 7,
++ SMC_LISTEN = 10,
++ SMC_DESTRUCT = 32
++};
++
++struct smc_sock { /* smc sock container */
++ struct sock sk;
++ struct socket *clcsock; /* internal tcp socket */
++ u8 use_fallback : 1; /* fallback to tcp */
++};
++
++static inline struct smc_sock *smc_sk(const struct sock *sk)
++{
++ return (struct smc_sock *)sk;
++}
++#endif /* _SMC_H */
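To make the handshake flow above concrete, here is a hedged userspace sketch of opening an SMC socket once this module is loaded. AF_SMC (41) may not be present in installed libc headers at this stage, so the value reserved in include/linux/socket.h is defined locally; addressing stays AF_INET as smc_bind()/smc_connect() require, and the address/port below are placeholders:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef AF_SMC
#define AF_SMC 41	/* value reserved by this patch */
#endif

int main(void)
{
        struct sockaddr_in sa;
        int fd = socket(AF_SMC, SOCK_STREAM, 0); /* 0 or IPPROTO_TCP */

        if (fd < 0) {
                perror("socket");	/* e.g. smc module not loaded */
                return 1;
        }
        memset(&sa, 0, sizeof(sa));
        sa.sin_family = AF_INET;	/* SMC reuses AF_INET addresses */
        sa.sin_port = htons(12345);
        sa.sin_addr.s_addr = inet_addr("192.0.2.1");
        /* smc_connect() drives the internal CLC/TCP socket; at this
         * point in the series it always falls back to plain TCP.
         */
        if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                perror("connect");
        close(fd);
        return 0;
}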
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-03.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-03.patch
new file mode 100644
index 0000000000..cf10557494
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-03.patch
@@ -0,0 +1,636 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: establish pnet table management
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: establish pnet table management
+
+ Connection creation with SMC-R starts through an internal
+ TCP-connection. The Ethernet interface for this TCP-connection is not
+ restricted to the Ethernet interface of a RoCE device. Any existing
+ Ethernet interface belonging to the same physical net can be used, as
+ long as there is a defined relation between the Ethernet interface and
+ some RoCE devices. This relation is defined with the help of an
+             identification string called "Physical Net ID", or "pnet ID" for short.
+ Information about defined pnet IDs and their related Ethernet
+ interfaces and RoCE devices is stored in the SMC-R pnet table.
+
+ This patch adds pnet table configuration support as a set of
+ sysfs files listed under /sys/kernel/smc. Attribute files
+ exist to add and delete pnet IDs and to map RoCE devices and
+ ethernet interfaces to an individual pnet ID.
+
+             There is no cross-check that ethernet interfaces or infiniband
+             devices really exist in the system. This allows the pnet table to
+             be configured after module load even when interfaces or devices
+             are not yet available.
+
+ Signed-off-by: Thomas Richter <tmricht@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/Makefile | 2
+ net/smc/af_smc.c | 11 -
+ net/smc/smc_pnet.c | 504 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_pnet.h | 19 +
+ 4 files changed, 533 insertions(+), 3 deletions(-)
+
+--- a/net/smc/Makefile
++++ b/net/smc/Makefile
+@@ -1,2 +1,2 @@
+ obj-$(CONFIG_SMC) += smc.o
+-smc-y := af_smc.o
++smc-y := af_smc.o smc_pnet.o
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -20,6 +20,7 @@
+ #include <net/sock.h>
+
+ #include "smc.h"
++#include "smc_pnet.h"
+
+ static void smc_set_keepalive(struct sock *sk, int val)
+ {
+@@ -596,10 +597,14 @@ static int __init smc_init(void)
+ {
+ int rc;
+
++ rc = smc_pnet_init();
++ if (rc)
++ return rc;
++
+ rc = proto_register(&smc_proto, 1);
+ if (rc) {
+ pr_err("%s: proto_register fails with %d\n", __func__, rc);
+- goto out;
++ goto out_pnet;
+ }
+
+ rc = sock_register(&smc_sock_family_ops);
+@@ -612,7 +617,8 @@ static int __init smc_init(void)
+
+ out_proto:
+ proto_unregister(&smc_proto);
+-out:
++out_pnet:
++ smc_pnet_exit();
+ return rc;
+ }
+
+@@ -620,6 +626,7 @@ static void __exit smc_exit(void)
+ {
+ sock_unregister(PF_SMC);
+ proto_unregister(&smc_proto);
++ smc_pnet_exit();
+ }
+
+ module_init(smc_init);
+--- /dev/null
++++ b/net/smc/smc_pnet.c
+@@ -0,0 +1,504 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Sysfs support functions
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
++ */
++
++#include <linux/device.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/types.h>
++#include <linux/ctype.h>
++
++#include <net/sock.h>
++
++#include <rdma/ib_verbs.h>
++
++#include "smc_pnet.h"
++
++#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
++
++/* Sysfs interface for the pnet table
++ *
++ * Create a directory /sys/kernel/smc/ with these files:
++ * /sys/kernel/smc/pnetid_add --> Create a PNETID
++ * /sys/kernel/smc/pnetid_del --> Delete a PNETID
++ * /sys/kernel/smc/flush --> Delete all PNETIDs
++ * /sys/kernel/smc/pnetids/xxxxx --> Created PNETIDs
++ *
++ * Create PNETID PNET1:
++ * A new file named PNET1 shows up in /sys/kernel/smc/pnetids/.
++ * echo PNET1 > /sys/kernel/smc/pnetid_add
++ *
++ * Display all created PNETIDs:
++ * ls -l /sys/kernel/smc/pnetids
++ *
++ * Delete PNETID PNET1:
++ * File PNET1 is removed from directory /sys/kernel/smc/pnetids/.
++ * echo PNET1 > /sys/kernel/smc/pnetid_del
++ *
++ * Add an ethernet interface to PNETID PNET1:
++ * A leading '+' is optional.
++ * echo "eth enccw0.0.f5f0" > /sys/kernel/smc/pnetids/PNET1
++ *
++ * Add an RDMA device to PNETID PNET1:
++ * A leading '+' is optional
++ * The 3rd field is an optional port. If not specified it defaults to 1.
++ * Currently accepted port numbers are 1 and 2. Other numbers generate an
++ * error.
++ * echo "ib mlx4_0 1" > /sys/kernel/smc/pnetids/PNET1
++ * echo "+ib mlx4_1 2" > /sys/kernel/smc/pnetids/PNET1
++ *
++ * Display all entries belonging to PNETID PNET1:
++ * cat /sys/kernel/smc/pnetids/PNET1
++ *
++ * Delete any entry from PNETID PNET1 with a leading '-':
++ * echo "-ib mlx4_1 2" > /sys/kernel/smc/pnetids/PNET1
++ *
++ * Delete all created PNETIDs at once:
++ * echo - > /sys/kernel/smc/flush
++ *
++ * Neither load balancing nor link failover is supported.
++ * This results in a one-to-one relationship between ethernet interface and
++ * RDMA device including port name. Therefore each pnet identifier maps
++ * one ethernet interface to one RDMA device.
++ */
++
++/**
++ * struct smc_pnettable - SMC sysfs anchor
++ * @kset: SMC sysfs anchor
++ * @pnetids_kobj: Anchor to /sys/kernel/smc/pnetids
++ * @lock: Lock for list action
++ * @pnetlist: List of PNETIDs
++ */
++static struct smc_pnettable {
++ struct kset *kset;
++ struct kobject pnetids_kobj;
++ rwlock_t lock;
++ struct list_head pnetlist;
++} smc_pnettable = {
++ .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist),
++ .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock)
++};
++
++/**
++ * struct smc_pnetentry - pnet identifier name entry
++ * @list: List node.
++ * @attr: Embedded attribute structure
++ * @pnet_name: Pnet identifier name
++ * @if_name: Name of the ethernet interface.
++ * @ib_name: Name of the RDMA device.
++ * @ib_port: RDMA device port number.
++ */
++struct smc_pnetentry {
++ struct list_head list;
++ struct kobj_attribute attr;
++ char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
++ char if_name[IFNAMSIZ];
++ char ib_name[IB_DEVICE_NAME_MAX];
++ u8 ib_port;
++};
++
++#define to_smcpnetentry(a) container_of((a), struct smc_pnetentry, attr)
++
++/* Release /sys/kernel/smc/pnetids and delete all pnetids. This function
++ * is called when the kobject anchor in smc_pnettable.pnetids_kobj is freed.
++ */
++static void smc_pnetid_release(struct kobject *kobj)
++{
++ struct smc_pnetentry *e, *tmp_e;
++
++ write_lock(&smc_pnettable.lock);
++ list_for_each_entry_safe(e, tmp_e, &smc_pnettable.pnetlist, list) {
++ list_del(&e->list);
++ kfree(e);
++ }
++ write_unlock(&smc_pnettable.lock);
++}
++
++static struct kobj_type smc_pnet_ktype = {
++ .release = smc_pnetid_release,
++ .sysfs_ops = &kobj_sysfs_ops
++};
++
++/* Remove an ethernet entry from the PNET table */
++static int smc_pnet_del_eth(struct smc_pnetentry *pnetelem, char *name)
++{
++ int rc = -ENOENT;
++
++ write_lock(&smc_pnettable.lock);
++ if (!strncmp(pnetelem->if_name, name, sizeof(pnetelem->if_name))) {
++ rc = 0;
++ pnetelem->if_name[0] = '\0';
++ }
++ write_unlock(&smc_pnettable.lock);
++ return rc;
++}
++
++/* Add an ethernet entry to the PNET table. Search the complete pnet table to
++ * make sure the same ethernet interface is not listed under different PNET ids.
++ */
++static int smc_pnet_add_eth(struct smc_pnetentry *pnetelem, char *name)
++{
++ struct smc_pnetentry *p;
++ int rc = -EEXIST;
++
++ write_lock(&smc_pnettable.lock);
++ list_for_each_entry(p, &smc_pnettable.pnetlist, list) {
++ if (!strncmp(p->if_name, name, sizeof(p->if_name)))
++ goto out;
++ }
++ if (pnetelem->if_name[0] == '\0') {
++ strncpy(pnetelem->if_name, name, sizeof(pnetelem->if_name));
++ rc = 0;
++ }
++out:
++ write_unlock(&smc_pnettable.lock);
++ return rc;
++}
++
++/* Create an ethernet interface entry. */
++static int smc_pnet_makeeth(struct smc_pnetentry *pnetelem, bool add,
++ char *name)
++{
++ name = skip_spaces(name);
++ if (!dev_valid_name(name))
++ return -EINVAL;
++ return (add) ? smc_pnet_add_eth(pnetelem, name)
++ : smc_pnet_del_eth(pnetelem, name);
++}
++
++/* Check if two RDMA device entries are identical. Use device name and port
++ * number for comparison.
++ */
++static bool smc_pnet_same_ibname(struct smc_pnetentry *a, char *name, u8 ibport)
++{
++ return a->ib_port == ibport &&
++ !strncmp(a->ib_name, name, sizeof(a->ib_name));
++}
++
++/* Add an RDMA device entry to the PNET table */
++static int smc_pnet_add_ib(struct smc_pnetentry *pnetelem, char *name,
++ u8 ibport)
++{
++ struct smc_pnetentry *p;
++ int rc = -EEXIST;
++
++ write_lock(&smc_pnettable.lock);
++ list_for_each_entry(p, &smc_pnettable.pnetlist, list) {
++ if (smc_pnet_same_ibname(p, name, ibport))
++ goto out;
++ }
++ if (pnetelem->ib_name[0] == '\0') {
++ strncpy(pnetelem->ib_name, name, sizeof(pnetelem->ib_name));
++ pnetelem->ib_port = ibport;
++ rc = 0;
++ }
++out:
++ write_unlock(&smc_pnettable.lock);
++ return rc;
++}
++
++/* Remove an RDMA device entry from the PNET table */
++static int smc_pnet_del_ib(struct smc_pnetentry *pnetelem, char *name,
++ u8 ibport)
++{
++ int rc = -ENOENT;
++
++ write_lock(&smc_pnettable.lock);
++ if (smc_pnet_same_ibname(pnetelem, name, ibport)) {
++ rc = 0;
++ pnetelem->ib_name[0] = '\0';
++ pnetelem->ib_port = 0;
++ }
++ write_unlock(&smc_pnettable.lock);
++ return rc;
++}
++
++/* Create an RDMA device entry. Optional port number delimited by blank
++ * from name. Missing port number defaults to 1.
++ */
++static int smc_pnet_makeib(struct smc_pnetentry *pnetelem, bool add, char *name)
++{
++ unsigned int tmp_port = 1;
++ char *portno;
++ int rc;
++
++ name = skip_spaces(name);
++ portno = strchr(name, ' ');
++ if (portno) { /* Port number specified */
++ *portno = '\0';
++ portno = skip_spaces(portno + 1);
++ rc = kstrtouint(portno, 10, &tmp_port);
++ if (rc || tmp_port > SMC_MAX_PORTS || !tmp_port) {
++ rc = -EINVAL;
++ goto out;
++ }
++ }
++ rc = (add) ? smc_pnet_add_ib(pnetelem, name, (u8)tmp_port)
++ : smc_pnet_del_ib(pnetelem, name, (u8)tmp_port);
++out:
++ return rc;
++}
++
++static ssize_t smc_pnetidfile_attr_store(struct kobject *kobj,
++ struct kobj_attribute *ka,
++ const char *buf, size_t len)
++{
++ char *text, *buf_copy;
++ bool add = true;
++ int rc;
++
++ /* Operate on a copy of the buffer, we might modify the string */
++ buf_copy = kstrdup(buf, GFP_KERNEL);
++ if (!buf_copy)
++ return -ENOMEM;
++ text = strim(buf_copy);
++ switch (*text) {
++ case '-':
++ add = false;
++ /* Fall through intended */
++ case '+':
++ ++text;
++ break;
++ }
++ text = skip_spaces(text);
++ rc = -EINVAL;
++ if (!strncmp(text, "ib ", 3))
++ rc = smc_pnet_makeib(to_smcpnetentry(ka), add, text + 3);
++ else if (!strncmp(text, "eth ", 4))
++ rc = smc_pnet_makeeth(to_smcpnetentry(ka), add, text + 4);
++ kfree(buf_copy);
++ return rc ?: len;
++}
++
++/* List all entries of a PNETID. List ethernet entries first followed by
++ * RDMA device entries. Output limited to PAGE_SIZE bytes.
++ */
++static ssize_t smc_pnetidfile_attr_show(struct kobject *kobj,
++ struct kobj_attribute *ka,
++ char *buf)
++{
++ struct smc_pnetentry *pnetelem = to_smcpnetentry(ka);
++
++ read_lock(&smc_pnettable.lock);
++ snprintf(buf, PAGE_SIZE, "eth %s\nib %s %u\n", pnetelem->if_name,
++ pnetelem->ib_name, pnetelem->ib_port);
++ read_unlock(&smc_pnettable.lock);
++ return strlen(buf);
++}
++
++/* Delete a PNETID attribute file in /sys/kernel/smc/pnetids.
++ * Remove the sysfs file first and then remove the node from the list and
++ * release memory.
++ */
++static int smc_pnetid_del_file(char *pnetid)
++{
++ struct smc_pnetentry *e, *tmp_e, *found = NULL;
++
++ write_lock(&smc_pnettable.lock);
++ list_for_each_entry_safe(e, tmp_e, &smc_pnettable.pnetlist, list) {
++ if (!strncmp(e->pnet_name, pnetid, sizeof(e->pnet_name))) {
++ list_del(&e->list);
++ found = e;
++ break;
++ }
++ }
++ write_unlock(&smc_pnettable.lock);
++ if (!found)
++ return -ENOENT;
++ sysfs_remove_file(&smc_pnettable.pnetids_kobj, &found->attr.attr);
++ kfree(found);
++ return 0;
++}
++
++/* Append a PNETID to the end of the list if not already on this list. */
++static int smc_pnet_append_pnetentry(struct smc_pnetentry *new)
++{
++ struct smc_pnetentry *pnetelem;
++ int rc = 0;
++
++ write_lock(&smc_pnettable.lock);
++ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
++ if (!strncmp(pnetelem->pnet_name, new->pnet_name,
++ sizeof(new->pnet_name))) {
++ rc = -EEXIST;
++ goto found;
++ }
++ }
++ list_add_tail(&new->list, &smc_pnettable.pnetlist);
++found:
++ write_unlock(&smc_pnettable.lock);
++ return rc;
++}
++
++/* Add a PNETID attribute file in /sys/kernel/smc/pnetids. */
++static int smc_pnetid_add_file(char *pnetname)
++{
++ struct smc_pnetentry *pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL);
++ struct kobj_attribute *ka;
++ int rc;
++
++ if (!pnetelem)
++ return -ENOMEM;
++ ka = &pnetelem->attr;
++ sysfs_attr_init(&ka->attr);
++ strncpy(pnetelem->pnet_name, pnetname, sizeof(pnetelem->pnet_name));
++ ka->attr.name = pnetelem->pnet_name;
++ ka->attr.mode = S_IWUSR | S_IRUGO;
++ ka->show = smc_pnetidfile_attr_show;
++ ka->store = smc_pnetidfile_attr_store;
++ rc = smc_pnet_append_pnetentry(pnetelem);
++ if (rc)
++ goto outfree;
++ rc = sysfs_create_file_ns(&smc_pnettable.pnetids_kobj, &ka->attr, NULL);
++ if (!rc)
++ return rc;
++ /* sysfs failure, remove node from list */
++ write_lock(&smc_pnettable.lock);
++ list_del(&pnetelem->list);
++ write_unlock(&smc_pnettable.lock);
++outfree:
++ kfree(pnetelem);
++ return rc;
++}
++
++/* The limit for PNETID is 16 characters.
++ * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
++ * Lower case letters are converted to upper case.
++ * Interior blanks should not be used.
++ */
++static bool smc_pnetid_valid(const char *buf, char *pnetid)
++{
++ char *bf = skip_spaces(buf);
++ size_t len = strlen(bf);
++ char *end = bf + len;
++
++ if (!len)
++ return false;
++ while (--end >= bf && isspace(*end))
++ ;
++ if (end - bf >= SMC_MAX_PNET_ID_LEN)
++ return false;
++ while (bf <= end) {
++ if (!isalnum(*bf))
++ return false;
++ *pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
++ bf++;
++ }
++ *pnetid = '\0';
++ return true;
++}
++
++static ssize_t smc_pnetid_store(bool add, const char *buf)
++{
++ char pnetid[SMC_MAX_PNET_ID_LEN + 1];
++
++ if (!smc_pnetid_valid(buf, pnetid))
++ return -EINVAL;
++ return add ? smc_pnetid_add_file(pnetid) : smc_pnetid_del_file(pnetid);
++}
++
++#define SMC_ATTR_WO(_name) \
++ struct kobj_attribute smc_attr_##_name = __ATTR(_name, S_IWUSR, NULL, \
++ smc_##_name##_store)
++
++static ssize_t smc_pnetid_del_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ ssize_t rc = smc_pnetid_store(false, buf);
++
++ return rc ?: count;
++}
++static SMC_ATTR_WO(pnetid_del);
++
++static ssize_t smc_pnetid_add_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ ssize_t rc = smc_pnetid_store(true, buf);
++
++ return rc ?: count;
++}
++static SMC_ATTR_WO(pnetid_add);
++
++/* Delete all PNETIDs. Any string with leading '-' will do.
++ * smc_pnetid_del_file() cannot be called directly, because
++ * sysfs_remove_file() cannot be called while the lock is held. Get the
++ * first entry of the list and remove it. smc_pnetid_del_file() handles
++ * the case when a PNETID has already been deleted in the meantime.
++ */
++static ssize_t smc_flush_store(struct kobject *kobj,
++ struct kobj_attribute *attr,
++ const char *buf, size_t count)
++{
++ struct smc_pnettable *ptr = &smc_pnettable;
++ char pnetname[SMC_MAX_PNET_ID_LEN + 1];
++ struct smc_pnetentry *pnetelem;
++ char *bf = skip_spaces(buf);
++
++ if (*bf != '-')
++ return -EINVAL;
++ do {
++ read_lock(&ptr->lock);
++ pnetelem = list_first_entry_or_null(&ptr->pnetlist,
++ struct smc_pnetentry, list);
++ if (pnetelem)
++ strncpy(pnetname, pnetelem->pnet_name,
++ sizeof(pnetname));
++ read_unlock(&ptr->lock);
++ if (pnetelem)
++ smc_pnetid_del_file(pnetname);
++ } while (pnetelem);
++ return count;
++}
++static SMC_ATTR_WO(flush);
++
++static struct attribute *smc_pnetid_attrs[] = { /* Default SMC attributes */
++ &smc_attr_pnetid_add.attr,
++ &smc_attr_pnetid_del.attr,
++ &smc_attr_flush.attr,
++ NULL
++};
++
++static struct attribute_group smc_attr_group = {
++ .attrs = smc_pnetid_attrs
++};
++
++/* Remove directory tree created under /sys/kernel/smc/. */
++void smc_pnet_exit(void)
++{
++ kobject_put(&smc_pnettable.pnetids_kobj);
++ sysfs_remove_group(&smc_pnettable.kset->kobj, &smc_attr_group);
++ kset_unregister(smc_pnettable.kset);
++}
++
++/* Create directory tree for SMC under /sys/kernel/smc/. */
++int __init smc_pnet_init(void)
++{
++ int rc = -ENOMEM;
++
++ smc_pnettable.kset = kset_create_and_add("smc", NULL, kernel_kobj);
++ if (!smc_pnettable.kset)
++ goto bad0;
++ rc = sysfs_create_group(&smc_pnettable.kset->kobj, &smc_attr_group);
++ if (rc)
++ goto bad1;
++ rc = kobject_init_and_add(&smc_pnettable.pnetids_kobj, &smc_pnet_ktype,
++ &smc_pnettable.kset->kobj, "pnetids");
++ if (rc)
++ goto bad2;
++ return rc;
++
++bad2:
++ sysfs_remove_group(&smc_pnettable.kset->kobj, &smc_attr_group);
++bad1:
++ kset_unregister(smc_pnettable.kset);
++bad0:
++ return rc;
++}
+--- /dev/null
++++ b/net/smc/smc_pnet.h
+@@ -0,0 +1,19 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * PNET table queries
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
++ */
++
++#ifndef _SMC_PNET_H
++#define _SMC_PNET_H
++
++#define SMC_MAX_PORTS 2 /* Max # of ports */
++
++int smc_pnet_init(void) __init;
++void smc_pnet_exit(void);
++
++#endif
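The sysfs interface documented at the top of smc_pnet.c can be driven from a program as well as from the shell. A small hedged sketch, assuming the module is loaded so that /sys/kernel/smc exists; the pnet ID and device names are illustrative only:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* write one value to a sysfs attribute file; returns 0 on success */
static int sysfs_write(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);
        ssize_t n;

        if (fd < 0)
                return -1;
        n = write(fd, val, strlen(val));
        close(fd);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        /* equivalent of: echo PNET1 > /sys/kernel/smc/pnetid_add */
        if (sysfs_write("/sys/kernel/smc/pnetid_add", "PNET1"))
                return 1;
        /* map one ethernet interface and one RoCE device/port to it */
        if (sysfs_write("/sys/kernel/smc/pnetids/PNET1", "eth eth0"))
                return 1;
        return sysfs_write("/sys/kernel/smc/pnetids/PNET1", "ib mlx4_0 1");
}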
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-04.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-04.patch
new file mode 100644
index 0000000000..e0d4d4f530
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-04.patch
@@ -0,0 +1,443 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: introduce SMC as an IB-client
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: introduce SMC as an IB-client
+
+ * create a list of SMC IB-devices (IB-devices mentioned in PNET table)
+ * determine RoCE device and port belonging to used internal TCP interface
+ according to the PNET table
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/Makefile | 2
+ net/smc/af_smc.c | 10 +++
+ net/smc/smc.h | 5 +
+ net/smc/smc_ib.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_ib.h | 40 +++++++++++++
+ net/smc/smc_pnet.c | 98 +++++++++++++++++++++++++++++++++
+ net/smc/smc_pnet.h | 7 ++
+ 7 files changed, 316 insertions(+), 1 deletion(-)
+
+--- a/net/smc/Makefile
++++ b/net/smc/Makefile
+@@ -1,2 +1,2 @@
+ obj-$(CONFIG_SMC) += smc.o
+-smc-y := af_smc.o smc_pnet.o
++smc-y := af_smc.o smc_pnet.o smc_ib.o
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -20,6 +20,7 @@
+ #include <net/sock.h>
+
+ #include "smc.h"
++#include "smc_ib.h"
+ #include "smc_pnet.h"
+
+ static void smc_set_keepalive(struct sock *sk, int val)
+@@ -613,8 +614,16 @@ static int __init smc_init(void)
+ goto out_proto;
+ }
+
++ rc = smc_ib_register_client();
++ if (rc) {
++ pr_err("%s: ib_register fails with %d\n", __func__, rc);
++ goto out_sock;
++ }
++
+ return 0;
+
++out_sock:
++ sock_unregister(PF_SMC);
+ out_proto:
+ proto_unregister(&smc_proto);
+ out_pnet:
+@@ -624,6 +633,7 @@ out_pnet:
+
+ static void __exit smc_exit(void)
+ {
++ smc_ib_unregister_client();
+ sock_unregister(PF_SMC);
+ proto_unregister(&smc_proto);
+ smc_pnet_exit();
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -34,4 +34,9 @@ static inline struct smc_sock *smc_sk(co
+ {
+ return (struct smc_sock *)sk;
+ }
++
++#define SMC_SYSTEMID_LEN 8
++
++extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
++
+ #endif /* _SMC_H */
+--- /dev/null
++++ b/net/smc/smc_ib.c
+@@ -0,0 +1,155 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * IB infrastructure:
++ * Establish SMC-R as an Infiniband Client to be notified about added and
++ * removed IB devices of type RDMA.
++ * Determine device and port characteristics for these IB devices.
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
++ */
++
++#include <linux/random.h>
++#include <rdma/ib_verbs.h>
++
++#include "smc_pnet.h"
++#include "smc_ib.h"
++#include "smc.h"
++
++struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
++ .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
++ .list = LIST_HEAD_INIT(smc_ib_devices.list),
++};
++
++#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%"
++
++u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
++ * identifier
++ */
++
++static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
++{
++ struct net_device *ndev;
++ int rc;
++
++ rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
++ &smcibdev->gid[ibport - 1], NULL);
++ /* the SMC protocol requires specification of the roce MAC address;
++ * if net_device cannot be determined, it can be derived from gid 0
++ */
++ ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
++ if (ndev) {
++ memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
++ } else if (!rc) {
++ memcpy(&smcibdev->mac[ibport - 1][0],
++ &smcibdev->gid[ibport - 1].raw[8], 3);
++ memcpy(&smcibdev->mac[ibport - 1][3],
++ &smcibdev->gid[ibport - 1].raw[13], 3);
++ smcibdev->mac[ibport - 1][0] &= ~0x02;
++ }
++ return rc;
++}
++
++/* Create an identifier unique for this instance of SMC-R.
++ * The MAC-address of the first active registered IB device
++ * plus a random 2-byte number is used to create this identifier.
++ * This name is delivered to the peer during connection initialization.
++ */
++static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
++ u8 ibport)
++{
++ memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
++ sizeof(smcibdev->mac[ibport - 1]));
++ get_random_bytes(&local_systemid[0], 2);
++}
++
++bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
++{
++ return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
++}
++
++int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
++{
++ int rc;
++
++ memset(&smcibdev->pattr[ibport - 1], 0,
++ sizeof(smcibdev->pattr[ibport - 1]));
++ rc = ib_query_port(smcibdev->ibdev, ibport,
++ &smcibdev->pattr[ibport - 1]);
++ if (rc)
++ goto out;
++ smc_ib_fill_gid_and_mac(smcibdev, ibport);
++ if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
++ sizeof(local_systemid)) &&
++ smc_ib_port_active(smcibdev, ibport))
++ /* create unique system identifier */
++ smc_ib_define_local_systemid(smcibdev, ibport);
++out:
++ return rc;
++}
++
++static struct ib_client smc_ib_client;
++
++/* callback function for ib_register_client() */
++static void smc_ib_add_dev(struct ib_device *ibdev)
++{
++ struct smc_ib_device *smcibdev;
++ int i;
++
++ if (ibdev->node_type != RDMA_NODE_IB_CA)
++ return;
++
++ smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
++ if (!smcibdev)
++ return;
++
++ smcibdev->ibdev = ibdev;
++
++ for (i = 1; i <= SMC_MAX_PORTS; i++) {
++ if (smc_pnet_exists_in_table(smcibdev, i) &&
++ !smcibdev->initialized) {
++ /* dev hotplug: ib device and port is in pnet table */
++ if (smc_ib_remember_port_attr(smcibdev, i)) {
++ kfree(smcibdev);
++ return;
++ }
++ smcibdev->initialized = 1;
++ break;
++ }
++ }
++ spin_lock(&smc_ib_devices.lock);
++ list_add_tail(&smcibdev->list, &smc_ib_devices.list);
++ spin_unlock(&smc_ib_devices.lock);
++ ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
++}
++
++/* callback function for ib_register_client() */
++static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
++{
++ struct smc_ib_device *smcibdev;
++
++ smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
++ ib_set_client_data(ibdev, &smc_ib_client, NULL);
++ spin_lock(&smc_ib_devices.lock);
++ list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
++ spin_unlock(&smc_ib_devices.lock);
++ kfree(smcibdev);
++}
++
++static struct ib_client smc_ib_client = {
++ .name = "smc_ib",
++ .add = smc_ib_add_dev,
++ .remove = smc_ib_remove_dev,
++};
++
++int __init smc_ib_register_client(void)
++{
++ return ib_register_client(&smc_ib_client);
++}
++
++void __exit smc_ib_unregister_client(void)
++{
++ ib_unregister_client(&smc_ib_client);
++}
+--- /dev/null
++++ b/net/smc/smc_ib.h
+@@ -0,0 +1,40 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Definitions for IB environment
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ursula.braun@linux.vnet.ibm.com>
++ */
++
++#ifndef _SMC_IB_H
++#define _SMC_IB_H
++
++#include <rdma/ib_verbs.h>
++
++#define SMC_MAX_PORTS 2 /* Max # of ports */
++#define SMC_GID_SIZE sizeof(union ib_gid)
++
++struct smc_ib_devices { /* list of smc ib devices definition */
++ struct list_head list;
++ spinlock_t lock; /* protects list of smc ib devices */
++};
++
++extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
++
++struct smc_ib_device { /* ib-device infos for smc */
++ struct list_head list;
++ struct ib_device *ibdev;
++ struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
++ char mac[SMC_MAX_PORTS][6]; /* mac address per port*/
++ union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
++ u8 initialized : 1; /* ib dev CQ, evthdl done */
++};
++
++int __init smc_ib_register_client(void);
++void __exit smc_ib_unregister_client(void);
++bool smc_ib_port_active(struct smc_ib_device *, u8);
++int smc_ib_remember_port_attr(struct smc_ib_device *, u8);
++
++#endif
+--- a/net/smc/smc_pnet.c
++++ b/net/smc/smc_pnet.c
+@@ -18,6 +18,7 @@
+
+ #include <rdma/ib_verbs.h>
+
++#include "smc_ib.h"
+ #include "smc_pnet.h"
+
+ #define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
+@@ -185,6 +186,8 @@ static bool smc_pnet_same_ibname(struct
+ static int smc_pnet_add_ib(struct smc_pnetentry *pnetelem, char *name,
+ u8 ibport)
+ {
++ struct smc_ib_device *smcibdev = NULL;
++ struct smc_ib_device *dev;
+ struct smc_pnetentry *p;
+ int rc = -EEXIST;
+
+@@ -196,10 +199,32 @@ static int smc_pnet_add_ib(struct smc_pn
+ if (pnetelem->ib_name[0] == '\0') {
+ strncpy(pnetelem->ib_name, name, sizeof(pnetelem->ib_name));
+ pnetelem->ib_port = ibport;
++ spin_lock(&smc_ib_devices.lock);
++ /* using string ib_name, search smcibdev in global list */
++ list_for_each_entry(dev, &smc_ib_devices.list, list) {
++ if (!strncmp(dev->ibdev->name, pnetelem->ib_name,
++ sizeof(pnetelem->ib_name))) {
++ smcibdev = dev;
++ break;
++ }
++ }
++ spin_unlock(&smc_ib_devices.lock);
+ rc = 0;
+ }
+ out:
+ write_unlock(&smc_pnettable.lock);
++ if (smcibdev && !smcibdev->initialized) {
++ /* ib dev already existed [dev coldplug].
++ * Complements: smc_ib_add_dev() [dev hotplug],
++ * smc_ib_global_event_handler() [port hotplug].
++ * Function call chain can sleep so outside of our locks.
++ */
++ rc = smc_ib_remember_port_attr(smcibdev,
++ pnetelem->ib_port);
++ if (rc)
++ return rc;
++ smcibdev->initialized = 1;
++ }
+ return rc;
+ }
+
+@@ -502,3 +527,76 @@ bad1:
+ bad0:
+ return rc;
+ }
++
++/* Scan the pnet table and find an IB device given the pnetid entry.
++ * Return infiniband device and port number if an active port is found.
++ * This function is called under smc_pnettable.lock.
++ */
++static void smc_pnet_ib_dev_by_pnet(struct smc_pnetentry *pnetelem,
++ struct smc_ib_device **smcibdev, u8 *ibport)
++{
++ struct smc_ib_device *dev;
++
++ *smcibdev = NULL;
++ *ibport = 0;
++ spin_lock(&smc_ib_devices.lock);
++ /* using string pnetelem->ib_name, search ibdev in global list */
++ list_for_each_entry(dev, &smc_ib_devices.list, list) {
++ if (!strncmp(dev->ibdev->name, pnetelem->ib_name,
++ sizeof(pnetelem->ib_name)) &&
++ smc_ib_port_active(dev, pnetelem->ib_port)) {
++ *smcibdev = dev;
++ *ibport = pnetelem->ib_port;
++ break;
++ }
++ }
++ spin_unlock(&smc_ib_devices.lock);
++}
++
++/* PNET table analysis for a given sock:
++ * determine the ib_device and port belonging to the ethernet
++ * interface used by the internal TCP socket.
++ */
++void smc_pnet_find_roce_resource(struct sock *sk,
++ struct smc_ib_device **smcibdev, u8 *ibport)
++{
++ struct dst_entry *dst = sk_dst_get(sk);
++ struct smc_pnetentry *pnetelem;
++
++ *smcibdev = NULL;
++ *ibport = 0;
++
++ if (!dst)
++ return;
++ if (!dst->dev)
++ goto out_rel;
++ read_lock(&smc_pnettable.lock);
++ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
++ if (!strncmp(dst->dev->name, pnetelem->if_name, IFNAMSIZ)) {
++ smc_pnet_ib_dev_by_pnet(pnetelem, smcibdev, ibport);
++ break;
++ }
++ }
++ read_unlock(&smc_pnettable.lock);
++out_rel:
++ dst_release(dst);
++}
++
++/* Returns true if a specific ib_device and port is in the PNET table. */
++bool smc_pnet_exists_in_table(struct smc_ib_device *smcibdev, u8 ibport)
++{
++ struct smc_pnetentry *pnetelem;
++ bool rc = false;
++
++ read_lock(&smc_pnettable.lock);
++ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
++ if (!strncmp(smcibdev->ibdev->name, pnetelem->ib_name,
++ IB_DEVICE_NAME_MAX) &&
++ ibport == pnetelem->ib_port) {
++ rc = true;
++ break;
++ }
++ }
++ read_unlock(&smc_pnettable.lock);
++ return rc;
++}
+--- a/net/smc/smc_pnet.h
++++ b/net/smc/smc_pnet.h
+@@ -13,6 +13,13 @@
+
+ #define SMC_MAX_PORTS 2 /* Max # of ports */
+
++#include <net/sock.h>
++
++struct smc_ib_device;
++
++bool smc_pnet_exists_in_table(struct smc_ib_device *, u8);
++void smc_pnet_find_roce_resource(struct sock *, struct smc_ib_device **, u8 *);
++
+ int smc_pnet_init(void) __init;
+ void smc_pnet_exit(void);
+
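
The lookup chain completed here runs ethernet interface name, to PNET table entry, to RoCE device and port, with smc_pnet_find_roce_resource() on top. A self-contained user-space mock of that chain, purely for illustration (mock_pnetentry and lookup() are simplified inventions, not the kernel types):

#include <stdio.h>
#include <string.h>

#define IFNAMSIZ 16
#define IB_NAME_MAX 64

struct mock_pnetentry {            /* simplified stand-in for smc_pnetentry */
	char if_name[IFNAMSIZ];    /* ethernet interface of the TCP socket */
	char ib_name[IB_NAME_MAX]; /* RoCE device in the same PNETID */
	unsigned char ib_port;
};

/* mirrors the smc_pnet_find_roce_resource() idea: map the interface that
 * carries the TCP connection to an IB device/port via the table
 */
static const struct mock_pnetentry *lookup(const struct mock_pnetentry *tbl,
					   int n, const char *if_name)
{
	for (int i = 0; i < n; i++)
		if (!strncmp(tbl[i].if_name, if_name, IFNAMSIZ))
			return &tbl[i];
	return NULL;
}

int main(void)
{
	struct mock_pnetentry tbl[] = { { "eth0", "mlx4_0", 1 } };
	const struct mock_pnetentry *e = lookup(tbl, 1, "eth0");

	if (e)
		printf("use %s port %u for RDMA\n", e->ib_name, e->ib_port);
	return 0;
}

The kernel version additionally holds the pnet table read lock and the smc_ib_devices spinlock while walking the lists, as the hunks above show.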
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-05.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-05.patch
new file mode 100644
index 0000000000..dbb9ac73f6
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-05.patch
@@ -0,0 +1,1064 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: CLC handshake (incl. preparation steps)
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: CLC handshake (incl. preparation steps)
+
+ * CLC (Connection Layer Control) handshake
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/Makefile | 2
+ net/smc/af_smc.c | 462 ++++++++++++++++++++++++++++++++++++++++++++++++++----
+ net/smc/smc.h | 42 ++++
+ net/smc/smc_clc.c | 248 ++++++++++++++++++++++++++++
+ net/smc/smc_clc.h | 111 ++++++++++++
+ 5 files changed, 835 insertions(+), 30 deletions(-)
+
+--- a/net/smc/Makefile
++++ b/net/smc/Makefile
+@@ -1,2 +1,2 @@
+ obj-$(CONFIG_SMC) += smc.o
+-smc-y := af_smc.o smc_pnet.o smc_ib.o
++smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -6,6 +6,13 @@
+ * offers an alternative communication option for TCP-protocol sockets
+ * applicable with RoCE-cards only
+ *
++ * Initial restrictions:
++ * - non-blocking connect postponed
++ * - IPv6 support postponed
++ * - support for alternate links postponed
++ * - partial support for non-blocking sockets only
++ * - support for urgent data postponed
++ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+@@ -17,12 +24,17 @@
+
+ #include <linux/module.h>
+ #include <linux/socket.h>
++#include <linux/inetdevice.h>
+ #include <net/sock.h>
++#include <net/tcp.h>
+
+ #include "smc.h"
++#include "smc_clc.h"
+ #include "smc_ib.h"
+ #include "smc_pnet.h"
+
++static void smc_tcp_listen_worker(struct work_struct *);
++
+ static void smc_set_keepalive(struct sock *sk, int val)
+ {
+ struct smc_sock *smc = smc_sk(sk);
+@@ -47,6 +59,7 @@ static int smc_release(struct socket *so
+ goto out;
+
+ smc = smc_sk(sk);
++ sock_hold(sk);
+ lock_sock(sk);
+
+ sk->sk_state = SMC_CLOSED;
+@@ -101,6 +114,11 @@ static struct sock *smc_sock_alloc(struc
+ smc = smc_sk(sk);
+ smc->clcsock = NULL;
+ smc->use_fallback = 0;
++ smc->addr = NULL;
++ smc->listen_smc = NULL;
++ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_worker);
++ INIT_LIST_HEAD(&smc->accept_q);
++ spin_lock_init(&smc->accept_q_lock);
+
+ return sk;
+ }
+@@ -194,6 +212,123 @@ static void smc_copy_sock_settings_to_sm
+ smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
+ }
+
++/* determine subnet and mask of internal TCP socket */
++int smc_netinfo_by_tcpsk(struct socket *clcsock,
++ __be32 *subnet, u8 *prefix_len)
++{
++ struct dst_entry *dst = sk_dst_get(clcsock->sk);
++ struct sockaddr_in addr;
++ int rc = -ENOENT;
++ int len;
++
++ if (!dst) {
++ rc = -ENOTCONN;
++ goto out;
++ }
++ if (!dst->dev) {
++ rc = -ENODEV;
++ goto out_rel;
++ }
++
++ /* get address to which the internal TCP socket is bound */
++ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
++ /* analyze IPv4 specific data of net_device belonging to TCP socket */
++ for_ifa(dst->dev->ip_ptr) {
++ if (ifa->ifa_address != addr.sin_addr.s_addr)
++ continue;
++ *prefix_len = inet_mask_len(ifa->ifa_mask);
++ *subnet = ifa->ifa_address & ifa->ifa_mask;
++ rc = 0;
++ break;
++ } endfor_ifa(dst->dev->ip_ptr);
++
++out_rel:
++ dst_release(dst);
++out:
++ return rc;
++}
++
++/* setup for RDMA connection of client */
++static int smc_connect_rdma(struct smc_sock *smc)
++{
++ struct smc_clc_msg_accept_confirm aclc;
++ struct smc_ib_device *smcibdev;
++ int reason_code = 0;
++ int rc = 0;
++ u8 ibport;
++
++ if (smc->clc_started)
++ return rc;
++ smc->clc_started = 1;
++
++ /* IPSec connections opt out of SMC-R optimizations */
++ if (using_ipsec(smc)) {
++ reason_code = SMC_CLC_DECL_IPSEC;
++ goto decline_rdma;
++ }
++
++ /* PNET table look up: search active ib_device and port
++ * within same PNETID that also contains the ethernet device
++ * used for the internal TCP socket
++ */
++ smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
++ if (!smcibdev) {
++ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
++ goto decline_rdma;
++ }
++
++ /* do inband token exchange */
++ reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
++ if (reason_code < 0) {
++ rc = reason_code;
++ goto out_err;
++ }
++ if (reason_code > 0) /* configuration error */
++ goto decline_rdma;
++ /* receive SMC Accept CLC message */
++ reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
++ SMC_CLC_ACCEPT);
++ if (reason_code < 0) {
++ rc = reason_code;
++ goto out_err;
++ }
++ if (reason_code > 0)
++ goto decline_rdma;
++
++ /* tbd in follow-on patch: more steps to set up RDMA communication,
++ * create connection, link group, link
++ */
++
++ /* tbd in follow-on patch: more steps to set up RDMA communication,
++ * create rmbs, map rmbs, rtoken_handling, modify_qp
++ */
++
++ rc = smc_clc_send_confirm(smc);
++ if (rc)
++ goto out_err;
++
++ /* tbd in follow-on patch: llc_confirm */
++
++out_connected:
++ smc_copy_sock_settings_to_clc(smc);
++ smc->sk.sk_state = SMC_ACTIVE;
++
++ return rc;
++
++decline_rdma:
++ /* RDMA setup failed, switch back to TCP */
++ smc->use_fallback = 1;
++ if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
++ rc = smc_clc_send_decline(smc, reason_code, 0);
++ if (rc < sizeof(struct smc_clc_msg_decline))
++ goto out_err;
++ }
++ goto out_connected;
++
++out_err:
++ return rc;
++}
++
+ static int smc_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+ {
+@@ -208,6 +343,7 @@ static int smc_connect(struct socket *so
+ goto out_err;
+ if (addr->sa_family != AF_INET)
+ goto out_err;
++ smc->addr = addr; /* needed for nonblocking connect */
+
+ lock_sock(sk);
+ switch (sk->sk_state) {
+@@ -226,12 +362,12 @@ static int smc_connect(struct socket *so
+ if (rc)
+ goto out;
+
+- sk->sk_state = SMC_ACTIVE;
+-
+- /* always use TCP fallback as transport mechanism for now;
+- * This will change once RDMA transport is implemented
+- */
+- smc->use_fallback = 1;
++ /* setup RDMA connection */
++ rc = smc_connect_rdma(smc);
++ if (rc < 0)
++ goto out;
++ else
++ rc = 0; /* success cases including fallback */
+
+ out:
+ release_sock(sk);
+@@ -246,18 +382,29 @@ static int smc_clcsock_accept(struct smc
+ struct sock *new_sk;
+ int rc;
+
++ release_sock(&lsmc->sk);
+ new_sk = smc_sock_alloc(sock_net(sk), NULL);
+ if (!new_sk) {
+ rc = -ENOMEM;
+ lsmc->sk.sk_err = ENOMEM;
+ lsmc->sk.sk_state = SMC_CLOSED;
+ *new_smc = NULL;
++ lock_sock(&lsmc->sk);
+ goto out;
+ }
+ *new_smc = smc_sk(new_sk);
+
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+- if (rc) {
++ lock_sock(&lsmc->sk);
++ if (rc < 0) {
++ lsmc->sk.sk_err = -rc;
++ sock_put(new_sk);
++ *new_smc = NULL;
++ goto out;
++ }
++ if (lsmc->sk.sk_state == SMC_CLOSED) {
++ if (new_clcsock)
++ sock_release(new_clcsock);
+ sock_put(new_sk);
+ *new_smc = NULL;
+ goto out;
+@@ -268,6 +415,216 @@ out:
+ return rc;
+ }
+
++/* add a just created sock to the accept queue of the listen sock as
++ * candidate for a following socket accept call from user space
++ */
++static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
++{
++ struct smc_sock *par = smc_sk(parent);
++
++ sock_hold(sk);
++ spin_lock(&par->accept_q_lock);
++ list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
++ spin_unlock(&par->accept_q_lock);
++ sk_acceptq_added(parent);
++}
++
++/* remove a socket from the accept queue of its parental listening socket */
++static void smc_accept_unlink(struct sock *sk)
++{
++ struct smc_sock *par = smc_sk(sk)->listen_smc;
++
++ spin_lock(&par->accept_q_lock);
++ list_del_init(&smc_sk(sk)->accept_q);
++ spin_unlock(&par->accept_q_lock);
++ sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
++ sock_put(sk);
++}
++
++/* remove a sock from the accept queue to bind it to a new socket created
++ * for a socket accept call from user space
++ */
++static struct sock *smc_accept_dequeue(struct sock *parent,
++ struct socket *new_sock)
++{
++ struct smc_sock *isk, *n;
++ struct sock *new_sk;
++
++ list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
++ new_sk = (struct sock *)isk;
++
++ smc_accept_unlink(new_sk);
++ if (new_sk->sk_state == SMC_CLOSED) {
++ /* tbd in follow-on patch: close this sock */
++ continue;
++ }
++ if (new_sock)
++ sock_graft(new_sk, new_sock);
++ return new_sk;
++ }
++ return NULL;
++}
++
++/* clean up for a created but never accepted sock */
++static void smc_destruct_non_accepted(struct sock *sk)
++{
++ struct smc_sock *smc = smc_sk(sk);
++
++ sock_hold(sk);
++ if (smc->clcsock) {
++ struct socket *tcp;
++
++ tcp = smc->clcsock;
++ smc->clcsock = NULL;
++ sock_release(tcp);
++ }
++ /* more closing stuff to be added with socket closing patch */
++ sock_put(sk);
++}
++
++/* setup for RDMA connection of server */
++static void smc_listen_worker(struct work_struct *work)
++{
++ struct smc_sock *new_smc = container_of(work, struct smc_sock,
++ smc_listen_work);
++ struct socket *newclcsock = new_smc->clcsock;
++ struct smc_sock *lsmc = new_smc->listen_smc;
++ struct smc_clc_msg_accept_confirm cclc;
++ struct sock *newsmcsk = &new_smc->sk;
++ struct smc_clc_msg_proposal pclc;
++ struct smc_ib_device *smcibdev;
++ struct sockaddr_in peeraddr;
++ int reason_code = 0;
++ int rc = 0, len;
++ __be32 subnet;
++ u8 prefix_len;
++ u8 ibport;
++
++ /* do inband token exchange -
++ * wait for and receive SMC Proposal CLC message
++ */
++ reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
++ SMC_CLC_PROPOSAL);
++ if (reason_code < 0)
++ goto out_err;
++ if (reason_code > 0)
++ goto decline_rdma;
++
++ /* IPSec connections opt out of SMC-R optimizations */
++ if (using_ipsec(new_smc)) {
++ reason_code = SMC_CLC_DECL_IPSEC;
++ goto decline_rdma;
++ }
++
++ /* PNET table look up: search active ib_device and port
++ * within same PNETID that also contains the ethernet device
++ * used for the internal TCP socket
++ */
++ smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
++ if (!smcibdev) {
++ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
++ goto decline_rdma;
++ }
++
++ /* determine subnet and mask from internal TCP socket */
++ rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
++ if (rc) {
++ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
++ goto decline_rdma;
++ }
++ if ((pclc.outgoing_subnet != subnet) ||
++ (pclc.prefix_len != prefix_len)) {
++ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
++ goto decline_rdma;
++ }
++
++ /* get address of the peer connected to the internal TCP socket */
++ kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
++
++ /* tbd in follow-on patch: more steps to set up RDMA communication,
++ * create connection, link_group, link
++ */
++
++ /* tbd in follow-on patch: more steps to set up RDMA communication,
++ * create rmbs, map rmbs
++ */
++
++ rc = smc_clc_send_accept(new_smc);
++ if (rc)
++ goto out_err;
++
++ /* receive SMC Confirm CLC message */
++ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
++ SMC_CLC_CONFIRM);
++ if (reason_code < 0)
++ goto out_err;
++ if (reason_code > 0)
++ goto decline_rdma;
++
++ /* tbd in follow-on patch: more steps to set up RDMA communication,
++ * rtoken_handling, modify_qp
++ */
++
++out_connected:
++ sk_refcnt_debug_inc(newsmcsk);
++ newsmcsk->sk_state = SMC_ACTIVE;
++enqueue:
++ lock_sock(&lsmc->sk);
++ if (lsmc->sk.sk_state == SMC_LISTEN) {
++ smc_accept_enqueue(&lsmc->sk, newsmcsk);
++ } else { /* no longer listening */
++ smc_destruct_non_accepted(newsmcsk);
++ }
++ release_sock(&lsmc->sk);
++
++ /* Wake up accept */
++ lsmc->sk.sk_data_ready(&lsmc->sk);
++ sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_worker */
++ return;
++
++decline_rdma:
++ /* RDMA setup failed, switch back to TCP */
++ new_smc->use_fallback = 1;
++ if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
++ rc = smc_clc_send_decline(new_smc, reason_code, 0);
++ if (rc < sizeof(struct smc_clc_msg_decline))
++ goto out_err;
++ }
++ goto out_connected;
++
++out_err:
++ newsmcsk->sk_state = SMC_CLOSED;
++ goto enqueue; /* queue new sock with sk_err set */
++}
++
++static void smc_tcp_listen_worker(struct work_struct *work)
++{
++ struct smc_sock *lsmc = container_of(work, struct smc_sock,
++ tcp_listen_work);
++ struct smc_sock *new_smc;
++ int rc = 0;
++
++ lock_sock(&lsmc->sk);
++ while (lsmc->sk.sk_state == SMC_LISTEN) {
++ rc = smc_clcsock_accept(lsmc, &new_smc);
++ if (rc)
++ goto out;
++ if (!new_smc)
++ continue;
++
++ new_smc->listen_smc = lsmc;
++ new_smc->use_fallback = 0; /* assume rdma capability first */
++ sock_hold(&lsmc->sk); /* sock_put in smc_listen_worker */
++ INIT_WORK(&new_smc->smc_listen_work, smc_listen_worker);
++ smc_copy_sock_settings_to_smc(new_smc);
++ schedule_work(&new_smc->smc_listen_work);
++ }
++
++out:
++ release_sock(&lsmc->sk);
++ lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
++}
++
+ static int smc_listen(struct socket *sock, int backlog)
+ {
+ struct sock *sk = sock->sk;
+@@ -297,6 +654,8 @@ static int smc_listen(struct socket *soc
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+ sk->sk_state = SMC_LISTEN;
++ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_worker);
++ schedule_work(&smc->tcp_listen_work);
+
+ out:
+ release_sock(sk);
+@@ -306,10 +665,11 @@ out:
+ static int smc_accept(struct socket *sock, struct socket *new_sock,
+ int flags)
+ {
+- struct smc_sock *new_smc;
+- struct sock *sk = sock->sk;
++ struct sock *sk = sock->sk, *nsk;
++ DECLARE_WAITQUEUE(wait, current);
+ struct smc_sock *lsmc;
+- int rc;
++ long timeo;
++ int rc = 0;
+
+ lsmc = smc_sk(sk);
+ lock_sock(sk);
+@@ -319,18 +679,31 @@ static int smc_accept(struct socket *soc
+ goto out;
+ }
+
+- rc = smc_clcsock_accept(lsmc, &new_smc);
++ /* Wait for an incoming connection */
++ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
++ add_wait_queue_exclusive(sk_sleep(sk), &wait);
++ while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
++ set_current_state(TASK_INTERRUPTIBLE);
++ if (!timeo) {
++ rc = -EAGAIN;
++ break;
++ }
++ release_sock(sk);
++ timeo = schedule_timeout(timeo);
++ /* wakeup by sk_data_ready in smc_listen_worker() */
++ sched_annotate_sleep();
++ lock_sock(sk);
++ if (signal_pending(current)) {
++ rc = sock_intr_errno(timeo);
++ break;
++ }
++ }
++ set_current_state(TASK_RUNNING);
++ remove_wait_queue(sk_sleep(sk), &wait);
++
+ if (rc)
+ goto out;
+- sock_graft(&new_smc->sk, new_sock);
+- new_smc->sk.sk_state = SMC_ACTIVE;
+-
+- smc_copy_sock_settings_to_smc(new_smc);
+-
+- /* always use TCP fallback as transport mechanism for now;
+- * This will change once RDMA transport is implemented
+- */
+- new_smc->use_fallback = 1;
++ rc = sock_error(nsk);
+
+ out:
+ release_sock(sk);
+@@ -390,29 +763,61 @@ out:
+ return rc;
+ }
+
++static unsigned int smc_accept_poll(struct sock *parent)
++{
++ struct smc_sock *isk;
++ struct sock *sk;
++
++ lock_sock(parent);
++ list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
++ sk = (struct sock *)isk;
++
++ if (sk->sk_state == SMC_ACTIVE) {
++ release_sock(parent);
++ return POLLIN | POLLRDNORM;
++ }
++ }
++ release_sock(parent);
++
++ return 0;
++}
++
+ static unsigned int smc_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+ {
+ struct sock *sk = sock->sk;
+ unsigned int mask = 0;
+ struct smc_sock *smc;
++ int rc;
+
+ smc = smc_sk(sock->sk);
+- if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) ||
+- smc->use_fallback) {
++ if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
++ /* delegate to CLC child sock */
+ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+ /* if non-blocking connect finished ... */
+ lock_sock(sk);
+ if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
+- sk->sk_state = SMC_ACTIVE;
+- /* always use TCP fallback as transport mechanism;
+- * This will change once RDMA transport is implemented
+- */
+- smc->use_fallback = 1;
++ sk->sk_err = smc->clcsock->sk->sk_err;
++ if (sk->sk_err) {
++ mask |= POLLERR;
++ } else {
++ rc = smc_connect_rdma(smc);
++ if (rc < 0)
++ mask |= POLLERR;
++ else
++ /* success cases including fallback */
++ mask |= POLLOUT | POLLWRNORM;
++ }
+ }
+ release_sock(sk);
+ } else {
+- mask = sock_no_poll(file, sock, wait);
++ sock_poll_wait(file, sk_sleep(sk), wait);
++ if (sk->sk_state == SMC_LISTEN)
++ /* woken up by sk_data_ready in smc_listen_worker() */
++ mask |= smc_accept_poll(sk);
++ if (sk->sk_err)
++ mask |= POLLERR;
++ /* for now - to be enhanced in follow-on patch */
+ }
+
+ return mask;
+@@ -579,6 +984,7 @@ static int smc_create(struct net *net, s
+
+ /* create internal TCP socket for CLC handshake and fallback */
+ smc = smc_sk(sk);
++ smc->use_fallback = 0; /* assume rdma capability first */
+ rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
+ IPPROTO_TCP, &smc->clcsock);
+ if (rc)
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -27,7 +27,14 @@ enum smc_state { /* possible states of
+ struct smc_sock { /* smc sock container */
+ struct sock sk;
+ struct socket *clcsock; /* internal tcp socket */
+- u8 use_fallback : 1; /* fallback to tcp */
++ struct sockaddr *addr; /* inet connect address */
++ struct smc_sock *listen_smc; /* listen parent */
++ struct work_struct tcp_listen_work;/* handle tcp socket accepts */
++ struct work_struct smc_listen_work;/* prepare new accept socket */
++ struct list_head accept_q; /* sockets to be accepted */
++ spinlock_t accept_q_lock; /* protects accept_q */
++ u8 use_fallback : 1, /* fallback to tcp */
++ clc_started : 1;/* smc_connect_rdma ran */
+ };
+
+ static inline struct smc_sock *smc_sk(const struct sock *sk)
+@@ -39,4 +46,37 @@ static inline struct smc_sock *smc_sk(co
+
+ extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
++/* convert a u32 value into network byte order, store it into a 3 byte field */
++static inline void hton24(u8 *net, u32 host)
++{
++ __be32 t;
++
++ t = cpu_to_be32(host);
++ memcpy(net, ((u8 *)&t) + 1, 3);
++}
++
++/* convert a received 3 byte field into host byte order */
++static inline u32 ntoh24(u8 *net)
++{
++ __be32 t = 0;
++
++ memcpy(((u8 *)&t) + 1, net, 3);
++ return be32_to_cpu(t);
++}
++
++#ifdef CONFIG_XFRM
++static inline bool using_ipsec(struct smc_sock *smc)
++{
++ return (smc->clcsock->sk->sk_policy[0] ||
++ smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
++}
++#else
++static inline bool using_ipsec(struct smc_sock *smc)
++{
++ return 0;
++}
++#endif
++
++int smc_netinfo_by_tcpsk(struct socket *, __be32 *, u8 *);
++
+ #endif /* _SMC_H */
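
The 3-byte helpers hton24()/ntoh24() added to smc.h carry QP numbers and packet sequence numbers, which the wire format limits to 24 bits. A stand-alone user-space round-trip check of the same logic (htonl()/ntohl() standing in for cpu_to_be32()/be32_to_cpu()):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* same logic as the smc.h helpers: only the three low-order bytes of the
 * 32-bit value travel on the wire
 */
static void hton24(uint8_t *net, uint32_t host)
{
	uint32_t t = htonl(host);

	memcpy(net, ((uint8_t *)&t) + 1, 3);
}

static uint32_t ntoh24(const uint8_t *net)
{
	uint32_t t = 0;

	memcpy(((uint8_t *)&t) + 1, net, 3);
	return ntohl(t);
}

int main(void)
{
	uint8_t wire[3];
	uint32_t qpn = 0x00abcdef;	/* QP numbers fit in 24 bits */

	hton24(wire, qpn);
	printf("wire: %02x %02x %02x, back: 0x%06x\n",
	       wire[0], wire[1], wire[2], (unsigned int)ntoh24(wire));
	return 0;
}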
+--- /dev/null
++++ b/net/smc/smc_clc.c
+@@ -0,0 +1,248 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * CLC (connection layer control) handshake over initial TCP socket to
++ * prepare for RDMA traffic
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
++ */
++
++#include <net/sock.h>
++#include <net/tcp.h>
++
++#include "smc.h"
++#include "smc_clc.h"
++#include "smc_ib.h"
++
++/* Wait for data on the tcp-socket, analyze received data
++ * Returns:
++ * 0 if success and it was not a decline that we received.
++ * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
++ * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
++ */
++int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
++ u8 expected_type)
++{
++ struct smc_clc_msg_hdr *clcm = buf;
++ struct sock *clc_sk = smc->clcsock->sk;
++ struct msghdr msg = {NULL, 0};
++ int reason_code = 0;
++ DEFINE_WAIT(wait);
++ struct kvec vec;
++ int len, datlen;
++ int krflags;
++
++ /* peek the first few bytes to determine length of data to receive
++ * so we don't consume any subsequent CLC message or payload data
++ * in the TCP byte stream
++ */
++ vec.iov_base = buf;
++ vec.iov_len = buflen;
++ krflags = MSG_PEEK | MSG_WAITALL;
++ smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
++ len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1,
++ sizeof(struct smc_clc_msg_hdr), krflags);
++ if (signal_pending(current)) {
++ reason_code = -EINTR;
++ clc_sk->sk_err = -reason_code;
++ smc->sk.sk_err = clc_sk->sk_err;
++ goto out;
++ }
++ if (clc_sk->sk_err) {
++ reason_code = -clc_sk->sk_err;
++ smc->sk.sk_err = clc_sk->sk_err;
++ goto out;
++ }
++ if (!len) { /* peer has performed orderly shutdown */
++ smc->sk.sk_err = ECONNRESET;
++ reason_code = -ECONNRESET;
++ goto out;
++ }
++ datlen = ntohs(clcm->length);
++ if ((len < sizeof(struct smc_clc_msg_hdr)) ||
++ (datlen < sizeof(struct smc_clc_msg_decline)) ||
++ memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
++ ((clcm->type != SMC_CLC_DECLINE) &&
++ (clcm->type != expected_type))) {
++ smc->sk.sk_err = EPROTO;
++ reason_code = -EPROTO;
++ goto out;
++ }
++
++ /* receive the complete CLC message */
++ vec.iov_base = buf;
++ vec.iov_len = buflen;
++ memset(&msg, 0, sizeof(struct msghdr));
++ krflags = MSG_WAITALL;
++ smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
++ len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
++ if (len < datlen) {
++ smc->sk.sk_err = EPROTO;
++ reason_code = -EPROTO;
++ goto out;
++ }
++ if (clcm->type == SMC_CLC_DECLINE)
++ reason_code = SMC_CLC_DECL_REPLY;
++out:
++ return reason_code;
++}
++
++/* send CLC DECLINE message across internal TCP socket */
++int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
++ u8 out_of_sync)
++{
++ struct smc_clc_msg_decline dclc;
++ struct msghdr msg;
++ struct kvec vec;
++ int len;
++
++ memset(&dclc, 0, sizeof(dclc));
++ memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
++ dclc.hdr.type = SMC_CLC_DECLINE;
++ dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
++ dclc.hdr.version = SMC_CLC_V1;
++ dclc.hdr.flag = out_of_sync ? 1 : 0;
++ memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
++ dclc.peer_diagnosis = htonl(peer_diag_info);
++ memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
++
++ memset(&msg, 0, sizeof(msg));
++ vec.iov_base = &dclc;
++ vec.iov_len = sizeof(struct smc_clc_msg_decline);
++ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
++ sizeof(struct smc_clc_msg_decline));
++ if (len < sizeof(struct smc_clc_msg_decline))
++ smc->sk.sk_err = EPROTO;
++ if (len < 0)
++ smc->sk.sk_err = -len;
++ return len;
++}
++
++/* send CLC PROPOSAL message across internal TCP socket */
++int smc_clc_send_proposal(struct smc_sock *smc,
++ struct smc_ib_device *smcibdev,
++ u8 ibport)
++{
++ struct smc_clc_msg_proposal pclc;
++ int reason_code = 0;
++ struct msghdr msg;
++ struct kvec vec;
++ int len, rc;
++
++ /* send SMC Proposal CLC message */
++ memset(&pclc, 0, sizeof(pclc));
++ memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
++ pclc.hdr.type = SMC_CLC_PROPOSAL;
++ pclc.hdr.length = htons(sizeof(pclc));
++ pclc.hdr.version = SMC_CLC_V1; /* SMC version */
++ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
++ memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
++ memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1],
++ sizeof(smcibdev->mac[ibport - 1]));
++
++ /* determine subnet and mask from internal TCP socket */
++ rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
++ &pclc.prefix_len);
++ if (rc)
++ return SMC_CLC_DECL_CNFERR; /* configuration error */
++ memcpy(&pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
++ memset(&msg, 0, sizeof(msg));
++ vec.iov_base = &pclc;
++ vec.iov_len = sizeof(pclc);
++ /* due to the few bytes needed for clc-handshake this cannot block */
++ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
++ if (len < sizeof(pclc)) {
++ if (len >= 0) {
++ reason_code = -ENETUNREACH;
++ smc->sk.sk_err = -reason_code;
++ } else {
++ smc->sk.sk_err = smc->clcsock->sk->sk_err;
++ reason_code = -smc->sk.sk_err;
++ }
++ }
++
++ return reason_code;
++}
++
++/* send CLC CONFIRM message across internal TCP socket */
++int smc_clc_send_confirm(struct smc_sock *smc)
++{
++ struct smc_clc_msg_accept_confirm cclc;
++ int reason_code = 0;
++ struct msghdr msg;
++ struct kvec vec;
++ int len;
++
++ /* send SMC Confirm CLC msg */
++ memset(&cclc, 0, sizeof(cclc));
++ memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
++ cclc.hdr.type = SMC_CLC_CONFIRM;
++ cclc.hdr.length = htons(sizeof(cclc));
++ cclc.hdr.version = SMC_CLC_V1; /* SMC version */
++ memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
++
++ /* tbd in follow-on patch: fill in link-related values */
++
++ /* tbd in follow-on patch: fill in rmb-related values */
++
++ cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
++
++ memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
++
++ memset(&msg, 0, sizeof(msg));
++ vec.iov_base = &cclc;
++ vec.iov_len = sizeof(cclc);
++ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
++ if (len < sizeof(cclc)) {
++ if (len >= 0) {
++ reason_code = -ENETUNREACH;
++ smc->sk.sk_err = -reason_code;
++ } else {
++ smc->sk.sk_err = smc->clcsock->sk->sk_err;
++ reason_code = -smc->sk.sk_err;
++ }
++ }
++ return reason_code;
++}
++
++/* send CLC ACCEPT message across internal TCP socket */
++int smc_clc_send_accept(struct smc_sock *new_smc)
++{
++ struct smc_clc_msg_accept_confirm aclc;
++ struct msghdr msg;
++ struct kvec vec;
++ int rc = 0;
++ int len;
++
++ memset(&aclc, 0, sizeof(aclc));
++ memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
++ aclc.hdr.type = SMC_CLC_ACCEPT;
++ aclc.hdr.length = htons(sizeof(aclc));
++ aclc.hdr.version = SMC_CLC_V1; /* SMC version */
++ memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
++
++ /* tbd in follow-on patch: fill in link-related values */
++
++ /* tbd in follow-on patch: fill in rmb-related values */
++
++ aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
++ memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
++
++ memset(&msg, 0, sizeof(msg));
++ vec.iov_base = &aclc;
++ vec.iov_len = sizeof(aclc);
++ len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
++ if (len < sizeof(aclc)) {
++ if (len >= 0) {
++ rc = -EPROTO;
++ new_smc->sk.sk_err = -rc;
++ } else {
++ new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
++ rc = -new_smc->sk.sk_err;
++ }
++ }
++
++ return rc;
++}
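
The receive path above depends on a peek-first pattern: kernel_recvmsg() with MSG_PEEK | MSG_WAITALL reads the fixed-size header without consuming it, the advertised length is validated, and only then is exactly one message pulled from the stream, so bytes of a following CLC message or of payload data are never swallowed. The same pattern in user-space C, simplified to a hypothetical 2-byte big-endian length at offset 0 (the real header carries the length at offset 5, after eyecatcher and type):

#include <stdint.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>

/* read exactly one length-prefixed message; the length field counts the
 * whole message, as in CLC; returns the message length or -1 on error
 */
static ssize_t read_one_msg(int fd, void *buf, size_t buflen)
{
	uint16_t netlen;
	size_t datlen;
	ssize_t n;

	/* peek: the header stays queued in the TCP receive buffer */
	n = recv(fd, &netlen, sizeof(netlen), MSG_PEEK | MSG_WAITALL);
	if (n != sizeof(netlen))
		return -1;
	datlen = ntohs(netlen);
	if (datlen < sizeof(netlen) || datlen > buflen)
		return -1;	/* malformed, mirrors the -EPROTO checks */
	/* now consume exactly one message, not a byte more */
	n = recv(fd, buf, datlen, MSG_WAITALL);
	return n == (ssize_t)datlen ? n : -1;
}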
+--- /dev/null
++++ b/net/smc/smc_clc.h
+@@ -0,0 +1,111 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * CLC (connection layer control) handshake over initial TCP socket to
++ * prepare for RDMA traffic
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
++ */
++
++#ifndef _SMC_CLC_H
++#define _SMC_CLC_H
++
++#include <rdma/ib_verbs.h>
++
++#include "smc.h"
++
++#define SMC_CLC_PROPOSAL 0x01
++#define SMC_CLC_ACCEPT 0x02
++#define SMC_CLC_CONFIRM 0x03
++#define SMC_CLC_DECLINE 0x04
++
++/* eye catcher "SMCR" EBCDIC for CLC messages */
++static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
++
++#define SMC_CLC_V1 0x1 /* SMC version */
++#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
++#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
++#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
++#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
++#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */
++#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
++#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
++#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
++
++struct smc_clc_msg_hdr { /* header1 of clc messages */
++ u8 eyecatcher[4]; /* eye catcher */
++ u8 type; /* proposal / accept / confirm / decline */
++ __be16 length;
++#if defined(__BIG_ENDIAN_BITFIELD)
++ u8 version : 4,
++ flag : 1,
++ rsvd : 3;
++#elif defined(__LITTLE_ENDIAN_BITFIELD)
++ u8 rsvd : 3,
++ flag : 1,
++ version : 4;
++#endif
++} __packed;
++
++struct smc_clc_msg_trail { /* trailer of clc messages */
++ u8 eyecatcher[4];
++} __packed;
++
++struct smc_clc_msg_local { /* header2 of clc messages */
++ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
++ union ib_gid gid; /* gid of ib_device port */
++ u8 mac[6]; /* mac of ib_device port */
++} __packed;
++
++struct smc_clc_msg_proposal { /* clc proposal message */
++ struct smc_clc_msg_hdr hdr;
++ struct smc_clc_msg_local lcl;
++ __be16 iparea_offset; /* offset to IP address information area */
++ __be32 outgoing_subnet; /* subnet mask */
++ u8 prefix_len; /* number of significant bits in mask */
++ u8 reserved[2];
++ u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
++ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
++} __packed;
++
++struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
++ struct smc_clc_msg_hdr hdr;
++ struct smc_clc_msg_local lcl;
++ u8 qpn[3]; /* QP number */
++ __be32 rmb_rkey; /* RMB rkey */
++ u8 conn_idx; /* Connection index, which RMBE in RMB */
++ __be32 rmbe_alert_token;/* unique connection id */
++#if defined(__BIG_ENDIAN_BITFIELD)
++ u8 rmbe_size : 4, /* server's RMB buf size (compressed notation) */
++ qp_mtu : 4; /* QP mtu */
++#elif defined(__LITTLE_ENDIAN_BITFIELD)
++ u8 qp_mtu : 4,
++ rmbe_size : 4;
++#endif
++ u8 reserved;
++ __be64 rmb_dma_addr; /* RMB virtual address */
++ u8 reserved2;
++ u8 psn[3]; /* initial packet sequence number */
++ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
++} __packed;
++
++struct smc_clc_msg_decline { /* clc decline message */
++ struct smc_clc_msg_hdr hdr;
++ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
++ __be32 peer_diagnosis; /* diagnosis information */
++ u8 reserved2[4];
++ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
++} __packed;
++
++struct smc_sock;
++struct smc_ib_device;
++
++int smc_clc_wait_msg(struct smc_sock *, void *, int, u8);
++int smc_clc_send_decline(struct smc_sock *, u32, u8);
++int smc_clc_send_proposal(struct smc_sock *, struct smc_ib_device *, u8);
++int smc_clc_send_confirm(struct smc_sock *);
++int smc_clc_send_accept(struct smc_sock *);
++
++#endif
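
For a feel of the resulting wire format, here is a trimmed user-space reconstruction of the message that smc_clc_send_decline() assembles. The struct mirrors smc_clc_msg_decline with the version/flag bitfield collapsed into one byte; the 0x10 below assumes the big-endian nibble layout, and SMC_SYSTEMID_LEN is assumed to be 8 (its definition is outside this excerpt):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define SMC_SYSTEMID_LEN 8	/* assumed value, defined elsewhere */

static const char eye[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; /* "SMCR" EBCDIC */

struct clc_decline {			/* trimmed smc_clc_msg_decline */
	uint8_t  eyecatcher[4];
	uint8_t  type;			/* 0x04 = SMC_CLC_DECLINE */
	uint16_t length;		/* big endian, whole message */
	uint8_t  version_flag;		/* version nibble + flag bit */
	uint8_t  id_for_peer[SMC_SYSTEMID_LEN];
	uint32_t peer_diagnosis;
	uint8_t  reserved[4];
	uint8_t  trailer[4];
} __attribute__((packed));

int main(void)
{
	struct clc_decline d;

	memset(&d, 0, sizeof(d));
	memcpy(d.eyecatcher, eye, 4);
	d.type = 0x04;
	d.length = htons(sizeof(d));
	d.version_flag = 0x10;			/* SMC_CLC_V1 in high nibble */
	d.peer_diagnosis = htonl(0x03000000);	/* SMC_CLC_DECL_CNFERR */
	memcpy(d.trailer, eye, 4);
	printf("decline message: %zu bytes on the wire\n", sizeof(d));
	return 0;
}

All four message types share this shape: leading and trailing "SMCR" EBCDIC eyecatchers bracketing a type-specific body, which is exactly what smc_clc_wait_msg() verifies before trusting anything else.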
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-06.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-06.patch
new file mode 100644
index 0000000000..7a4d66de52
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-06.patch
@@ -0,0 +1,819 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: connection and link group creation
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: connection and link group creation
+
+ * create smc_connection for SMC-sockets
+ * determine suitable link group for a connection
+ * create a new link group if necessary
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/Makefile | 2
+ net/smc/af_smc.c | 105 +++++++++++++++++--
+ net/smc/smc.h | 17 +++
+ net/smc/smc_clc.c | 31 ++++-
+ net/smc/smc_clc.h | 2
+ net/smc/smc_core.c | 289 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_core.h | 104 +++++++++++++++++++
+ 7 files changed, 533 insertions(+), 17 deletions(-)
+
+--- a/net/smc/Makefile
++++ b/net/smc/Makefile
+@@ -1,2 +1,2 @@
+ obj-$(CONFIG_SMC) += smc.o
+-smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o
++smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -30,9 +30,19 @@
+
+ #include "smc.h"
+ #include "smc_clc.h"
++#include "smc_core.h"
+ #include "smc_ib.h"
+ #include "smc_pnet.h"
+
++static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
++ * creation
++ */
++
++struct smc_lgr_list smc_lgr_list = { /* established link groups */
++ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
++ .list = LIST_HEAD_INIT(smc_lgr_list.list),
++};
++
+ static void smc_tcp_listen_worker(struct work_struct *);
+
+ static void smc_set_keepalive(struct sock *sk, int val)
+@@ -81,6 +91,8 @@ out:
+
+ static void smc_destruct(struct sock *sk)
+ {
++ struct smc_sock *smc = smc_sk(sk);
++
+ if (sk->sk_state != SMC_CLOSED) {
+ pr_err("Attempt to release SMC socket in state %d %p\n",
+ sk->sk_state, sk);
+@@ -92,6 +104,7 @@ static void smc_destruct(struct sock *sk
+ }
+
+ sk->sk_state = SMC_DESTRUCT;
++ smc_conn_free(&smc->conn);
+
+ sk_refcnt_debug_dec(sk);
+ }
+@@ -114,6 +127,7 @@ static struct sock *smc_sock_alloc(struc
+ smc = smc_sk(sk);
+ smc->clcsock = NULL;
+ smc->use_fallback = 0;
++ memset(&smc->conn, 0, sizeof(smc->conn));
+ smc->addr = NULL;
+ smc->listen_smc = NULL;
+ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_worker);
+@@ -248,11 +262,31 @@ out:
+ return rc;
+ }
+
++static void smc_conn_save_peer_info(struct smc_sock *smc,
++ struct smc_clc_msg_accept_confirm *clc)
++{
++ smc->conn.peer_conn_idx = clc->conn_idx;
++}
++
++static void smc_link_save_peer_info(struct smc_link *link,
++ struct smc_clc_msg_accept_confirm *clc)
++{
++ link->peer_qpn = ntoh24(clc->qpn);
++ memcpy(link->peer_gid, clc->lcl.gid.raw, SMC_GID_SIZE);
++ memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
++ link->peer_psn = ntoh24(clc->psn);
++ link->peer_mtu = clc->qp_mtu;
++}
++
+ /* setup for RDMA connection of client */
+ static int smc_connect_rdma(struct smc_sock *smc)
+ {
++ struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
+ struct smc_clc_msg_accept_confirm aclc;
++ int local_contact = SMC_FIRST_CONTACT;
+ struct smc_ib_device *smcibdev;
++ struct smc_link *link;
++ u8 srv_first_contact;
+ int reason_code = 0;
+ int rc = 0;
+ u8 ibport;
+@@ -295,26 +329,43 @@ static int smc_connect_rdma(struct smc_s
+ if (reason_code > 0)
+ goto decline_rdma;
+
+- /* tbd in follow-on patch: more steps to set up RDMA communication,
+- * create connection, link group, link
+- */
+-
++ srv_first_contact = aclc.hdr.flag;
++ mutex_lock(&smc_create_lgr_pending);
++ local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
++ ibport, &aclc.lcl, srv_first_contact);
++ if (local_contact < 0) {
++ rc = local_contact;
++ if (rc == -ENOMEM)
++ reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
++ else if (rc == -ENOLINK)
++ reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
++ goto decline_rdma_unlock;
++ }
++ link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
++
++ smc_conn_save_peer_info(smc, &aclc);
++ if (local_contact == SMC_FIRST_CONTACT)
++ smc_link_save_peer_info(link, &aclc);
+ /* tbd in follow-on patch: more steps to set up RDMA communication,
+ * create rmbs, map rmbs, rtoken_handling, modify_qp
+ */
+
+ rc = smc_clc_send_confirm(smc);
+ if (rc)
+- goto out_err;
++ goto out_err_unlock;
+
+ /* tbd in follow-on patch: llc_confirm */
+
++ mutex_unlock(&smc_create_lgr_pending);
+ out_connected:
+ smc_copy_sock_settings_to_clc(smc);
+ smc->sk.sk_state = SMC_ACTIVE;
+
+- return rc;
++ return rc ? rc : local_contact;
+
++decline_rdma_unlock:
++ mutex_unlock(&smc_create_lgr_pending);
++ smc_conn_free(&smc->conn);
+ decline_rdma:
+ /* RDMA setup failed, switch back to TCP */
+ smc->use_fallback = 1;
+@@ -325,6 +376,9 @@ decline_rdma:
+ }
+ goto out_connected;
+
++out_err_unlock:
++ mutex_unlock(&smc_create_lgr_pending);
++ smc_conn_free(&smc->conn);
+ out_err:
+ return rc;
+ }
+@@ -490,10 +544,12 @@ static void smc_listen_worker(struct wor
+ struct socket *newclcsock = new_smc->clcsock;
+ struct smc_sock *lsmc = new_smc->listen_smc;
+ struct smc_clc_msg_accept_confirm cclc;
++ int local_contact = SMC_REUSE_CONTACT;
+ struct sock *newsmcsk = &new_smc->sk;
+ struct smc_clc_msg_proposal pclc;
+ struct smc_ib_device *smcibdev;
+ struct sockaddr_in peeraddr;
++ struct smc_link *link;
+ int reason_code = 0;
+ int rc = 0, len;
+ __be32 subnet;
+@@ -541,15 +597,30 @@ static void smc_listen_worker(struct wor
+ /* get address of the peer connected to the internal TCP socket */
+ kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
+
+- /* tbd in follow-on patch: more steps to set up RDMA communication,
+- * create connection, link_group, link
+- */
++ /* allocate connection / link group */
++ mutex_lock(&smc_create_lgr_pending);
++ local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
++ smcibdev, ibport, &pclc.lcl, 0);
++ if (local_contact == SMC_REUSE_CONTACT)
++ /* lock no longer needed, free it due to following
++ * smc_clc_wait_msg() call
++ */
++ mutex_unlock(&smc_create_lgr_pending);
++ if (local_contact < 0) {
++ rc = local_contact;
++ if (rc == -ENOMEM)
++ reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
++ else if (rc == -ENOLINK)
++ reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
++ goto decline_rdma;
++ }
++ link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+ /* tbd in follow-on patch: more steps to set up RDMA communication,
+ * create rmbs, map rmbs
+ */
+
+- rc = smc_clc_send_accept(new_smc);
++ rc = smc_clc_send_accept(new_smc, local_contact);
+ if (rc)
+ goto out_err;
+
+@@ -560,6 +631,9 @@ static void smc_listen_worker(struct wor
+ goto out_err;
+ if (reason_code > 0)
+ goto decline_rdma;
++ smc_conn_save_peer_info(new_smc, &cclc);
++ if (local_contact == SMC_FIRST_CONTACT)
++ smc_link_save_peer_info(link, &cclc);
+
+ /* tbd in follow-on patch: more steps to set up RDMA communication,
+ * rtoken_handling, modify_qp
+@@ -569,6 +643,8 @@ out_connected:
+ sk_refcnt_debug_inc(newsmcsk);
+ newsmcsk->sk_state = SMC_ACTIVE;
+ enqueue:
++ if (local_contact == SMC_FIRST_CONTACT)
++ mutex_unlock(&smc_create_lgr_pending);
+ lock_sock(&lsmc->sk);
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ smc_accept_enqueue(&lsmc->sk, newsmcsk);
+@@ -584,6 +660,7 @@ enqueue:
+
+ decline_rdma:
+ /* RDMA setup failed, switch back to TCP */
++ smc_conn_free(&new_smc->conn);
+ new_smc->use_fallback = 1;
+ if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+ rc = smc_clc_send_decline(new_smc, reason_code, 0);
+@@ -1039,6 +1116,14 @@ out_pnet:
+
+ static void __exit smc_exit(void)
+ {
++ struct smc_link_group *lgr, *lg;
++
++ spin_lock(&smc_lgr_list.lock);
++ list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
++ list_del_init(&lgr->list);
++ smc_lgr_free(lgr); /* free link group */
++ }
++ spin_unlock(&smc_lgr_list.lock);
+ smc_ib_unregister_client();
+ sock_unregister(PF_SMC);
+ proto_unregister(&smc_proto);
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -14,6 +14,8 @@
+ #include <linux/types.h>
+ #include <net/sock.h>
+
++#include "smc_ib.h"
++
+ #define SMCPROTO_SMC 0 /* SMC protocol */
+
+ enum smc_state { /* possible states of an SMC socket */
+@@ -24,9 +26,19 @@ enum smc_state { /* possible states of
+ SMC_DESTRUCT = 32
+ };
+
++struct smc_link_group;
++
++struct smc_connection {
++ struct rb_node alert_node;
++ struct smc_link_group *lgr; /* link group of connection */
++ u32 alert_token_local; /* unique conn. id */
++ u8 peer_conn_idx; /* from tcp handshake */
++};
++
+ struct smc_sock { /* smc sock container */
+ struct sock sk;
+ struct socket *clcsock; /* internal tcp socket */
++ struct smc_connection conn; /* smc connection */
+ struct sockaddr *addr; /* inet connect address */
+ struct smc_sock *listen_smc; /* listen parent */
+ struct work_struct tcp_listen_work;/* handle tcp socket accepts */
+@@ -77,6 +89,11 @@ static inline bool using_ipsec(struct sm
+ }
+ #endif
+
++struct smc_clc_msg_local;
++
+ int smc_netinfo_by_tcpsk(struct socket *, __be32 *, u8 *);
++void smc_conn_free(struct smc_connection *);
++int smc_conn_create(struct smc_sock *, __be32, struct smc_ib_device *, u8,
++ struct smc_clc_msg_local *, int);
+
+ #endif /* _SMC_H */
+--- a/net/smc/smc_clc.c
++++ b/net/smc/smc_clc.c
+@@ -13,6 +13,7 @@
+ #include <net/tcp.h>
+
+ #include "smc.h"
++#include "smc_core.h"
+ #include "smc_clc.h"
+ #include "smc_ib.h"
+
+@@ -169,12 +170,15 @@ int smc_clc_send_proposal(struct smc_soc
+ /* send CLC CONFIRM message across internal TCP socket */
+ int smc_clc_send_confirm(struct smc_sock *smc)
+ {
++ struct smc_connection *conn = &smc->conn;
+ struct smc_clc_msg_accept_confirm cclc;
++ struct smc_link *link;
+ int reason_code = 0;
+ struct msghdr msg;
+ struct kvec vec;
+ int len;
+
++ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ /* send SMC Confirm CLC msg */
+ memset(&cclc, 0, sizeof(cclc));
+ memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+@@ -182,12 +186,18 @@ int smc_clc_send_confirm(struct smc_sock
+ cclc.hdr.length = htons(sizeof(cclc));
+ cclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+-
+- /* tbd in follow-on patch: fill in link-related values */
++ memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
++ SMC_GID_SIZE);
++ memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
++ sizeof(link->smcibdev->mac[link->ibport - 1]));
+
+ /* tbd in follow-on patch: fill in rmb-related values */
+
++ hton24(cclc.qpn, link->roce_qp->qp_num);
+ cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
++ cclc.rmbe_alert_token = htonl(conn->alert_token_local);
++ cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
++ hton24(cclc.psn, link->psn_initial);
+
+ memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+@@ -208,26 +218,37 @@ int smc_clc_send_confirm(struct smc_sock
+ }
+
+ /* send CLC ACCEPT message across internal TCP socket */
+-int smc_clc_send_accept(struct smc_sock *new_smc)
++int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
+ {
++ struct smc_connection *conn = &new_smc->conn;
+ struct smc_clc_msg_accept_confirm aclc;
++ struct smc_link *link;
+ struct msghdr msg;
+ struct kvec vec;
+ int rc = 0;
+ int len;
+
++ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ memset(&aclc, 0, sizeof(aclc));
+ memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ aclc.hdr.type = SMC_CLC_ACCEPT;
+ aclc.hdr.length = htons(sizeof(aclc));
+ aclc.hdr.version = SMC_CLC_V1; /* SMC version */
++ if (srv_first_contact)
++ aclc.hdr.flag = 1;
+ memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+-
+- /* tbd in follow-on patch: fill in link-related values */
++ memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
++ SMC_GID_SIZE);
++ memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
++ sizeof(link->smcibdev->mac[link->ibport - 1]));
+
+ /* tbd in follow-on patch: fill in rmb-related values */
+
++ hton24(aclc.qpn, link->roce_qp->qp_num);
+ aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
++ aclc.rmbe_alert_token = htonl(conn->alert_token_local);
++ aclc.qp_mtu = link->path_mtu;
++ hton24(aclc.psn, link->psn_initial);
+ memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+--- a/net/smc/smc_clc.h
++++ b/net/smc/smc_clc.h
+@@ -106,6 +106,6 @@ int smc_clc_wait_msg(struct smc_sock *,
+ int smc_clc_send_decline(struct smc_sock *, u32, u8);
+ int smc_clc_send_proposal(struct smc_sock *, struct smc_ib_device *, u8);
+ int smc_clc_send_confirm(struct smc_sock *);
+-int smc_clc_send_accept(struct smc_sock *);
++int smc_clc_send_accept(struct smc_sock *, int);
+
+ #endif
+--- /dev/null
++++ b/net/smc/smc_core.c
+@@ -0,0 +1,289 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Basic Transport Functions exploiting Infiniband API
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
++ */
++
++#include <linux/socket.h>
++#include <linux/if_vlan.h>
++#include <linux/random.h>
++#include <net/tcp.h>
++#include <net/sock.h>
++#include <rdma/ib_verbs.h>
++
++#include "smc.h"
++#include "smc_clc.h"
++#include "smc_core.h"
++#include "smc_ib.h"
++
++/* Register connection's alert token in our lookup structure.
++ * To use rbtrees we have to implement our own insert core.
++ * Requires @conns_lock
++ * @conn connection to register
++ * The insert itself cannot fail, hence there is no return value.
++ */
++static void smc_lgr_add_alert_token(struct smc_connection *conn)
++{
++ struct rb_node **link, *parent = NULL;
++ u32 token = conn->alert_token_local;
++
++ link = &conn->lgr->conns_all.rb_node;
++ while (*link) {
++ struct smc_connection *cur = rb_entry(*link,
++ struct smc_connection, alert_node);
++
++ parent = *link;
++ if (cur->alert_token_local > token)
++ link = &parent->rb_left;
++ else
++ link = &parent->rb_right;
++ }
++ /* Put the new node there */
++ rb_link_node(&conn->alert_node, parent, link);
++ rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
++}
++
++/* Register connection in link group by assigning an alert token
++ * registered in a search tree.
++ * Requires @conns_lock
++ * Note that '0' is a reserved value and not assigned.
++ */
++static void smc_lgr_register_conn(struct smc_connection *conn)
++{
++ static atomic_t nexttoken = ATOMIC_INIT(0);
++
++ /* find a new alert_token_local value not yet used by some connection
++ * in this link group
++ */
++ while (!conn->alert_token_local) {
++ conn->alert_token_local = atomic_inc_return(&nexttoken);
++ if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
++ conn->alert_token_local = 0;
++ }
++ smc_lgr_add_alert_token(conn);
++ conn->lgr->conns_num++;
++}
++
++/* Unregister connection and reset the alert token of the given connection
++ */
++static void smc_lgr_unregister_conn(struct smc_connection *conn)
++{
++ struct smc_link_group *lgr = conn->lgr;
++
++ write_lock_bh(&lgr->conns_lock);
++ rb_erase(&conn->alert_node, &lgr->conns_all);
++ lgr->conns_num--;
++ write_unlock_bh(&lgr->conns_lock);
++}
++
++/* create a new SMC link group */
++static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
++ struct smc_ib_device *smcibdev, u8 ibport,
++ char *peer_systemid, unsigned short vlan_id)
++{
++ struct smc_link_group *lgr;
++ struct smc_link *lnk;
++ u8 rndvec[3];
++ int rc = 0;
++
++ lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
++ if (!lgr) {
++ rc = -ENOMEM;
++ goto out;
++ }
++ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
++ lgr->daddr = peer_in_addr;
++ memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
++ lgr->vlan_id = vlan_id;
++
++ lnk = &lgr->lnk[SMC_SINGLE_LINK];
++ /* initialize link */
++ lnk->smcibdev = smcibdev;
++ lnk->ibport = ibport;
++ lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
++ get_random_bytes(rndvec, sizeof(rndvec));
++ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
++
++ smc->conn.lgr = lgr;
++ rwlock_init(&lgr->conns_lock);
++ spin_lock(&smc_lgr_list.lock);
++ list_add_tail(&lgr->list, &smc_lgr_list.list);
++ spin_unlock(&smc_lgr_list.lock);
++out:
++ return rc;
++}
++
++/* remove a finished connection from its link group */
++void smc_conn_free(struct smc_connection *conn)
++{
++ struct smc_link_group *lgr = conn->lgr;
++
++ if (!lgr)
++ return;
++ smc_lgr_unregister_conn(conn);
++ conn->lgr = NULL;
++}
++
++static void smc_link_clear(struct smc_link_group *lgr)
++{
++ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
++
++ lnk->peer_qpn = 0;
++}
++
++/* remove a link group */
++void smc_lgr_free(struct smc_link_group *lgr)
++{
++ smc_link_clear(lgr);
++ kfree(lgr);
++}
++
++/* Client checks if creation / reuse of a link group happens
++ * synchronously with Server.
++ * Returns true iff client and server disagree.
++ */
++static bool smc_lgr_clt_srv_disagree(struct smc_connection *conn, int new_lgr,
++ int srv_first_contact)
++{
++ if (!srv_first_contact && new_lgr) {
++ /* Server reuses a link group, but Client wants to start
++ * a new one
++ */
++ return true;
++ }
++ if (srv_first_contact && !new_lgr) {
++ /* Server starts a new link group, but Client wants to reuse
++ * an existing link group
++ */
++ spin_lock(&smc_lgr_list.lock);
++ list_del_init(&conn->lgr->list);
++ spin_unlock(&smc_lgr_list.lock);
++ smc_lgr_unregister_conn(conn); /* takes conns_lock */
++ /* tbd: terminate existing connections */
++ return true;
++ }
++ return false;
++}
++
++/* Determine vlan of internal TCP socket.
++ * @vlan_id: address to store the determined vlan id into
++ */
++static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
++{
++ struct dst_entry *dst = sk_dst_get(clcsock->sk);
++ int rc = 0;
++
++ *vlan_id = 0;
++ if (!dst) {
++ rc = -ENOTCONN;
++ goto out;
++ }
++ if (!dst->dev) {
++ rc = -ENODEV;
++ goto out_rel;
++ }
++
++ if (is_vlan_dev(dst->dev))
++ *vlan_id = vlan_dev_vlan_id(dst->dev);
++
++out_rel:
++ dst_release(dst);
++out:
++ return rc;
++}
++
++/* determine the link gid matching the vlan id of the link group */
++static int smc_link_determine_gid(struct smc_link_group *lgr)
++{
++ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
++ struct ib_gid_attr gattr;
++ union ib_gid gid;
++ int i;
++
++ if (!lgr->vlan_id) {
++ lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
++ return 0;
++ }
++
++ for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
++ i++) {
++ ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
++ &gattr);
++ if (gattr.ndev &&
++ (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
++ lnk->gid = gid;
++ return 0;
++ }
++ }
++ return -ENODEV;
++}
++
++/* create a new SMC connection (and a new link group if necessary) */
++int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
++ struct smc_ib_device *smcibdev, u8 ibport,
++ struct smc_clc_msg_local *lcl, int srv_first_contact)
++{
++ struct smc_connection *conn = &smc->conn;
++ struct smc_link_group *lgr;
++ unsigned short vlan_id;
++ enum smc_lgr_role role;
++ int local_contact = SMC_FIRST_CONTACT;
++ int rc = 0;
++
++ /* wait while another process is creating a connection, until
++ * either this process can reuse an existing link group or it
++ * has finished creating a new link group
++ */
++ role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
++ rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
++ if (rc)
++ return rc;
++
++ /* determine if an existing link group can be reused */
++ spin_lock(&smc_lgr_list.lock);
++ list_for_each_entry(lgr, &smc_lgr_list.list, list) {
++ write_lock_bh(&lgr->conns_lock);
++ if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
++ SMC_SYSTEMID_LEN) &&
++ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
++ SMC_GID_SIZE) &&
++ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
++ sizeof(lcl->mac)) &&
++ (lgr->role == role) &&
++ (lgr->vlan_id == vlan_id)) {
++ /* link group found */
++ local_contact = SMC_REUSE_CONTACT;
++ conn->lgr = lgr;
++ smc_lgr_register_conn(conn); /* add smc conn to lgr */
++ write_unlock_bh(&lgr->conns_lock);
++ break;
++ }
++ write_unlock_bh(&lgr->conns_lock);
++ }
++ spin_unlock(&smc_lgr_list.lock);
++
++ if (role == SMC_CLNT) {
++ if (smc_lgr_clt_srv_disagree(conn, local_contact,
++ srv_first_contact)) {
++ /* send out_of_sync decline, reason synchr. error */
++ smc->sk.sk_err = ENOLINK;
++ return -ENOLINK;
++ }
++ }
++
++ if (local_contact == SMC_FIRST_CONTACT) {
++ rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
++ lcl->id_for_peer, vlan_id);
++ if (rc)
++ goto out;
++ smc_lgr_register_conn(conn); /* add smc conn to lgr */
++ rc = smc_link_determine_gid(conn->lgr);
++ }
++
++out:
++ return rc ? rc : local_contact;
++}
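
smc_conn_create() folds two results into one int: a negative errno on failure, otherwise SMC_FIRST_CONTACT (1) or SMC_REUSE_CONTACT (0), which the af_smc.c callers use to decide when to drop the smc_create_lgr_pending mutex. The agreement test on the client path is effectively an XOR; a minimal user-space rendering of smc_lgr_clt_srv_disagree()'s decision (the cleanup of a stale link group on disagreement is left out):

#include <stdio.h>
#include <stdbool.h>

/* the handshake proceeds only when both sides agree on first contact
 * versus link group reuse
 */
static bool disagree(bool client_first_contact, bool srv_first_contact)
{
	return client_first_contact != srv_first_contact;
}

int main(void)
{
	for (int c = 0; c <= 1; c++)
		for (int s = 0; s <= 1; s++)
			printf("client %s, server %s -> %s\n",
			       c ? "new lgr" : "reuse",
			       s ? "new lgr" : "reuse",
			       disagree(c, s) ? "decline (sync error)" : "ok");
	return 0;
}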
+--- /dev/null
++++ b/net/smc/smc_core.h
+@@ -0,0 +1,104 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Definitions for SMC Connections, Link Groups and Links
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
++ */
++
++#ifndef _SMC_CORE_H
++#define _SMC_CORE_H
++
++#include <rdma/ib_verbs.h>
++
++#include "smc.h"
++#include "smc_ib.h"
++
++struct smc_lgr_list { /* list of link groups */
++ struct list_head list;
++ spinlock_t lock; /* protects list of link groups */
++ u32 conn_pending; /* delay 2nd connection creation */
++};
++
++extern struct smc_lgr_list smc_lgr_list; /* list of link groups */
++
++enum smc_lgr_role { /* possible roles of a link group */
++ SMC_CLNT, /* client */
++ SMC_SERV /* server */
++};
++
++struct smc_link {
++ struct smc_ib_device *smcibdev; /* ib-device */
++ u8 ibport; /* port - values 1 | 2 */
++ struct ib_qp *roce_qp; /* IB queue pair */
++ struct ib_qp_attr qp_attr; /* IB queue pair attributes */
++ union ib_gid gid; /* gid matching used vlan id */
++ u32 peer_qpn; /* QP number of peer */
++ enum ib_mtu path_mtu; /* used mtu */
++ enum ib_mtu peer_mtu; /* mtu size of peer */
++ u32 psn_initial; /* QP tx initial packet seqno */
++ u32 peer_psn; /* QP rx initial packet seqno */
++ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
++ u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer */
++};
++
++/* For now we just allow one parallel link per link group. The SMC protocol
++ * allows more (up to 8).
++ */
++#define SMC_LINKS_PER_LGR_MAX 1
++#define SMC_SINGLE_LINK 0
++
++#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
++#define SMC_REUSE_CONTACT	0	/* follow-on contact to a peer */
++
++struct smc_link_group {
++ struct list_head list;
++ enum smc_lgr_role role; /* client or server */
++ __be32 daddr; /* destination ip address */
++ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */
++ char peer_systemid[SMC_SYSTEMID_LEN];
++ /* unique system_id of peer */
++ struct rb_root conns_all; /* connection tree */
++ rwlock_t conns_lock; /* protects conns_all */
++ unsigned int conns_num; /* current # of connections */
++ unsigned short vlan_id; /* vlan id of link group */
++};
++
++/* Find the connection associated with the given alert token in the link group.
++ * To use rbtrees we have to implement our own search core.
++ * Requires @conns_lock
++ * @token alert token to search for
++ * @lgr link group to search in
++ * Returns connection associated with token if found, NULL otherwise.
++ */
++static inline struct smc_connection *smc_lgr_find_conn(
++ u32 token, struct smc_link_group *lgr)
++{
++ struct smc_connection *res = NULL;
++ struct rb_node *node;
++
++ node = lgr->conns_all.rb_node;
++ while (node) {
++ struct smc_connection *cur = rb_entry(node,
++ struct smc_connection, alert_node);
++
++ if (cur->alert_token_local > token) {
++ node = node->rb_left;
++ } else {
++ if (cur->alert_token_local < token) {
++ node = node->rb_right;
++ } else {
++ res = cur;
++ break;
++ }
++ }
++ }
++
++ return res;
++}
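++
++/* Editor's sketch, not part of the patch: how a later consumer is
++ * expected to resolve an alert token received from the peer; the
++ * demo_ name is hypothetical, only the locking pattern matters.
++ */
++static inline struct smc_connection *demo_find_conn(
++	u32 token, struct smc_link_group *lgr)
++{
++	struct smc_connection *conn;
++
++	read_lock_bh(&lgr->conns_lock);	/* satisfies "Requires @conns_lock" */
++	conn = smc_lgr_find_conn(token, lgr);
++	read_unlock_bh(&lgr->conns_lock);
++	return conn;
++}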
++
++void smc_lgr_free(struct smc_link_group *);
++
++#endif
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-07.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-07.patch
new file mode 100644
index 0000000000..5232c0a68a
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-07.patch
@@ -0,0 +1,497 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: remote memory buffers (RMBs)
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: remote memory buffers (RMBs)
+
+ * allocate data RMB memory for sending and receiving
+ * size depends on the maximum socket send and receive buffers
+ * allocated RMBs are kept during life time of the owning link group
+ * map the allocated RMBs to DMA
+
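+
+	Editor's illustration, not part of the patch: RMB sizes are carried
+	in a compressed one-byte notation where code c stands for an RMB of
+	2^(c + 14) bytes (16K, 32K, ...). The user-space demo below mirrors
+	smc_compress_bufsize()/smc_uncompress_bufsize() from net/smc/smc.h;
+	ilog2() is open-coded here because it is kernel-only.
+
+		#include <stdio.h>
+
+		#define SMC_RMB_SIZES 16	/* distinct RMB size codes */
+
+		static unsigned char compress_bufsize(int size)
+		{
+			unsigned char compressed = 0;
+
+			if (size <= 16384)	/* minimum RMB is 16K */
+				return 0;
+			size = (size - 1) >> 14; /* 16K units, rounded up */
+			while (size >>= 1)	/* open-coded ilog2() */
+				compressed++;
+			compressed++;
+			if (compressed >= SMC_RMB_SIZES)
+				compressed = SMC_RMB_SIZES - 1;
+			return compressed;
+		}
+
+		int main(void)
+		{
+			int sizes[] = { 8192, 16384, 16385, 65536, 262144 };
+			unsigned int i;
+
+			for (i = 0; i < 5; i++) {
+				unsigned char c = compress_bufsize(sizes[i]);
+
+				/* uncompress is simply 1 << (c + 14) */
+				printf("%7d -> code %2u -> %7d\n",
+				       sizes[i], c, 1 << (c + 14));
+			}
+			return 0;
+		}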
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/af_smc.c | 29 +++++++-
+ net/smc/smc.h | 33 +++++++++
+ net/smc/smc_clc.c | 6 -
+ net/smc/smc_core.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_core.h | 21 +++++
+ net/smc/smc_ib.c | 19 +++++
+ net/smc/smc_ib.h | 5 +
+ 7 files changed, 295 insertions(+), 6 deletions(-)
+
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -266,6 +266,8 @@ static void smc_conn_save_peer_info(stru
+ struct smc_clc_msg_accept_confirm *clc)
+ {
+ smc->conn.peer_conn_idx = clc->conn_idx;
++ smc->conn.peer_rmbe_len = smc_uncompress_bufsize(clc->rmbe_size);
++ atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_len);
+ }
+
+ static void smc_link_save_peer_info(struct smc_link *link,
+@@ -344,6 +346,18 @@ static int smc_connect_rdma(struct smc_s
+ link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+ smc_conn_save_peer_info(smc, &aclc);
++
++ rc = smc_sndbuf_create(smc);
++ if (rc) {
++ reason_code = SMC_CLC_DECL_MEM;
++ goto decline_rdma_unlock;
++ }
++ rc = smc_rmb_create(smc);
++ if (rc) {
++ reason_code = SMC_CLC_DECL_MEM;
++ goto decline_rdma_unlock;
++ }
++
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_link_save_peer_info(link, &aclc);
+	/* tbd in follow-on patch: more steps to setup RDMA communication,
+@@ -616,9 +630,16 @@ static void smc_listen_worker(struct wor
+ }
+ link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+-	/* tbd in follow-on patch: more steps to setup RDMA communication,
+- * create rmbs, map rmbs
+- */
++ rc = smc_sndbuf_create(new_smc);
++ if (rc) {
++ reason_code = SMC_CLC_DECL_MEM;
++ goto decline_rdma;
++ }
++ rc = smc_rmb_create(new_smc);
++ if (rc) {
++ reason_code = SMC_CLC_DECL_MEM;
++ goto decline_rdma;
++ }
+
+ rc = smc_clc_send_accept(new_smc, local_contact);
+ if (rc)
+@@ -1066,6 +1087,8 @@ static int smc_create(struct net *net, s
+ IPPROTO_TCP, &smc->clcsock);
+ if (rc)
+ sk_common_release(sk);
++ smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
++ smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
+
+ out:
+ return rc;
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -33,6 +33,16 @@ struct smc_connection {
+ struct smc_link_group *lgr; /* link group of connection */
+ u32 alert_token_local; /* unique conn. id */
+ u8 peer_conn_idx; /* from tcp handshake */
++ int peer_rmbe_len; /* size of peer rx buffer */
++ atomic_t peer_rmbe_space;/* remaining free bytes in peer
++ * rmbe
++ */
++
++ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
++ int sndbuf_size; /* sndbuf size <== sock wmem */
++ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
++ int rmbe_size; /* RMBE size <== sock rmem */
++ int rmbe_size_short;/* compressed notation */
+ };
+
+ struct smc_sock { /* smc sock container */
+@@ -76,6 +86,29 @@ static inline u32 ntoh24(u8 *net)
+ return be32_to_cpu(t);
+ }
+
++#define SMC_RMB_SIZES	16	/* number of distinct sizes for an RMB */
++
++/* convert the RMB size into the compressed notation - minimum 16K */
++static inline u8 smc_compress_bufsize(int size)
++{
++	u8 compressed;
++
++	if (size <= 16384)	/* minimum RMB size, see SMC_BUF_MIN_SIZE */
++		return 0;	/* ilog2(0) below would be undefined */
++	size = (size - 1) >> 14;
++	compressed = ilog2(size) + 1;
++ if (compressed >= SMC_RMB_SIZES)
++ compressed = SMC_RMB_SIZES - 1;
++ return compressed;
++}
++
++/* convert the RMB size from compressed notation into integer */
++static inline int smc_uncompress_bufsize(u8 compressed)
++{
++ u32 size;
++
++ size = 0x00000001 << (((int)compressed) + 14);
++ return (int)size;
++}
++
+ #ifdef CONFIG_XFRM
+ static inline bool using_ipsec(struct smc_sock *smc)
+ {
+--- a/net/smc/smc_clc.c
++++ b/net/smc/smc_clc.c
+@@ -241,13 +241,13 @@ int smc_clc_send_accept(struct smc_sock
+ SMC_GID_SIZE);
+ memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
+ sizeof(link->smcibdev->mac[link->ibport - 1]));
+-
+- /* tbd in follow-on patch: fill in rmb-related values */
+-
+ hton24(aclc.qpn, link->roce_qp->qp_num);
+ aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
+ aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ aclc.qp_mtu = link->path_mtu;
++	aclc.rmbe_size = conn->rmbe_size_short;
++ aclc.rmb_dma_addr =
++ cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+ hton24(aclc.psn, link->psn_initial);
+ memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+--- a/net/smc/smc_core.c
++++ b/net/smc/smc_core.c
+@@ -89,6 +89,7 @@ static int smc_lgr_create(struct smc_soc
+ struct smc_link *lnk;
+ u8 rndvec[3];
+ int rc = 0;
++ int i;
+
+ lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
+ if (!lgr) {
+@@ -99,6 +100,12 @@ static int smc_lgr_create(struct smc_soc
+ lgr->daddr = peer_in_addr;
+ memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+ lgr->vlan_id = vlan_id;
++ rwlock_init(&lgr->sndbufs_lock);
++ rwlock_init(&lgr->rmbs_lock);
++ for (i = 0; i < SMC_RMB_SIZES; i++) {
++ INIT_LIST_HEAD(&lgr->sndbufs[i]);
++ INIT_LIST_HEAD(&lgr->rmbs[i]);
++ }
+
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ /* initialize link */
+@@ -117,6 +124,22 @@ out:
+ return rc;
+ }
+
++static void smc_sndbuf_free(struct smc_connection *conn)
++{
++ if (conn->sndbuf_desc) {
++ xchg(&conn->sndbuf_desc->used, 0);
++ conn->sndbuf_size = 0;
++ }
++}
++
++static void smc_rmb_free(struct smc_connection *conn)
++{
++ if (conn->rmb_desc) {
++ xchg(&conn->rmb_desc->used, 0);
++ conn->rmbe_size = 0;
++ }
++}
++
+ /* remove a finished connection from its link group */
+ void smc_conn_free(struct smc_connection *conn)
+ {
+@@ -126,6 +149,8 @@ void smc_conn_free(struct smc_connection
+ return;
+ smc_lgr_unregister_conn(conn);
+ conn->lgr = NULL;
++ smc_rmb_free(conn);
++ smc_sndbuf_free(conn);
+ }
+
+ static void smc_link_clear(struct smc_link_group *lgr)
+@@ -287,3 +312,166 @@ int smc_conn_create(struct smc_sock *smc
+ out:
+ return rc ? rc : local_contact;
+ }
++
++/* try to reuse a sndbuf description slot of the sndbufs list for a certain
++ * buf_size; if not available, return NULL
++ */
++static inline
++struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
++ int compressed_bufsize)
++{
++ struct smc_buf_desc *sndbuf_slot;
++
++ read_lock_bh(&lgr->sndbufs_lock);
++ list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
++ list) {
++ if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
++ read_unlock_bh(&lgr->sndbufs_lock);
++ return sndbuf_slot;
++ }
++ }
++ read_unlock_bh(&lgr->sndbufs_lock);
++ return NULL;
++}
++
++/* try to reuse an rmb description slot of the rmbs list for a certain
++ * rmbe_size; if not available, return NULL
++ */
++static inline
++struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
++ int compressed_bufsize)
++{
++ struct smc_buf_desc *rmb_slot;
++
++ read_lock_bh(&lgr->rmbs_lock);
++ list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
++ list) {
++ if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
++ read_unlock_bh(&lgr->rmbs_lock);
++ return rmb_slot;
++ }
++ }
++ read_unlock_bh(&lgr->rmbs_lock);
++ return NULL;
++}
++
++/* create the tx buffer for an SMC socket */
++int smc_sndbuf_create(struct smc_sock *smc)
++{
++ struct smc_connection *conn = &smc->conn;
++ struct smc_link_group *lgr = conn->lgr;
++ int tmp_bufsize, tmp_bufsize_short;
++ struct smc_buf_desc *sndbuf_desc;
++ int rc;
++
++ /* use socket send buffer size (w/o overhead) as start value */
++ for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
++ tmp_bufsize_short >= 0; tmp_bufsize_short--) {
++ tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
++ /* check for reusable sndbuf_slot in the link group */
++ sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
++ if (sndbuf_desc) {
++			memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
++ break; /* found reusable slot */
++ }
++ /* try to alloc a new send buffer */
++ sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
++ if (!sndbuf_desc)
++ break; /* give up with -ENOMEM */
++ sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
++ GFP_KERNEL | __GFP_NOWARN |
++ __GFP_NOMEMALLOC |
++ __GFP_NORETRY);
++ if (!sndbuf_desc->cpu_addr) {
++			kfree(sndbuf_desc);
++			sndbuf_desc = NULL;
++ /* if send buffer allocation has failed,
++ * try a smaller one
++ */
++ continue;
++ }
++ rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
++ tmp_bufsize, sndbuf_desc,
++ DMA_TO_DEVICE);
++ if (rc) {
++			kfree(sndbuf_desc->cpu_addr);
++			kfree(sndbuf_desc);
++			sndbuf_desc = NULL;
++ continue; /* if mapping failed, try smaller one */
++ }
++ sndbuf_desc->used = 1;
++ write_lock_bh(&lgr->sndbufs_lock);
++ list_add(&sndbuf_desc->list,
++ &lgr->sndbufs[tmp_bufsize_short]);
++ write_unlock_bh(&lgr->sndbufs_lock);
++ }
++ if (sndbuf_desc && sndbuf_desc->cpu_addr) {
++ conn->sndbuf_desc = sndbuf_desc;
++ conn->sndbuf_size = tmp_bufsize;
++ smc->sk.sk_sndbuf = tmp_bufsize * 2;
++ return 0;
++ } else {
++ return -ENOMEM;
++ }
++}
++
++/* create the RMB for an SMC socket (even though the SMC protocol
++ * allows more than one RMB-element per RMB, the Linux implementation
++ * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
++ * connection in a link group)
++ */
++int smc_rmb_create(struct smc_sock *smc)
++{
++ struct smc_connection *conn = &smc->conn;
++ struct smc_link_group *lgr = conn->lgr;
++ int tmp_bufsize, tmp_bufsize_short;
++ struct smc_buf_desc *rmb_desc;
++ int rc;
++
++ /* use socket recv buffer size (w/o overhead) as start value */
++ for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
++ tmp_bufsize_short >= 0; tmp_bufsize_short--) {
++ tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
++ /* check for reusable rmb_slot in the link group */
++ rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
++ if (rmb_desc) {
++			memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
++ break; /* found reusable slot */
++ }
++ /* try to alloc a new RMB */
++ rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
++ if (!rmb_desc)
++ break; /* give up with -ENOMEM */
++ rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
++ GFP_KERNEL | __GFP_NOWARN |
++ __GFP_NOMEMALLOC |
++ __GFP_NORETRY);
++ if (!rmb_desc->cpu_addr) {
++			kfree(rmb_desc);
++			rmb_desc = NULL;
++ /* if RMB allocation has failed,
++ * try a smaller one
++ */
++ continue;
++ }
++ rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
++ tmp_bufsize, rmb_desc,
++ DMA_FROM_DEVICE);
++ if (rc) {
++			kfree(rmb_desc->cpu_addr);
++			kfree(rmb_desc);
++			rmb_desc = NULL;
++ continue; /* if mapping failed, try smaller one */
++ }
++ rmb_desc->used = 1;
++ write_lock_bh(&lgr->rmbs_lock);
++ list_add(&rmb_desc->list,
++ &lgr->rmbs[tmp_bufsize_short]);
++ write_unlock_bh(&lgr->rmbs_lock);
++ }
++ if (rmb_desc && rmb_desc->cpu_addr) {
++ conn->rmb_desc = rmb_desc;
++ conn->rmbe_size = tmp_bufsize;
++ conn->rmbe_size_short = tmp_bufsize_short;
++ smc->sk.sk_rcvbuf = tmp_bufsize * 2;
++ return 0;
++ } else {
++ return -ENOMEM;
++ }
++}
+--- a/net/smc/smc_core.h
++++ b/net/smc/smc_core.h
+@@ -16,6 +16,9 @@
+ #include "smc.h"
+ #include "smc_ib.h"
+
++#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
++#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
++
+ struct smc_lgr_list { /* list of link group definition */
+ struct list_head list;
+ spinlock_t lock; /* protects list of link groups */
+@@ -53,6 +56,15 @@ struct smc_link {
+ #define SMC_FIRST_CONTACT 1 /* first contact to a peer */
+ #define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
+
++/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
++struct smc_buf_desc {
++ struct list_head list;
++ u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
++ /* mapped address of buffer */
++ void *cpu_addr; /* virtual address of buffer */
++ u32 used; /* currently used / unused */
++};
++
+ struct smc_link_group {
+ struct list_head list;
+ enum smc_lgr_role role; /* client or server */
+@@ -64,6 +76,11 @@ struct smc_link_group {
+ rwlock_t conns_lock; /* protects conns_all */
+ unsigned int conns_num; /* current # of connections */
+ unsigned short vlan_id; /* vlan id of link group */
++
++ struct list_head sndbufs[SMC_RMB_SIZES]; /* tx buffers */
++ rwlock_t sndbufs_lock; /* protects tx buffers */
++ struct list_head rmbs[SMC_RMB_SIZES]; /* rx buffers */
++ rwlock_t rmbs_lock; /* protects rx buffers */
+ };
+
+ /* Find the connection associated with the given alert token in the link group.
+@@ -99,6 +116,10 @@ static inline struct smc_connection *smc
+ return res;
+ }
+
++struct smc_clc_msg_accept_confirm;
++
+ void smc_lgr_free(struct smc_link_group *);
++int smc_sndbuf_create(struct smc_sock *);
++int smc_rmb_create(struct smc_sock *);
+
+ #endif
+--- a/net/smc/smc_ib.c
++++ b/net/smc/smc_ib.c
+@@ -16,6 +16,7 @@
+
+ #include "smc_pnet.h"
+ #include "smc_ib.h"
++#include "smc_core.h"
+ #include "smc.h"
+
+ struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
+@@ -29,6 +30,24 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SM
+ * identifier
+ */
+
++/* map a new TX or RX buffer to DMA */
++int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
++ struct smc_buf_desc *buf_slot,
++ enum dma_data_direction data_direction)
++{
++ int rc = 0;
++
++ if (buf_slot->dma_addr[SMC_SINGLE_LINK])
++ return rc; /* already mapped */
++ buf_slot->dma_addr[SMC_SINGLE_LINK] =
++ ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
++ buf_size, data_direction);
++ if (ib_dma_mapping_error(smcibdev->ibdev,
++ buf_slot->dma_addr[SMC_SINGLE_LINK]))
++ rc = -EIO;
++ return rc;
++}
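++
++/* Editor's note, not part of the patch: the dma_addr[] check above
++ * makes mapping idempotent, so smc_sndbuf_create()/smc_rmb_create()
++ * can hand out reused buffers from the sndbufs/rmbs lists without
++ * mapping them a second time.
++ */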
++
+ static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+ {
+ struct net_device *ndev;
+--- a/net/smc/smc_ib.h
++++ b/net/smc/smc_ib.h
+@@ -32,9 +32,14 @@ struct smc_ib_device { /* ib-device i
+ u8 initialized : 1; /* ib dev CQ, evthdl done */
+ };
+
++struct smc_sock;
++struct smc_buf_desc;
++
+ int __init smc_ib_register_client(void);
+ void __exit smc_ib_unregister_client(void);
+ bool smc_ib_port_active(struct smc_ib_device *, u8);
+ int smc_ib_remember_port_attr(struct smc_ib_device *, u8);
++int smc_ib_buf_map(struct smc_ib_device *, int, struct smc_buf_desc *,
++ enum dma_data_direction);
+
+ #endif
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-08.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-08.patch
new file mode 100644
index 0000000000..5e9fe226bb
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-08.patch
@@ -0,0 +1,929 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: work request (WR) base for use by LLC and CDC
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: work request (WR) base for use by LLC and CDC
+
+ The base containers for RDMA transport are work requests and completion
+ queue entries processed through Infiniband verbs:
+ * allocate and initialize these areas
+ * map these areas to DMA
+ * implement the basic communication consisting of work request posting
+	  and receipt of completion queue events
+
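+
+	Editor's sketch, not part of the patch, of the send-side calling
+	sequence this base is built for (the demo_ names are hypothetical):
+
+		static void demo_tx_done(struct smc_wr_tx_pend_priv *priv,
+					 struct smc_link *link,
+					 enum ib_wc_status wc_status)
+		{
+			/* runs from the send CQ tasklet once the CQE arrives */
+		}
+
+		static int demo_send(struct smc_link *link)
+		{
+			struct smc_wr_tx_pend_priv *pend;
+			struct smc_wr_buf *wr_buf;
+			int rc;
+
+			/* reserve one of the SMC_WR_BUF_CNT send slots */
+			rc = smc_wr_tx_get_free_slot(link, demo_tx_done,
+						     &wr_buf, &pend);
+			if (rc)
+				return rc;
+			memset(wr_buf, 0, SMC_WR_TX_SIZE); /* build message */
+			/* post the WR; on error the slot is put back */
+			return smc_wr_tx_send(link, NULL, pend);
+		}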
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/Makefile | 2
+ net/smc/smc.h | 4
+ net/smc/smc_core.c | 9
+ net/smc/smc_core.h | 29 ++
+ net/smc/smc_ib.c | 71 ++++++
+ net/smc/smc_ib.h | 11 +
+ net/smc/smc_wr.c | 563 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_wr.h | 92 ++++++++
+ 8 files changed, 780 insertions(+), 1 deletion(-)
+
+--- a/net/smc/Makefile
++++ b/net/smc/Makefile
+@@ -1,2 +1,2 @@
+ obj-$(CONFIG_SMC) += smc.o
+-smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o
++smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -28,6 +28,10 @@ enum smc_state { /* possible states of
+
+ struct smc_link_group;
+
++struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
++ u8 type;
++} __packed;
++
+ struct smc_connection {
+ struct rb_node alert_node;
+ struct smc_link_group *lgr; /* link group of connection */
+--- a/net/smc/smc_core.c
++++ b/net/smc/smc_core.c
+@@ -19,6 +19,7 @@
+ #include "smc_clc.h"
+ #include "smc_core.h"
+ #include "smc_ib.h"
++#include "smc_wr.h"
+
+ /* Register connection's alert token in our lookup structure.
+ * To use rbtrees we have to implement our own insert core.
+@@ -114,12 +115,20 @@ static int smc_lgr_create(struct smc_soc
+ lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+ get_random_bytes(rndvec, sizeof(rndvec));
+ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
++ rc = smc_wr_alloc_link_mem(lnk);
++ if (rc)
++ goto free_lgr;
++ init_waitqueue_head(&lnk->wr_tx_wait);
+
+ smc->conn.lgr = lgr;
+ rwlock_init(&lgr->conns_lock);
+ spin_lock(&smc_lgr_list.lock);
+ list_add_tail(&lgr->list, &smc_lgr_list.list);
+ spin_unlock(&smc_lgr_list.lock);
++ return 0;
++
++free_lgr:
++ kfree(lgr);
+ out:
+ return rc;
+ }
+--- a/net/smc/smc_core.h
++++ b/net/smc/smc_core.h
+@@ -32,11 +32,40 @@ enum smc_lgr_role { /* possible roles o
+ SMC_SERV /* server */
+ };
+
++#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
++
++struct smc_wr_buf {
++ u8 raw[SMC_WR_BUF_SIZE];
++} __packed;
++
+ struct smc_link {
+ struct smc_ib_device *smcibdev; /* ib-device */
+ u8 ibport; /* port - values 1 | 2 */
++ struct ib_pd *roce_pd; /* IB protection domain,
++ * unique for every RoCE QP
++ */
+ struct ib_qp *roce_qp; /* IB queue pair */
+ struct ib_qp_attr qp_attr; /* IB queue pair attributes */
++
++ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
++ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
++ struct ib_sge *wr_tx_sges; /* WR send gather meta data */
++ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
++ /* above four vectors have wr_tx_cnt elements and use the same index */
++ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
++ atomic64_t wr_tx_id; /* seq # of last sent WR */
++ unsigned long *wr_tx_mask; /* bit mask of used indexes */
++ u32 wr_tx_cnt; /* number of WR send buffers */
++ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
++
++ struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
++ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
++ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
++ /* above three vectors have wr_rx_cnt elements and use the same index */
++ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
++ u64 wr_rx_id; /* seq # of last recv WR */
++ u32 wr_rx_cnt; /* number of WR recv buffers */
++
+ union ib_gid gid; /* gid matching used vlan id */
+ u32 peer_qpn; /* QP number of peer */
+ enum ib_mtu path_mtu; /* used mtu */
+--- a/net/smc/smc_ib.c
++++ b/net/smc/smc_ib.c
+@@ -17,6 +17,7 @@
+ #include "smc_pnet.h"
+ #include "smc_ib.h"
+ #include "smc_core.h"
++#include "smc_wr.h"
+ #include "smc.h"
+
+ struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
+@@ -30,6 +31,76 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SM
+ * identifier
+ */
+
++void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
++{
++ ib_dealloc_pd(lnk->roce_pd);
++ lnk->roce_pd = NULL;
++}
++
++int smc_ib_create_protection_domain(struct smc_link *lnk)
++{
++ lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev);
++	if (IS_ERR(lnk->roce_pd)) {
++		int rc = (int)PTR_ERR(lnk->roce_pd);
++
++		lnk->roce_pd = NULL;
++		return rc;	/* PTR_ERR() after NULLing would return 0 */
++ }
++ return 0;
++}
++
++static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
++{
++ switch (ibevent->event) {
++ case IB_EVENT_DEVICE_FATAL:
++ case IB_EVENT_GID_CHANGE:
++ case IB_EVENT_PORT_ERR:
++ case IB_EVENT_QP_ACCESS_ERR:
++ /* tbd in follow-on patch:
++ * abnormal close of corresponding connections
++ */
++ break;
++ default:
++ break;
++ }
++}
++
++void smc_ib_destroy_queue_pair(struct smc_link *lnk)
++{
++ ib_destroy_qp(lnk->roce_qp);
++ lnk->roce_qp = NULL;
++}
++
++/* create a queue pair within the protection domain for a link */
++int smc_ib_create_queue_pair(struct smc_link *lnk)
++{
++ struct ib_qp_init_attr qp_attr = {
++ .event_handler = smc_ib_qp_event_handler,
++ .qp_context = lnk,
++ .send_cq = lnk->smcibdev->roce_cq_send,
++ .recv_cq = lnk->smcibdev->roce_cq_recv,
++ .srq = NULL,
++ .cap = {
++ .max_send_wr = SMC_WR_BUF_CNT,
++ /* include unsolicited rdma_writes as well,
++ * there are max. 2 RDMA_WRITE per 1 WR_SEND
++ */
++ .max_recv_wr = SMC_WR_BUF_CNT * 3,
++ .max_send_sge = SMC_IB_MAX_SEND_SGE,
++ .max_recv_sge = 1,
++ .max_inline_data = SMC_WR_TX_SIZE,
++ },
++ .sq_sig_type = IB_SIGNAL_REQ_WR,
++ .qp_type = IB_QPT_RC,
++ };
++
++ lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
++	if (IS_ERR(lnk->roce_qp)) {
++		int rc = (int)PTR_ERR(lnk->roce_qp);
++
++		lnk->roce_qp = NULL;
++		return rc;	/* PTR_ERR() after NULLing would return 0 */
++ }
++ smc_wr_remember_qp_attr(lnk);
++ return 0;
++}
++
+ /* map a new TX or RX buffer to DMA */
+ int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+ struct smc_buf_desc *buf_slot,
+--- a/net/smc/smc_ib.h
++++ b/net/smc/smc_ib.h
+@@ -16,6 +16,8 @@
+ #define SMC_MAX_PORTS 2 /* Max # of ports */
+ #define SMC_GID_SIZE sizeof(union ib_gid)
+
++#define SMC_IB_MAX_SEND_SGE 2
++
+ struct smc_ib_devices { /* list of smc ib devices definition */
+ struct list_head list;
+ spinlock_t lock; /* protects list of smc ib devices */
+@@ -27,6 +29,10 @@ struct smc_ib_device { /* ib-device i
+ struct list_head list;
+ struct ib_device *ibdev;
+ struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
++ struct ib_cq *roce_cq_send; /* send completion queue */
++ struct ib_cq *roce_cq_recv; /* recv completion queue */
++ struct tasklet_struct send_tasklet; /* called by send cq handler */
++ struct tasklet_struct recv_tasklet; /* called by recv cq handler */
+ char mac[SMC_MAX_PORTS][6]; /* mac address per port*/
+ union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
+ u8 initialized : 1; /* ib dev CQ, evthdl done */
+@@ -34,6 +40,7 @@ struct smc_ib_device { /* ib-device i
+
+ struct smc_sock;
+ struct smc_buf_desc;
++struct smc_link;
+
+ int __init smc_ib_register_client(void);
+ void __exit smc_ib_unregister_client(void);
+@@ -41,5 +48,9 @@ bool smc_ib_port_active(struct smc_ib_de
+ int smc_ib_remember_port_attr(struct smc_ib_device *, u8);
+ int smc_ib_buf_map(struct smc_ib_device *, int, struct smc_buf_desc *,
+ enum dma_data_direction);
++void smc_ib_dealloc_protection_domain(struct smc_link *);
++int smc_ib_create_protection_domain(struct smc_link *);
++void smc_ib_destroy_queue_pair(struct smc_link *);
++int smc_ib_create_queue_pair(struct smc_link *);
+
+ #endif
+--- /dev/null
++++ b/net/smc/smc_wr.c
+@@ -0,0 +1,563 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Work Requests exploiting Infiniband API
++ *
++ * Work requests (WR) of type ib_post_send or ib_post_recv respectively
++ * are submitted to either RC SQ or RC RQ respectively
++ * (reliably connected send/receive queue)
++ * and become work queue entries (WQEs).
++ * While an SQ WR/WQE is pending, we track it until transmission completion.
++ * Through a send or receive completion queue (CQ) respectively,
++ * we get completion queue entries (CQEs) [aka work completions (WCs)].
++ * Since the CQ callback is called from IRQ context, we split work by using
++ * bottom halves implemented by tasklets.
++ *
++ * SMC uses this to exchange LLC (link layer control)
++ * and CDC (connection data control) messages.
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
++ */
++
++#include <linux/hashtable.h>
++
++#include "smc.h"
++#include "smc_wr.h"
++
++#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
++
++#define SMC_WR_RX_HASH_BITS 4
++static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
++static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
++
++struct smc_wr_tx_pend { /* control data for a pending send request */
++ u64 wr_id; /* work request id sent */
++ smc_wr_tx_handler handler;
++ enum ib_wc_status wc_status; /* CQE status */
++ struct smc_link *link;
++ u32 idx;
++ struct smc_wr_tx_pend_priv priv;
++};
++
++static bool smc_wr_tx_pending_on_link(struct smc_link *link)
++{
++ return find_first_bit(link->wr_tx_mask, link->wr_tx_cnt)
++ != link->wr_tx_cnt;
++}
++
++int smc_wr_tx_wait_no_pending_on_link(struct smc_link *link)
++{
++ int rc = 1;
++
++ if (smc_wr_tx_pending_on_link(link)) {
++ rc = wait_event_interruptible_timeout(
++ link->wr_tx_wait,
++ !smc_wr_tx_pending_on_link(link),
++ SMC_WR_TX_WAIT_PENDING_TIME);
++ }
++ return rc;
++}
++
++static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
++{
++ u32 i;
++
++ for (i = 0; i < link->wr_tx_cnt; i++) {
++ if (link->wr_tx_pends[i].wr_id == wr_id)
++ return i;
++ }
++ return link->wr_tx_cnt;
++}
++
++static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
++{
++ struct smc_wr_tx_pend pnd_snd;
++ struct smc_link *link;
++ u32 pnd_snd_idx;
++ int i;
++
++ link = wc->qp->qp_context;
++ pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
++ if (pnd_snd_idx == link->wr_tx_cnt)
++ return;
++ link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
++ memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
++ if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
++ return;
++ if (wc->status) {
++ if ((wc->status == IB_WC_RETRY_EXC_ERR) ||
++ (wc->status == IB_WC_RNR_RETRY_EXC_ERR)) {
++ pr_warn("smc overload on link %p - reason %s\n",
++ link,
++ (wc->status == IB_WC_RETRY_EXC_ERR) ?
++ "retry" : "rnr");
++ }
++ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt)
++ clear_bit(i, link->wr_tx_mask);
++ /* tbd in future patch: terminate connections of this link
++ * group abnormally
++ */
++ }
++ if (pnd_snd.handler)
++ pnd_snd.handler(&pnd_snd.priv, link, wc->status);
++ wake_up(&link->wr_tx_wait);
++}
++
++void smc_wr_tx_tasklet_fn(unsigned long data)
++{
++ struct smc_ib_device *dev = (struct smc_ib_device *)data;
++ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
++ int i = 0, rc;
++ int polled = 0;
++
++again:
++ polled++;
++ do {
++ rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
++ if (polled == 1) {
++ ib_req_notify_cq(dev->roce_cq_send,
++ IB_CQ_NEXT_COMP |
++ IB_CQ_REPORT_MISSED_EVENTS);
++ }
++ if (!rc)
++ break;
++ for (i = 0; i < rc; i++)
++ smc_wr_tx_process_cqe(&wc[i]);
++ } while (rc > 0);
++ if (polled == 1)
++ goto again;
++}
++
++void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
++{
++ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
++
++ tasklet_schedule(&dev->send_tasklet);
++}
++
++static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
++{
++ *idx = link->wr_tx_cnt;
++ for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
++ if (!test_and_set_bit(*idx, link->wr_tx_mask))
++ return 0;
++ }
++ *idx = link->wr_tx_cnt;
++ return -EBUSY;
++}
++
++/**
++ * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
++ * and sets info for pending transmit tracking
++ * @link: Pointer to smc_link used to later send the message.
++ * @handler: Send completion handler function pointer.
++ * @wr_buf: Out value returns pointer to message buffer.
++ * @wr_pend_priv: Out value returns pointer serving as handler context.
++ *
++ * Return: 0 on success, or -errno on error.
++ */
++int smc_wr_tx_get_free_slot(struct smc_link *link,
++ smc_wr_tx_handler handler,
++ struct smc_wr_buf **wr_buf,
++ struct smc_wr_tx_pend_priv **wr_pend_priv)
++{
++ struct smc_wr_tx_pend *wr_pend;
++ struct ib_send_wr *wr_ib;
++ u64 wr_id;
++ u32 idx;
++ int rc;
++
++ *wr_buf = NULL;
++ *wr_pend_priv = NULL;
++ if (in_softirq()) {
++ rc = smc_wr_tx_get_free_slot_index(link, &idx);
++ if (rc)
++ return rc;
++ } else {
++ rc = wait_event_interruptible_timeout(
++ link->wr_tx_wait,
++ (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
++ SMC_WR_TX_WAIT_FREE_SLOT_TIME);
++ if (!rc) {
++ /* tbd in future patch: timeout - terminate connections
++ * of this link group abnormally
++ */
++ return -EPIPE;
++ }
++ if (rc == -ERESTARTSYS)
++ return -EINTR;
++ if (idx == link->wr_tx_cnt)
++ return -EPIPE;
++ }
++ wr_id = smc_wr_tx_get_next_wr_id(link);
++ wr_pend = &link->wr_tx_pends[idx];
++ /* clear the full struct smc_wr_tx_pend including .priv */
++ memset(wr_pend, 0, sizeof(*wr_pend));
++ wr_pend->wr_id = wr_id;
++ wr_pend->handler = handler;
++ wr_pend->link = link;
++ wr_pend->idx = idx;
++ wr_ib = &link->wr_tx_ibs[idx];
++ wr_ib->wr_id = wr_id;
++ *wr_buf = &link->wr_tx_bufs[idx];
++ *wr_pend_priv = &wr_pend->priv;
++ return 0;
++}
++
++int smc_wr_tx_put_slot(struct smc_link *link,
++ struct smc_wr_tx_pend_priv *wr_pend_priv)
++{
++ struct smc_wr_tx_pend *pend;
++
++ pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
++ if (pend->idx < link->wr_tx_cnt) {
++ test_and_clear_bit(pend->idx, link->wr_tx_mask);
++ return 1;
++ }
++
++ return 0;
++}
++
++/* Send prepared WR slot via ib_post_send.
++ * Requires conn->send_lock being held if entered with an smc_connection
++ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
++ */
++int smc_wr_tx_send(struct smc_link *link, struct smc_connection *conn,
++ struct smc_wr_tx_pend_priv *priv)
++{
++ struct ib_send_wr *failed_wr = NULL;
++ struct smc_wr_tx_pend *pend;
++ int rc;
++
++ ib_req_notify_cq(link->smcibdev->roce_cq_send,
++ IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
++ pend = container_of(priv, struct smc_wr_tx_pend, priv);
++ rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
++ &failed_wr);
++ if (rc)
++ smc_wr_tx_put_slot(link, priv);
++ return rc;
++}
++
++int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
++{
++ struct smc_wr_rx_handler *h_iter;
++ int rc = 0;
++
++ spin_lock(&smc_wr_rx_hash_lock);
++ hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
++ if (h_iter->type == handler->type) {
++ rc = -EEXIST;
++ goto out_unlock;
++ }
++ }
++ hash_add(smc_wr_rx_hash, &handler->list, handler->type);
++out_unlock:
++ spin_unlock(&smc_wr_rx_hash_lock);
++ return rc;
++}
++
++/* Demultiplex a received work request based on the message type to its handler.
++ * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
++ * and not being modified any more afterwards so we don't need to lock it.
++ */
++static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
++{
++ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
++ struct smc_wr_rx_handler *handler;
++ struct smc_wr_rx_hdr *wr_rx;
++ u32 index;
++
++ if (wc->byte_len < sizeof(*wr_rx))
++ return; /* short message */
++ index = wc->wr_id % link->wr_rx_cnt;
++ wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
++ hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
++ if (handler->type == wr_rx->type)
++ handler->handler(wc, wr_rx);
++ }
++}
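++
++/* Editor's sketch, not part of the patch: how the LLC/CDC layers of
++ * later patches are expected to hook into this demultiplexer; the
++ * demo_ names and the type value are hypothetical.
++ */
++static void demo_rx_handler(struct ib_wc *wc, void *buf)
++{
++	/* buf points at the smc_wr_rx_hdr inside link->wr_rx_bufs[] */
++}
++
++static struct smc_wr_rx_handler demo_rx_hdl = {
++	.handler	= demo_rx_handler,
++	.type		= 0xfe,	/* matched against the first message byte */
++};
++
++static int demo_rx_register(void)
++{
++	return smc_wr_rx_register_handler(&demo_rx_hdl); /* -EEXIST if dup */
++}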
++
++static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
++{
++ struct smc_link *link;
++ int i;
++
++ for (i = 0; i < num; i++) {
++ link = wc[i].qp->qp_context;
++ if (wc[i].status == IB_WC_SUCCESS) {
++ smc_wr_rx_demultiplex(&wc[i]);
++ smc_wr_rx_post(link); /* refill WR RX */
++ } else {
++ /* handle status errors */
++ switch (wc[i].status) {
++ case IB_WC_RETRY_EXC_ERR:
++ case IB_WC_RNR_RETRY_EXC_ERR:
++ case IB_WC_WR_FLUSH_ERR:
++ /* tbd in future patch: terminate connections of this
++ * link group abnormally
++ */
++ break;
++ default:
++ smc_wr_rx_post(link); /* refill WR RX */
++ break;
++ }
++ continue;
++ }
++ }
++}
++
++void smc_wr_rx_tasklet_fn(unsigned long data)
++{
++ struct smc_ib_device *dev = (struct smc_ib_device *)data;
++ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
++ int polled = 0;
++ int rc;
++
++again:
++ polled++;
++ do {
++ memset(&wc, 0, sizeof(wc));
++ rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
++ if (polled == 1) {
++ ib_req_notify_cq(dev->roce_cq_recv,
++ IB_CQ_SOLICITED_MASK
++ | IB_CQ_REPORT_MISSED_EVENTS);
++ }
++ if (!rc)
++ break;
++ smc_wr_rx_process_cqes(&wc[0], rc);
++ } while (rc > 0);
++ if (polled == 1)
++ goto again;
++}
++
++void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
++{
++ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
++
++ tasklet_schedule(&dev->recv_tasklet);
++}
++
++int smc_wr_rx_post_init(struct smc_link *link)
++{
++ u32 i;
++ int rc = 0;
++
++ for (i = 0; i < link->wr_rx_cnt; i++)
++ rc = smc_wr_rx_post(link);
++ return rc;
++}
++
++void smc_wr_remember_qp_attr(struct smc_link *lnk)
++{
++ struct ib_qp_attr *attr = &lnk->qp_attr;
++ struct ib_qp_init_attr init_attr;
++
++ memset(attr, 0, sizeof(*attr));
++ memset(&init_attr, 0, sizeof(init_attr));
++ ib_query_qp(lnk->roce_qp, attr,
++ IB_QP_STATE |
++ IB_QP_CUR_STATE |
++ IB_QP_PKEY_INDEX |
++ IB_QP_PORT |
++ IB_QP_QKEY |
++ IB_QP_AV |
++ IB_QP_PATH_MTU |
++ IB_QP_TIMEOUT |
++ IB_QP_RETRY_CNT |
++ IB_QP_RNR_RETRY |
++ IB_QP_RQ_PSN |
++ IB_QP_ALT_PATH |
++ IB_QP_MIN_RNR_TIMER |
++ IB_QP_SQ_PSN |
++ IB_QP_PATH_MIG_STATE |
++ IB_QP_CAP |
++ IB_QP_DEST_QPN,
++ &init_attr);
++
++ lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
++ lnk->qp_attr.cap.max_send_wr);
++ lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
++ lnk->qp_attr.cap.max_recv_wr);
++}
++
++static void smc_wr_init_sge(struct smc_link *lnk)
++{
++ u32 i;
++
++ for (i = 0; i < lnk->wr_tx_cnt; i++) {
++ lnk->wr_tx_sges[i].addr =
++ lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
++ lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
++ lnk->wr_tx_ibs[i].next = NULL;
++ lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
++ lnk->wr_tx_ibs[i].num_sge = 1;
++ lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
++ lnk->wr_tx_ibs[i].send_flags =
++ IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE;
++ }
++ for (i = 0; i < lnk->wr_rx_cnt; i++) {
++ lnk->wr_rx_sges[i].addr =
++ lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
++ lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
++ lnk->wr_rx_ibs[i].next = NULL;
++ lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
++ lnk->wr_rx_ibs[i].num_sge = 1;
++ }
++}
++
++void smc_wr_free_link(struct smc_link *lnk)
++{
++ struct ib_device *ibdev;
++
++ memset(lnk->wr_tx_mask, 0,
++ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
++
++ if (!lnk->smcibdev)
++ return;
++ ibdev = lnk->smcibdev->ibdev;
++
++ if (lnk->wr_rx_dma_addr) {
++ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
++ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
++ DMA_FROM_DEVICE);
++ lnk->wr_rx_dma_addr = 0;
++ }
++ if (lnk->wr_tx_dma_addr) {
++ ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
++ SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
++ DMA_TO_DEVICE);
++ lnk->wr_tx_dma_addr = 0;
++ }
++}
++
++void smc_wr_free_link_mem(struct smc_link *lnk)
++{
++ kfree(lnk->wr_tx_pends);
++ lnk->wr_tx_pends = NULL;
++ kfree(lnk->wr_tx_mask);
++ lnk->wr_tx_mask = NULL;
++ kfree(lnk->wr_tx_sges);
++ lnk->wr_tx_sges = NULL;
++ kfree(lnk->wr_rx_sges);
++ lnk->wr_rx_sges = NULL;
++ kfree(lnk->wr_rx_ibs);
++ lnk->wr_rx_ibs = NULL;
++ kfree(lnk->wr_tx_ibs);
++ lnk->wr_tx_ibs = NULL;
++ kfree(lnk->wr_tx_bufs);
++ lnk->wr_tx_bufs = NULL;
++ kfree(lnk->wr_rx_bufs);
++ lnk->wr_rx_bufs = NULL;
++}
++
++int smc_wr_alloc_link_mem(struct smc_link *link)
++{
++ /* allocate link related memory */
++ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
++ if (!link->wr_tx_bufs)
++ goto no_mem;
++ link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
++ GFP_KERNEL);
++ if (!link->wr_rx_bufs)
++ goto no_mem_wr_tx_bufs;
++ link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
++ GFP_KERNEL);
++ if (!link->wr_tx_ibs)
++ goto no_mem_wr_rx_bufs;
++ link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
++ sizeof(link->wr_rx_ibs[0]),
++ GFP_KERNEL);
++ if (!link->wr_rx_ibs)
++ goto no_mem_wr_tx_ibs;
++ link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
++ GFP_KERNEL);
++ if (!link->wr_tx_sges)
++ goto no_mem_wr_rx_ibs;
++ link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
++ sizeof(link->wr_rx_sges[0]),
++ GFP_KERNEL);
++ if (!link->wr_rx_sges)
++ goto no_mem_wr_tx_sges;
++ link->wr_tx_mask = kzalloc(
++ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask),
++ GFP_KERNEL);
++ if (!link->wr_tx_mask)
++ goto no_mem_wr_rx_sges;
++ link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
++ sizeof(link->wr_tx_pends[0]),
++ GFP_KERNEL);
++ if (!link->wr_tx_pends)
++ goto no_mem_wr_tx_mask;
++ return 0;
++
++no_mem_wr_tx_mask:
++ kfree(link->wr_tx_mask);
++no_mem_wr_rx_sges:
++ kfree(link->wr_rx_sges);
++no_mem_wr_tx_sges:
++ kfree(link->wr_tx_sges);
++no_mem_wr_rx_ibs:
++ kfree(link->wr_rx_ibs);
++no_mem_wr_tx_ibs:
++ kfree(link->wr_tx_ibs);
++no_mem_wr_rx_bufs:
++ kfree(link->wr_rx_bufs);
++no_mem_wr_tx_bufs:
++ kfree(link->wr_tx_bufs);
++no_mem:
++ return -ENOMEM;
++}
++
++void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
++{
++ tasklet_kill(&smcibdev->recv_tasklet);
++ tasklet_kill(&smcibdev->send_tasklet);
++}
++
++void smc_wr_add_dev(struct smc_ib_device *smcibdev)
++{
++ tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
++ (unsigned long)smcibdev);
++ tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
++ (unsigned long)smcibdev);
++}
++
++int smc_wr_create_lgr(struct smc_link *lnk)
++{
++ struct ib_device *ibdev = lnk->smcibdev->ibdev;
++ int rc = 0;
++
++ atomic64_set(&lnk->wr_tx_id, 0);
++ lnk->wr_rx_id = 0;
++ lnk->wr_rx_dma_addr = ib_dma_map_single(
++ ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
++ DMA_FROM_DEVICE);
++ if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
++ lnk->wr_rx_dma_addr = 0;
++ rc = -EIO;
++ goto out;
++ }
++ lnk->wr_tx_dma_addr = ib_dma_map_single(
++ ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
++ DMA_TO_DEVICE);
++ if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
++ rc = -EIO;
++ goto dma_unmap;
++ }
++ smc_wr_init_sge(lnk);
++ memset(lnk->wr_tx_mask, 0,
++ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
++ return rc;
++
++dma_unmap:
++ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
++ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
++ DMA_FROM_DEVICE);
++ lnk->wr_rx_dma_addr = 0;
++out:
++ return rc;
++}
+--- /dev/null
++++ b/net/smc/smc_wr.h
+@@ -0,0 +1,92 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Work Requests exploiting Infiniband API
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
++ */
++
++#ifndef SMC_WR_H
++#define SMC_WR_H
++
++#include <rdma/ib_verbs.h>
++
++#include "smc.h"
++#include "smc_core.h"
++
++#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */
++#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
++
++#define SMC_WR_TX_WAIT_FREE_SLOT_TIME HZ
++#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
++
++#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
++
++#define SMC_WR_TX_PEND_PRIV_SIZE 32
++
++struct smc_wr_tx_pend_priv {
++ u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
++};
++
++typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
++ struct smc_link *,
++ enum ib_wc_status);
++
++struct smc_wr_rx_handler {
++ struct hlist_node list; /* hash table collision resolution */
++ void (*handler)(struct ib_wc *, void *);
++ u8 type;
++};
++
++/* Only used by RDMA write WRs.
++ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
++ */
++static inline u64 smc_wr_tx_get_next_wr_id(struct smc_link *link)
++{
++ return atomic64_inc_return(&link->wr_tx_id);
++}
++
++/* post a new receive work request to fill a completed old work request entry */
++static inline int smc_wr_rx_post(struct smc_link *link)
++{
++ struct ib_recv_wr *bad_recv_wr = NULL;
++ int rc = 0;
++ u64 wr_id;
++ u32 index;
++
++ wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
++ index = wr_id % link->wr_rx_cnt;
++ link->wr_rx_ibs[index].wr_id = wr_id;
++ rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr);
++ return rc;
++}
++
++struct smc_connection;
++
++int smc_wr_create_lgr(struct smc_link *);
++int smc_wr_alloc_link_mem(struct smc_link *);
++void smc_wr_free_link(struct smc_link *);
++void smc_wr_free_link_mem(struct smc_link *);
++void smc_wr_remember_qp_attr(struct smc_link *);
++void smc_wr_cq_event_handler(struct ib_event *, void *);
++void smc_wr_remove_dev(struct smc_ib_device *);
++void smc_wr_add_dev(struct smc_ib_device *);
++
++int smc_wr_tx_wait_no_pending_on_link(struct smc_link *);
++int smc_wr_tx_get_free_slot(struct smc_link *, smc_wr_tx_handler,
++ struct smc_wr_buf **,
++ struct smc_wr_tx_pend_priv **);
++int smc_wr_tx_put_slot(struct smc_link *, struct smc_wr_tx_pend_priv *);
++int smc_wr_tx_send(struct smc_link *, struct smc_connection *,
++ struct smc_wr_tx_pend_priv *);
++void smc_wr_tx_tasklet_fn(unsigned long);
++void smc_wr_tx_cq_handler(struct ib_cq *, void *);
++
++int smc_wr_rx_register_handler(struct smc_wr_rx_handler *);
++int smc_wr_rx_post_init(struct smc_link *);
++void smc_wr_rx_tasklet_fn(unsigned long);
++void smc_wr_rx_cq_handler(struct ib_cq *, void *);
++
++#endif /* SMC_WR_H */
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-09.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-09.patch
new file mode 100644
index 0000000000..d1800b56e3
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-09.patch
@@ -0,0 +1,611 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR
+
+ Prepare the link for RDMA transport:
+ Create a queue pair (QP) and move it into the state Ready-To-Receive (RTR).
+
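+
+	Editor's sketch, not part of the patch, of the verbs state machine
+	that smc_ib_ready_link() below drives (demo_ready() is hypothetical,
+	the smc_ib_* and smc_wr_* helpers are the ones added here):
+
+		static int demo_ready(struct smc_link *lnk,
+				      enum smc_lgr_role role)
+		{
+			int rc;
+
+			rc = smc_ib_modify_qp_init(lnk); /* RESET -> INIT */
+			if (rc)
+				return rc;
+			rc = smc_ib_modify_qp_rtr(lnk);	 /* INIT -> RTR */
+			if (rc)
+				return rc;
+			rc = smc_wr_rx_post_init(lnk);	 /* pre-post recv WRs */
+			if (rc)
+				return rc;
+			if (role == SMC_SERV)	/* the client moves to RTS
+						 * after the LLC handshake
+						 */
+				rc = smc_ib_modify_qp_rts(lnk);
+			return rc;
+		}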
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/af_smc.c | 34 ++++++--
+ net/smc/smc.h | 1
+ net/smc/smc_clc.c | 10 +-
+ net/smc/smc_core.c | 68 ++++++++++++++++
+ net/smc/smc_core.h | 18 ++++
+ net/smc/smc_ib.c | 217 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_ib.h | 10 ++
+ net/smc/smc_pnet.c | 3
+ net/smc/smc_wr.c | 2
+ 9 files changed, 354 insertions(+), 9 deletions(-)
+
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -360,9 +360,20 @@ static int smc_connect_rdma(struct smc_s
+
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_link_save_peer_info(link, &aclc);
+-	/* tbd in follow-on patch: more steps to setup RDMA communication,
+- * create rmbs, map rmbs, rtoken_handling, modify_qp
+- */
++
++ rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
++ if (rc) {
++ reason_code = SMC_CLC_DECL_INTERR;
++ goto decline_rdma_unlock;
++ }
++
++ if (local_contact == SMC_FIRST_CONTACT) {
++ rc = smc_ib_ready_link(link);
++ if (rc) {
++ reason_code = SMC_CLC_DECL_INTERR;
++ goto decline_rdma_unlock;
++ }
++ }
+
+ rc = smc_clc_send_confirm(smc);
+ if (rc)
+@@ -656,9 +667,20 @@ static void smc_listen_worker(struct wor
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_link_save_peer_info(link, &cclc);
+
+-	/* tbd in follow-on patch: more steps to setup RDMA communication,
+- * rtoken_handling, modify_qp
+- */
++ rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
++ if (rc) {
++ reason_code = SMC_CLC_DECL_INTERR;
++ goto decline_rdma;
++ }
++
++ /* tbd in follow-on patch: modify_qp, llc_confirm */
++ if (local_contact == SMC_FIRST_CONTACT) {
++ rc = smc_ib_ready_link(link);
++ if (rc) {
++ reason_code = SMC_CLC_DECL_INTERR;
++ goto decline_rdma;
++ }
++ }
+
+ out_connected:
+ sk_refcnt_debug_inc(newsmcsk);
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -41,6 +41,7 @@ struct smc_connection {
+ atomic_t peer_rmbe_space;/* remaining free bytes in peer
+ * rmbe
+ */
++ int rtoken_idx; /* idx to peer RMB rkey/addr */
+
+ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
+ int sndbuf_size; /* sndbuf size <== sock wmem */
+--- a/net/smc/smc_clc.c
++++ b/net/smc/smc_clc.c
+@@ -190,13 +190,15 @@ int smc_clc_send_confirm(struct smc_sock
+ SMC_GID_SIZE);
+ memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
+ sizeof(link->smcibdev->mac));
+-
+- /* tbd in follow-on patch: fill in rmb-related values */
+-
+ hton24(cclc.qpn, link->roce_qp->qp_num);
++ cclc.rmb_rkey =
++ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ cclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
++ cclc.rmbe_size = conn->rmbe_size_short;
++ cclc.rmb_dma_addr =
++ cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+ hton24(cclc.psn, link->psn_initial);
+
+ memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+@@ -242,6 +244,8 @@ int smc_clc_send_accept(struct smc_sock
+ memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
+ sizeof(link->smcibdev->mac[link->ibport - 1]));
+ hton24(aclc.qpn, link->roce_qp->qp_num);
++ aclc.rmb_rkey =
++ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
+ aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ aclc.qp_mtu = link->path_mtu;
+--- a/net/smc/smc_core.c
++++ b/net/smc/smc_core.c
+@@ -119,6 +119,19 @@ static int smc_lgr_create(struct smc_soc
+ if (rc)
+ goto free_lgr;
+ init_waitqueue_head(&lnk->wr_tx_wait);
++ rc = smc_ib_create_protection_domain(lnk);
++ if (rc)
++ goto free_link_mem;
++ rc = smc_ib_get_memory_region(lnk->roce_pd, IB_ACCESS_LOCAL_WRITE,
++ &lnk->mr_tx);
++ if (rc)
++ goto dealloc_pd;
++ rc = smc_ib_create_queue_pair(lnk);
++ if (rc)
++ goto dereg_mr;
++ rc = smc_wr_create_lgr(lnk);
++ if (rc)
++ goto destroy_qp;
+
+ smc->conn.lgr = lgr;
+ rwlock_init(&lgr->conns_lock);
+@@ -127,6 +140,14 @@ static int smc_lgr_create(struct smc_soc
+ spin_unlock(&smc_lgr_list.lock);
+ return 0;
+
++destroy_qp:
++ smc_ib_destroy_queue_pair(lnk);
++dereg_mr:
++ smc_ib_dereg_memory_region(lnk->mr_tx);
++dealloc_pd:
++ smc_ib_dealloc_protection_domain(lnk);
++free_link_mem:
++ smc_wr_free_link_mem(lnk);
+ free_lgr:
+ kfree(lgr);
+ out:
+@@ -468,6 +489,18 @@ int smc_rmb_create(struct smc_sock *smc)
+ 			kfree(rmb_desc);
+ 			rmb_desc = NULL;
+ continue; /* if mapping failed, try smaller one */
+ }
++ rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
++ IB_ACCESS_REMOTE_WRITE |
++ IB_ACCESS_LOCAL_WRITE,
++ &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
++ if (rc) {
++ smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
++ tmp_bufsize, rmb_desc,
++ DMA_FROM_DEVICE);
++ kfree(rmb_desc->cpu_addr);
++			kfree(rmb_desc);
++			rmb_desc = NULL;
++ continue;
++ }
+ rmb_desc->used = 1;
+ write_lock_bh(&lgr->rmbs_lock);
+ list_add(&rmb_desc->list,
+@@ -484,3 +517,38 @@ int smc_rmb_create(struct smc_sock *smc)
+ return -ENOMEM;
+ }
+ }
++
++static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
++{
++ int i;
++
++ for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
++ if (!test_and_set_bit(i, lgr->rtokens_used_mask))
++ return i;
++ }
++ return -ENOSPC;
++}
++
++/* save rkey and dma_addr received from peer during clc handshake */
++int smc_rmb_rtoken_handling(struct smc_connection *conn,
++ struct smc_clc_msg_accept_confirm *clc)
++{
++ u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr);
++ struct smc_link_group *lgr = conn->lgr;
++ u32 rkey = ntohl(clc->rmb_rkey);
++ int i;
++
++ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
++ if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
++ test_bit(i, lgr->rtokens_used_mask)) {
++ conn->rtoken_idx = i;
++ return 0;
++ }
++ }
++ conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr);
++ if (conn->rtoken_idx < 0)
++ return conn->rtoken_idx;
++ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey;
++ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr;
++ return 0;
++}
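++
++/* Editor's note, not part of the patch: a later RDMA write into the
++ * peer RMB resolves its target through
++ * lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr and .rkey
++ * saved here during the CLC handshake.
++ */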
+--- a/net/smc/smc_core.h
++++ b/net/smc/smc_core.h
+@@ -47,6 +47,8 @@ struct smc_link {
+ struct ib_qp *roce_qp; /* IB queue pair */
+ struct ib_qp_attr qp_attr; /* IB queue pair attributes */
+
++ struct ib_mr *mr_tx; /* send IB DMA memory region */
++
+ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
+ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
+ struct ib_sge *wr_tx_sges; /* WR send gather meta data */
+@@ -91,9 +93,17 @@ struct smc_buf_desc {
+ u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
+ /* mapped address of buffer */
+ void *cpu_addr; /* virtual address of buffer */
++ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; /* for rmb only:
++ * rkey provided to peer
++ */
+ u32 used; /* currently used / unused */
+ };
+
++struct smc_rtoken { /* address/key of remote RMB */
++ u64 dma_addr;
++ u32 rkey;
++};
++
+ struct smc_link_group {
+ struct list_head list;
+ enum smc_lgr_role role; /* client or server */
+@@ -110,6 +120,12 @@ struct smc_link_group {
+ rwlock_t sndbufs_lock; /* protects tx buffers */
+ struct list_head rmbs[SMC_RMB_SIZES]; /* rx buffers */
+ rwlock_t rmbs_lock; /* protects rx buffers */
++ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
++ [SMC_LINKS_PER_LGR_MAX];
++ /* remote addr/key pairs */
++ unsigned long rtokens_used_mask[BITS_TO_LONGS(
++ SMC_RMBS_PER_LGR_MAX)];
++ /* used rtoken elements */
+ };
+
+ /* Find the connection associated with the given alert token in the link group.
+@@ -150,5 +166,7 @@ struct smc_clc_msg_accept_confirm;
+ void smc_lgr_free(struct smc_link_group *);
+ int smc_sndbuf_create(struct smc_sock *);
+ int smc_rmb_create(struct smc_sock *);
++int smc_rmb_rtoken_handling(struct smc_connection *,
++ struct smc_clc_msg_accept_confirm *);
+
+ #endif
+--- a/net/smc/smc_ib.c
++++ b/net/smc/smc_ib.c
+@@ -20,6 +20,11 @@
+ #include "smc_wr.h"
+ #include "smc.h"
+
++#define SMC_QP_MIN_RNR_TIMER 5
++#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */
++#define SMC_QP_RETRY_CNT 7 /* 7: infinite */
++#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
++
+ struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
+ .list = LIST_HEAD_INIT(smc_ib_devices.list),
+@@ -31,6 +36,200 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SM
+ * identifier
+ */
+
++void smc_ib_dereg_memory_region(struct ib_mr *mr)
++{
++ ib_dereg_mr(mr);
++}
++
++int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
++ struct ib_mr **mr)
++{
++	int rc;
++
++	if (*mr)
++		return 0; /* already done */
++	/* obtain unique key -
++	 * next invocation of ib_get_dma_mr returns a different key!
++	 */
++	*mr = ib_get_dma_mr(pd, access_flags);
++	rc = PTR_ERR_OR_ZERO(*mr);	/* save rc before clearing *mr */
++	if (IS_ERR(*mr))
++		*mr = NULL;
++	return rc;
++}
++
++static int smc_ib_modify_qp_init(struct smc_link *lnk)
++{
++ struct ib_qp_attr qp_attr;
++ int rc = 0;
++
++ memset(&qp_attr, 0, sizeof(qp_attr));
++ qp_attr.qp_state = IB_QPS_INIT;
++ qp_attr.pkey_index = 0;
++ qp_attr.port_num = lnk->ibport;
++ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
++ | IB_ACCESS_REMOTE_WRITE;
++ rc = ib_modify_qp(lnk->roce_qp, &qp_attr,
++ IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_ACCESS_FLAGS |
++ IB_QP_PORT);
++ return rc;
++}
++
++static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
++{
++ enum ib_qp_attr_mask qp_attr_mask =
++ IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
++ IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
++ struct ib_qp_attr qp_attr;
++ int rc = 0;
++
++ memset(&qp_attr, 0, sizeof(qp_attr));
++ qp_attr.qp_state = IB_QPS_RTR;
++ qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
++ qp_attr.ah_attr.port_num = lnk->ibport;
++ qp_attr.ah_attr.ah_flags = IB_AH_GRH;
++ qp_attr.ah_attr.grh.hop_limit = 1;
++ memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid,
++ sizeof(lnk->peer_gid));
++ memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac,
++ sizeof(lnk->peer_mac));
++ qp_attr.dest_qp_num = lnk->peer_qpn;
++ qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
++ qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
++ * requests
++ */
++ qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
++
++ rc = ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
++ return rc;
++}
++
++int smc_ib_modify_qp_rts(struct smc_link *lnk)
++{
++ struct ib_qp_attr qp_attr;
++ int rc = 0;
++
++ memset(&qp_attr, 0, sizeof(qp_attr));
++ qp_attr.qp_state = IB_QPS_RTS;
++ qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */
++ qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */
++ qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */
++ qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */
++ qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and
++ * atomic ops allowed
++ */
++ rc = ib_modify_qp(lnk->roce_qp, &qp_attr,
++ IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
++ IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
++ IB_QP_MAX_QP_RD_ATOMIC);
++ return rc;
++}
++
++int smc_ib_ready_link(struct smc_link *lnk)
++{
++ struct smc_link_group *lgr =
++ container_of(lnk, struct smc_link_group, lnk[0]);
++ int rc = 0;
++
++ rc = smc_ib_modify_qp_init(lnk);
++ if (rc)
++ goto out;
++
++ rc = smc_ib_modify_qp_rtr(lnk);
++ if (rc)
++ goto out;
++ smc_wr_remember_qp_attr(lnk);
++ rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
++ IB_CQ_SOLICITED_MASK);
++ if (rc)
++ goto out;
++ rc = smc_wr_rx_post_init(lnk);
++ if (rc)
++ goto out;
++ smc_wr_remember_qp_attr(lnk);
++
++ if (lgr->role == SMC_SERV) {
++ rc = smc_ib_modify_qp_rts(lnk);
++ if (rc)
++ goto out;
++ smc_wr_remember_qp_attr(lnk);
++ }
++out:
++ return rc;
++}
++
++/* process context wrapper for might_sleep smc_ib_remember_port_attr */
++static void smc_ib_port_event_work(struct work_struct *work)
++{
++ struct smc_ib_device *smcibdev = container_of(
++ work, struct smc_ib_device, port_event_work);
++ u8 port_idx;
++
++ for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
++ smc_ib_remember_port_attr(smcibdev, port_idx + 1);
++ clear_bit(port_idx, &smcibdev->port_event_mask);
++ }
++}
++
++/* can be called in IRQ context */
++static void smc_ib_global_event_handler(struct ib_event_handler *handler,
++ struct ib_event *ibevent)
++{
++ struct smc_ib_device *smcibdev;
++ u8 port_idx;
++
++ smcibdev = container_of(handler, struct smc_ib_device, event_handler);
++ switch (ibevent->event) {
++ case IB_EVENT_PORT_ERR:
++ port_idx = ibevent->element.port_num - 1;
++ set_bit(port_idx, &smcibdev->port_event_mask);
++ schedule_work(&smcibdev->port_event_work);
++ /* fall through */
++ case IB_EVENT_DEVICE_FATAL:
++ /* tbd in follow-on patch:
++ * abnormal close of corresponding connections
++ */
++ break;
++ case IB_EVENT_PORT_ACTIVE:
++ port_idx = ibevent->element.port_num - 1;
++ set_bit(port_idx, &smcibdev->port_event_mask);
++ schedule_work(&smcibdev->port_event_work);
++ break;
++ default:
++ break;
++ }
++}
++
++long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
++{
++ struct ib_cq_init_attr cqattr = {
++ .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
++ long rc = 0;
++
++ smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
++ smc_wr_tx_cq_handler, NULL,
++ smcibdev, &cqattr);
++ if (IS_ERR(smcibdev->roce_cq_send)) {
++ rc = PTR_ERR(smcibdev->roce_cq_send);
++ goto err;
++ }
++ smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
++ smc_wr_rx_cq_handler, NULL,
++ smcibdev, &cqattr);
++ if (IS_ERR(smcibdev->roce_cq_recv)) {
++ rc = PTR_ERR(smcibdev->roce_cq_recv);
++ goto err_cq;
++ }
++ INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
++ smc_ib_global_event_handler);
++ ib_register_event_handler(&smcibdev->event_handler);
++ smc_wr_add_dev(smcibdev);
++ return rc;
++
++err_cq:
++ ib_destroy_cq(smcibdev->roce_cq_send);
++err:
++ return rc;
++}
++
+ void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
+ {
+ ib_dealloc_pd(lnk->roce_pd);
+@@ -119,6 +318,17 @@ int smc_ib_buf_map(struct smc_ib_device
+ return rc;
+ }
+
++void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
++ struct smc_buf_desc *buf_slot,
++ enum dma_data_direction data_direction)
++{
++ if (!buf_slot->used)
++ return; /* already unmapped */
++ ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size,
++ data_direction);
++ buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
++}
++
+ static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+ {
+ struct net_device *ndev;
+@@ -196,6 +406,7 @@ static void smc_ib_add_dev(struct ib_dev
+ return;
+
+ smcibdev->ibdev = ibdev;
++ INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
+
+ for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (smc_pnet_exists_in_table(smcibdev, i) &&
+@@ -205,6 +416,10 @@ static void smc_ib_add_dev(struct ib_dev
+ kfree(smcibdev);
+ return;
+ }
++ if (smc_ib_setup_per_ibdev(smcibdev)) {
++ kfree(smcibdev);
++ return;
++ }
+ smcibdev->initialized = 1;
+ break;
+ }
+@@ -221,10 +436,12 @@ static void smc_ib_remove_dev(struct ib_
+ struct smc_ib_device *smcibdev;
+
+ smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
++ smc_wr_remove_dev(smcibdev);
+ ib_set_client_data(ibdev, &smc_ib_client, NULL);
+ spin_lock(&smc_ib_devices.lock);
+ list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
+ spin_unlock(&smc_ib_devices.lock);
++ cancel_work_sync(&smcibdev->port_event_work);
+ kfree(smcibdev);
+ }
+
+--- a/net/smc/smc_ib.h
++++ b/net/smc/smc_ib.h
+@@ -29,6 +29,7 @@ struct smc_ib_device { /* ib-device i
+ struct list_head list;
+ struct ib_device *ibdev;
+ struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
++ struct ib_event_handler event_handler; /* global ib_event handler */
+ struct ib_cq *roce_cq_send; /* send completion queue */
+ struct ib_cq *roce_cq_recv; /* recv completion queue */
+ struct tasklet_struct send_tasklet; /* called by send cq handler */
+@@ -36,6 +37,8 @@ struct smc_ib_device { /* ib-device i
+ char mac[SMC_MAX_PORTS][6]; /* mac address per port*/
+ union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
+ u8 initialized : 1; /* ib dev CQ, evthdl done */
++ struct work_struct port_event_work;
++ unsigned long port_event_mask;
+ };
+
+ struct smc_sock;
+@@ -48,9 +51,16 @@ bool smc_ib_port_active(struct smc_ib_de
+ int smc_ib_remember_port_attr(struct smc_ib_device *, u8);
+ int smc_ib_buf_map(struct smc_ib_device *, int, struct smc_buf_desc *,
+ enum dma_data_direction);
++void smc_ib_buf_unmap(struct smc_ib_device *, int, struct smc_buf_desc *,
++ enum dma_data_direction);
+ void smc_ib_dealloc_protection_domain(struct smc_link *);
+ int smc_ib_create_protection_domain(struct smc_link *);
+ void smc_ib_destroy_queue_pair(struct smc_link *);
+ int smc_ib_create_queue_pair(struct smc_link *);
++void smc_ib_dereg_memory_region(struct ib_mr *);
++int smc_ib_get_memory_region(struct ib_pd *, int, struct ib_mr **);
++int smc_ib_ready_link(struct smc_link *);
++int smc_ib_modify_qp_rts(struct smc_link *);
++long smc_ib_setup_per_ibdev(struct smc_ib_device *);
+
+ #endif
+--- a/net/smc/smc_pnet.c
++++ b/net/smc/smc_pnet.c
+@@ -223,6 +223,9 @@ out:
+ pnetelem->ib_port);
+ if (rc)
+ return rc;
++ rc = smc_ib_setup_per_ibdev(smcibdev);
++ if (rc)
++ return rc;
+ smcibdev->initialized = 1;
+ }
+ return rc;
+--- a/net/smc/smc_wr.c
++++ b/net/smc/smc_wr.c
+@@ -392,6 +392,7 @@ static void smc_wr_init_sge(struct smc_l
+ lnk->wr_tx_sges[i].addr =
+ lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
++ lnk->wr_tx_sges[i].lkey = lnk->mr_tx->lkey;
+ lnk->wr_tx_ibs[i].next = NULL;
+ lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
+ lnk->wr_tx_ibs[i].num_sge = 1;
+@@ -403,6 +404,7 @@ static void smc_wr_init_sge(struct smc_l
+ lnk->wr_rx_sges[i].addr =
+ lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
++ lnk->wr_rx_sges[i].lkey = lnk->mr_tx->lkey;
+ lnk->wr_rx_ibs[i].next = NULL;
+ lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
+ lnk->wr_rx_ibs[i].num_sge = 1;
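A note on the rtokens bookkeeping added to struct smc_link_group above: rtokens_used_mask is an ordinary kernel bitmap sized via BITS_TO_LONGS(SMC_RMBS_PER_LGR_MAX), so claiming a free remote-token slot reduces to a test_and_set_bit() scan. The allocation helper itself is not part of the hunks shown here; the following is only a sketch of what such a helper could look like, and the name smc_rtoken_alloc_slot is illustrative, not taken from the patch:

    /* Illustrative only: claim a free remote-token slot in the link group.
     * Relies on rtokens_used_mask being a BITS_TO_LONGS()-sized bitmap.
     */
    static int smc_rtoken_alloc_slot(struct smc_link_group *lgr)
    {
            int i;

            for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
                    if (!test_and_set_bit(i, lgr->rtokens_used_mask))
                            return i;       /* slot i is now marked used */
            }
            return -ENOSPC;                 /* all rtoken elements in use */
    }

Once a slot is claimed, rtokens[slot] carries the peer's RMB address/rkey pair per link, which is what the newly declared smc_rmb_rtoken_handling() fills in from the CLC accept/confirm message.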
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-10.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-10.patch
new file mode 100644
index 0000000000..7f6a231e1e
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-10.patch
@@ -0,0 +1,477 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: link layer control (LLC)
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: link layer control (LLC)
+
+    send and receive LLC CONFIRM_LINK messages (via IB message send and CQE)
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/Makefile | 2
+ net/smc/af_smc.c | 94 ++++++++++++++++++++++++++++++++
+ net/smc/smc_clc.h | 2
+ net/smc/smc_core.c | 9 +++
+ net/smc/smc_core.h | 5 +
+ net/smc/smc_llc.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_llc.h | 63 +++++++++++++++++++++
+ 7 files changed, 324 insertions(+), 3 deletions(-)
+
+--- a/net/smc/Makefile
++++ b/net/smc/Makefile
+@@ -1,2 +1,2 @@
+ obj-$(CONFIG_SMC) += smc.o
+-smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o
++smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -30,6 +30,7 @@
+
+ #include "smc.h"
+ #include "smc_clc.h"
++#include "smc_llc.h"
+ #include "smc_core.h"
+ #include "smc_ib.h"
+ #include "smc_pnet.h"
+@@ -262,6 +263,41 @@ out:
+ return rc;
+ }
+
++static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
++{
++ struct smc_link_group *lgr = smc->conn.lgr;
++ struct smc_link *link;
++ int rest;
++ int rc;
++
++ link = &lgr->lnk[SMC_SINGLE_LINK];
++ /* receive CONFIRM LINK request from server over RoCE fabric */
++ rest = wait_for_completion_interruptible_timeout(
++ &link->llc_confirm,
++ SMC_LLC_WAIT_FIRST_TIME);
++ if (rest <= 0) {
++ struct smc_clc_msg_decline dclc;
++
++ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
++ SMC_CLC_DECLINE);
++ return rc;
++ }
++
++ rc = smc_ib_modify_qp_rts(link);
++ if (rc)
++ return SMC_CLC_DECL_INTERR;
++
++ smc_wr_remember_qp_attr(link);
++ /* send CONFIRM LINK response over RoCE fabric */
++ rc = smc_llc_send_confirm_link(link,
++ link->smcibdev->mac[link->ibport - 1],
++ gid, SMC_LLC_RESP);
++ if (rc < 0)
++ return SMC_CLC_DECL_TCL;
++
++ return rc;
++}
++
+ static void smc_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+ {
+@@ -379,7 +415,17 @@ static int smc_connect_rdma(struct smc_s
+ if (rc)
+ goto out_err_unlock;
+
+- /* tbd in follow-on patch: llc_confirm */
++ if (local_contact == SMC_FIRST_CONTACT) {
++ /* QP confirmation over RoCE fabric */
++ reason_code = smc_clnt_conf_first_link(
++ smc, &smcibdev->gid[ibport - 1]);
++ if (reason_code < 0) {
++ rc = reason_code;
++ goto out_err_unlock;
++ }
++ if (reason_code > 0)
++ goto decline_rdma_unlock;
++ }
+
+ mutex_unlock(&smc_create_lgr_pending);
+ out_connected:
+@@ -561,6 +607,36 @@ static void smc_destruct_non_accepted(st
+ sock_put(sk);
+ }
+
++static int smc_serv_conf_first_link(struct smc_sock *smc)
++{
++ struct smc_link_group *lgr = smc->conn.lgr;
++ struct smc_link *link;
++ int rest;
++ int rc;
++
++ link = &lgr->lnk[SMC_SINGLE_LINK];
++ /* send CONFIRM LINK request to client over the RoCE fabric */
++ rc = smc_llc_send_confirm_link(link,
++ link->smcibdev->mac[link->ibport - 1],
++ &link->smcibdev->gid[link->ibport - 1],
++ SMC_LLC_REQ);
++ if (rc < 0)
++ return SMC_CLC_DECL_TCL;
++
++ /* receive CONFIRM LINK response from client over the RoCE fabric */
++ rest = wait_for_completion_interruptible_timeout(
++ &link->llc_confirm_resp,
++ SMC_LLC_WAIT_FIRST_TIME);
++ if (rest <= 0) {
++ struct smc_clc_msg_decline dclc;
++
++ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
++ SMC_CLC_DECLINE);
++ }
++
++ return rc;
++}
++
+ /* setup for RDMA connection of server */
+ static void smc_listen_worker(struct work_struct *work)
+ {
+@@ -673,13 +749,21 @@ static void smc_listen_worker(struct wor
+ goto decline_rdma;
+ }
+
+- /* tbd in follow-on patch: modify_qp, llc_confirm */
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_ib_ready_link(link);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma;
+ }
++ /* QP confirmation over RoCE fabric */
++ reason_code = smc_serv_conf_first_link(new_smc);
++ if (reason_code < 0) {
++ /* peer is not aware of a problem */
++ rc = reason_code;
++ goto out_err;
++ }
++ if (reason_code > 0)
++ goto decline_rdma;
+ }
+
+ out_connected:
+@@ -1130,6 +1214,12 @@ static int __init smc_init(void)
+ if (rc)
+ return rc;
+
++ rc = smc_llc_init();
++ if (rc) {
++ pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
++ goto out_pnet;
++ }
++
+ rc = proto_register(&smc_proto, 1);
+ if (rc) {
+ pr_err("%s: proto_register fails with %d\n", __func__, rc);
+--- a/net/smc/smc_clc.h
++++ b/net/smc/smc_clc.h
+@@ -33,6 +33,8 @@ static const char SMC_EYECATCHER[4] = {'
+ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
+ #define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
+ #define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
++#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */
++#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */
+
+ struct smc_clc_msg_hdr { /* header1 of clc messages */
+ u8 eyecatcher[4]; /* eye catcher */
+--- a/net/smc/smc_core.c
++++ b/net/smc/smc_core.c
+@@ -20,6 +20,11 @@
+ #include "smc_core.h"
+ #include "smc_ib.h"
+ #include "smc_wr.h"
++#include "smc_llc.h"
++
++#define SMC_LGR_NUM_INCR 256
++
++static u32 smc_lgr_num; /* unique link group number */
+
+ /* Register connection's alert token in our lookup structure.
+ * To use rbtrees we have to implement our own insert core.
+@@ -107,6 +112,8 @@ static int smc_lgr_create(struct smc_soc
+ INIT_LIST_HEAD(&lgr->sndbufs[i]);
+ INIT_LIST_HEAD(&lgr->rmbs[i]);
+ }
++ smc_lgr_num += SMC_LGR_NUM_INCR;
++ lgr->id = smc_lgr_num;
+
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ /* initialize link */
+@@ -132,6 +139,8 @@ static int smc_lgr_create(struct smc_soc
+ rc = smc_wr_create_lgr(lnk);
+ if (rc)
+ goto destroy_qp;
++ init_completion(&lnk->llc_confirm);
++ init_completion(&lnk->llc_confirm_resp);
+
+ smc->conn.lgr = lgr;
+ rwlock_init(&lgr->conns_lock);
+--- a/net/smc/smc_core.h
++++ b/net/smc/smc_core.h
+@@ -76,6 +76,9 @@ struct smc_link {
+ u32 peer_psn; /* QP rx initial packet seqno */
+ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
+ u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/
++ u8 link_id; /* unique # within link group */
++ struct completion llc_confirm; /* wait for rx of conf link */
++ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
+ };
+
+ /* For now we just allow one parallel link per link group. The SMC protocol
+@@ -126,6 +129,8 @@ struct smc_link_group {
+ unsigned long rtokens_used_mask[BITS_TO_LONGS(
+ SMC_RMBS_PER_LGR_MAX)];
+ /* used rtoken elements */
++
++ u32 id; /* unique lgr id */
+ };
+
+ /* Find the connection associated with the given alert token in the link group.
+--- /dev/null
++++ b/net/smc/smc_llc.c
+@@ -0,0 +1,152 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Link Layer Control (LLC)
++ *
++ * For now, we only support the necessary "confirm link" functionality
++ * which happens for the first RoCE link after successful CLC handshake.
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
++ * Ursula Braun <ursula.braun@de.ibm.com>
++ */
++
++#include <net/tcp.h>
++#include <rdma/ib_verbs.h>
++
++#include "smc.h"
++#include "smc_core.h"
++#include "smc_clc.h"
++#include "smc_llc.h"
++
++struct smc_llc_tx_pend {
++};
++
++/* handler for send/transmission completion of an LLC msg */
++static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
++ struct smc_link *link,
++ enum ib_wc_status wc_status)
++{
++ /* future work: handle wc_status error for recovery and failover */
++}
++
++/**
++ * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
++ * @link: Pointer to SMC link used for sending LLC control message.
++ * @wr_buf: Out variable returning pointer to work request payload buffer.
++ * @pend: Out variable returning pointer to private pending WR tracking.
++ * It's the context the transmit complete handler will get.
++ *
++ * Reserves and pre-fills an entry for a pending work request send/tx.
++ * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx.
++ * Can sleep due to smc_get_ctrl_buf (if not in softirq context).
++ *
++ * Return: 0 on success, otherwise an error value.
++ */
++static int smc_llc_add_pending_send(struct smc_link *link,
++ struct smc_wr_buf **wr_buf,
++ struct smc_wr_tx_pend_priv **pend)
++{
++ int rc;
++
++ rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend);
++ if (rc < 0)
++ return rc;
++ BUILD_BUG_ON_MSG(
++ sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
++ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
++ BUILD_BUG_ON_MSG(
++ sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
++ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
++ BUILD_BUG_ON_MSG(
++ sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
++ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
++ return 0;
++}
++
++/* high-level API to send LLC confirm link */
++int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
++ union ib_gid *gid,
++ enum smc_llc_reqresp reqresp)
++{
++ struct smc_link_group *lgr = container_of(link, struct smc_link_group,
++ lnk[SMC_SINGLE_LINK]);
++ struct smc_llc_msg_confirm_link *confllc;
++ struct smc_wr_tx_pend_priv *pend;
++ struct smc_wr_buf *wr_buf;
++ int rc;
++
++ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
++ if (rc)
++ return rc;
++ confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
++ memset(confllc, 0, sizeof(*confllc));
++ confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
++ confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
++ if (reqresp == SMC_LLC_RESP)
++ confllc->hd.flags |= SMC_LLC_FLAG_RESP;
++ memcpy(confllc->sender_mac, mac, ETH_ALEN);
++ memcpy(&confllc->sender_gid, gid, SMC_GID_SIZE);
++ hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
++ /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */
++ confllc->link_uid = htonl(lgr->id);
++ confllc->max_links = SMC_LINKS_PER_LGR_MAX;
++ /* send llc message */
++ rc = smc_wr_tx_send(link, NULL, pend);
++ return rc;
++}
++
++static void smc_llc_rx_confirm_link(struct smc_link *link,
++ struct smc_llc_msg_confirm_link *llc)
++{
++ struct smc_link_group *lgr;
++
++ lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
++ if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
++ if (lgr->role == SMC_SERV)
++ complete(&link->llc_confirm_resp);
++ } else {
++ if (lgr->role == SMC_CLNT) {
++ link->link_id = llc->link_num;
++ complete(&link->llc_confirm);
++ }
++ }
++}
++
++static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
++{
++ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
++ union smc_llc_msg *llc = buf;
++
++ if (wc->byte_len < sizeof(*llc))
++ return; /* short message */
++ if (llc->raw.hdr.length != sizeof(*llc))
++ return; /* invalid message */
++ if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK)
++ smc_llc_rx_confirm_link(link, &llc->confirm_link);
++}
++
++static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
++ {
++ .handler = smc_llc_rx_handler,
++ .type = SMC_LLC_CONFIRM_LINK
++ },
++ {
++ .handler = NULL,
++ }
++};
++
++int __init smc_llc_init(void)
++{
++ struct smc_wr_rx_handler *handler;
++ int rc = 0;
++
++ for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
++ INIT_HLIST_NODE(&handler->list);
++ rc = smc_wr_rx_register_handler(handler);
++ if (rc)
++ break;
++ }
++ return rc;
++}
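One helper used above but defined outside these hunks: smc_llc_send_confirm_link() calls hton24() to squeeze the 32-bit QP number into the 3-byte sender_qp_num field. As a hedged sketch (the in-tree definition lives elsewhere in the series, in smc.h, and is not quoted here), a 24-bit host-to-network conversion can be written as:

    /* sketch: convert to big-endian, keep the three low-order bytes */
    static inline void hton24(u8 *net, u32 host)
    {
            __be32 t = cpu_to_be32(host);

            memcpy(net, ((u8 *)&t) + 1, 3);
    }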
+--- /dev/null
++++ b/net/smc/smc_llc.h
+@@ -0,0 +1,63 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Definitions for LLC (link layer control) message handling
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
++ * Ursula Braun <ursula.braun@de.ibm.com>
++ */
++
++#ifndef SMC_LLC_H
++#define SMC_LLC_H
++
++#include "smc_wr.h"
++
++#define SMC_LLC_FLAG_RESP 0x80
++
++#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
++
++enum smc_llc_reqresp {
++ SMC_LLC_REQ,
++ SMC_LLC_RESP
++};
++
++enum smc_llc_msg_type {
++ SMC_LLC_CONFIRM_LINK = 0x01,
++};
++
++#define SMC_LLC_DATA_LEN 40
++
++struct smc_llc_hdr {
++ struct smc_wr_rx_hdr common;
++ u8 length; /* 44 */
++ u8 reserved;
++ u8 flags;
++} __packed;
++
++struct smc_llc_msg_confirm_link { /* type 0x01 */
++ struct smc_llc_hdr hd;
++ u8 sender_mac[ETH_ALEN];
++ union ib_gid sender_gid;
++ u8 sender_qp_num[3];
++ u8 link_num;
++ __be32 link_uid;
++ u8 max_links;
++ u8 reserved[9];
++} __packed;
++
++union smc_llc_msg {
++ struct smc_llc_msg_confirm_link confirm_link;
++ struct {
++ struct smc_llc_hdr hdr;
++ u8 data[SMC_LLC_DATA_LEN];
++ } __packed raw;
++} __packed;
++
++/* transmit */
++int smc_llc_send_confirm_link(struct smc_link *, u8 *, union ib_gid *,
++ enum smc_llc_reqresp);
++int smc_llc_init(void) __init;
++
++#endif /* SMC_LLC_H */
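Sanity check on the layout above: the hd.length comment says 44, and the fields do add up. The 4-byte smc_llc_hdr (type, length, reserved, flags) plus 6 (MAC) + 16 (GID) + 3 (QP number) + 1 (link_num) + 4 (link_uid) + 1 (max_links) + 9 (reserved) gives 44 bytes, the same as the header plus SMC_LLC_DATA_LEN (40) in the raw overlay. The BUILD_BUG_ON_MSG checks in smc_llc_add_pending_send() already pin this size against SMC_WR_TX_SIZE at compile time; an equivalent standalone assertion (illustrative, placed inside any function, not part of the patch) would be:

    /* illustrative compile-time check of the 44-byte wire size */
    BUILD_BUG_ON(sizeof(union smc_llc_msg) !=
                 sizeof(struct smc_llc_hdr) + SMC_LLC_DATA_LEN);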
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-11.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-11.patch
new file mode 100644
index 0000000000..664efaefcd
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-11.patch
@@ -0,0 +1,627 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: connection data control (CDC)
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: connection data control (CDC)
+
+ send and receive CDC messages (via IB message send and CQE)
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/Makefile | 2
+ net/smc/af_smc.c | 8 +
+ net/smc/smc.h | 98 ++++++++++++++++++++++
+ net/smc/smc_cdc.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_cdc.h | 160 +++++++++++++++++++++++++++++++++++++
+ net/smc/smc_core.c | 5 +
+ 6 files changed, 499 insertions(+), 1 deletion(-)
+
+--- a/net/smc/Makefile
++++ b/net/smc/Makefile
+@@ -1,2 +1,2 @@
+ obj-$(CONFIG_SMC) += smc.o
+-smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
++smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc_cdc.o
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -31,6 +31,7 @@
+ #include "smc.h"
+ #include "smc_clc.h"
+ #include "smc_llc.h"
++#include "smc_cdc.h"
+ #include "smc_core.h"
+ #include "smc_ib.h"
+ #include "smc_pnet.h"
+@@ -302,6 +303,7 @@ static void smc_conn_save_peer_info(stru
+ struct smc_clc_msg_accept_confirm *clc)
+ {
+ smc->conn.peer_conn_idx = clc->conn_idx;
++ smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+ smc->conn.peer_rmbe_len = smc_uncompress_bufsize(clc->rmbe_size);
+ atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_len);
+ }
+@@ -1220,6 +1222,12 @@ static int __init smc_init(void)
+ goto out_pnet;
+ }
+
++ rc = smc_cdc_init();
++ if (rc) {
++ pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
++ goto out_pnet;
++ }
++
+ rc = proto_register(&smc_proto, 1);
+ if (rc) {
+ pr_err("%s: proto_register fails with %d\n", __func__, rc);
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -18,6 +18,15 @@
+
+ #define SMCPROTO_SMC 0 /* SMC protocol */
+
++#define smc_stop_received(conn) \
++ (conn->local_rx_ctrl.conn_state_flags.sending_done || \
++ conn->local_rx_ctrl.conn_state_flags.abnormal_close || \
++ conn->local_rx_ctrl.conn_state_flags.closed_conn)
++
++#define smc_close_received(conn) \
++ (conn->local_rx_ctrl.conn_state_flags.abnormal_close || \
++ conn->local_rx_ctrl.conn_state_flags.closed_conn)
++
+ enum smc_state { /* possible states of an SMC socket */
+ SMC_ACTIVE = 1,
+ SMC_INIT = 2,
+@@ -32,6 +41,66 @@ struct smc_wr_rx_hdr { /* common prefix
+ u8 type;
+ } __packed;
+
++struct smc_cdc_conn_state_flags {
++#if defined(__BIG_ENDIAN_BITFIELD)
++ u8 sending_done : 1; /* Sending done indicator */
++ u8 closed_conn : 1; /* Peer connection closed indicator */
++ u8 abnormal_close : 1; /* Abnormal close indicator */
++ u8 reserved : 5;
++#elif defined(__LITTLE_ENDIAN_BITFIELD)
++ u8 reserved : 5;
++ u8 abnormal_close : 1;
++ u8 closed_conn : 1;
++ u8 sending_done : 1;
++#endif
++} __packed;
++
++struct smc_cdc_producer_flags {
++#if defined(__BIG_ENDIAN_BITFIELD)
++ u8 write_blocked : 1; /* Writing Blocked, no rx buf space */
++ u8 urg_data_pending : 1; /* Urgent Data Pending */
++ u8 urg_data_present : 1; /* Urgent Data Present */
++ u8 cons_curs_upd_req : 1; /* cursor update requested */
++ u8 failover_validation : 1;/* message replay due to failover */
++ u8 reserved : 3;
++#elif defined(__LITTLE_ENDIAN_BITFIELD)
++ u8 reserved : 3;
++ u8 failover_validation : 1;
++ u8 cons_curs_upd_req : 1;
++ u8 urg_data_present : 1;
++ u8 urg_data_pending : 1;
++ u8 write_blocked : 1;
++#endif
++} __packed;
++
++/* in host byte order */
++struct smc_host_cursor { /* SMC cursor - an offset in an RMBE */
++ u16 reserved;
++ u16 wrap; /* window wrap sequence number */
++ u32 count; /* cursor (= offset) part */
++} __aligned(8);
++
++/* in host byte order */
++union smc_host_cursor_ovl { /* overlay for atomic cursor handling */
++ struct smc_host_cursor curs;
++ u64 acurs;
++} __aligned(8);
++
++/* in host byte order, except for flag bitfields in network byte order */
++struct smc_host_cdc_msg { /* Connection Data Control message */
++ struct smc_wr_rx_hdr common; /* .type = 0xFE */
++ u8 len; /* length = 44 */
++ u16 seqno; /* connection seq # */
++ u32 token; /* alert_token */
++ union smc_host_cursor_ovl prod; /* producer cursor */
++ union smc_host_cursor_ovl cons; /* consumer cursor,
++ * piggy backed "ack"
++ */
++ struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */
++ struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/
++ u8 reserved[18];
++} __packed __aligned(8);
++
+ struct smc_connection {
+ struct rb_node alert_node;
+ struct smc_link_group *lgr; /* link group of connection */
+@@ -48,6 +117,35 @@ struct smc_connection {
+ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
+ int rmbe_size; /* RMBE size <== sock rmem */
+ int rmbe_size_short;/* compressed notation */
++
++ struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
++ * buffer for CDC msg send
++ * .prod cf. TCP snd_nxt
++ * .cons cf. TCP sends ack
++ */
++ union smc_host_cursor_ovl tx_curs_prep; /* tx - prepared data
++ * snd_max..wmem_alloc
++ */
++ union smc_host_cursor_ovl tx_curs_sent; /* tx - sent data
++ * snd_nxt ?
++ */
++ union smc_host_cursor_ovl tx_curs_fin; /* tx - confirmed by peer
++ * snd-wnd-begin ?
++ */
++ atomic_t sndbuf_space; /* remaining space in sndbuf */
++ u16 tx_cdc_seq; /* sequence # for CDC send */
++ spinlock_t send_lock; /* protect wr_sends */
++
++ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
++ * .prod cf. TCP rcv_nxt
++ * .cons cf. TCP snd_una
++ */
++ union smc_host_cursor_ovl rx_curs_confirmed; /* confirmed to peer
++ * source of snd_una ?
++ */
++ atomic_t bytes_to_rcv; /* arrived data,
++ * not yet received
++ */
+ };
+
+ struct smc_sock { /* smc sock container */
+--- /dev/null
++++ b/net/smc/smc_cdc.c
+@@ -0,0 +1,227 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Connection Data Control (CDC)
++ * handles flow control
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
++ */
++
++#include <linux/spinlock.h>
++
++#include "smc.h"
++#include "smc_wr.h"
++#include "smc_cdc.h"
++
++struct smc_cdc_tx_pend {
++ struct smc_connection *conn; /* socket connection */
++ union smc_host_cursor_ovl cursor; /* tx sndbuf cursor sent */
++ union smc_host_cursor_ovl p_cursor; /* rx RMBE cursor produced */
++ u16 ctrl_seq; /* conn. tx sequence # */
++};
++
++/* handler for send/transmission completion of a CDC msg */
++static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
++ struct smc_link *link,
++ enum ib_wc_status wc_status)
++{
++ struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
++ struct smc_link_group *lgr;
++ struct smc_sock *smc;
++ int diff;
++
++ smc = container_of(cdcpend->conn, struct smc_sock, conn);
++ lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
++ bh_lock_sock(&smc->sk);
++ if (!wc_status) {
++ diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
++ &cdcpend->conn->tx_curs_fin,
++ &cdcpend->cursor);
++ smp_mb__before_atomic();
++ atomic_add(diff, &cdcpend->conn->sndbuf_space);
++ smp_mb__after_atomic();
++ xchg(&cdcpend->conn->tx_curs_fin.acurs,
++ cdcpend->cursor.acurs);
++ }
++ /* subsequent patch: wake if send buffer space available */
++ bh_unlock_sock(&smc->sk);
++}
++
++int smc_cdc_get_free_slot(struct smc_link *link,
++ struct smc_wr_buf **wr_buf,
++ struct smc_cdc_tx_pend **pend)
++{
++ return smc_wr_tx_get_free_slot(
++ link, smc_cdc_tx_handler, wr_buf,
++ (struct smc_wr_tx_pend_priv **)pend);
++}
++
++static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
++ struct smc_cdc_tx_pend *pend)
++{
++ BUILD_BUG_ON_MSG(
++ sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
++ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
++ BUILD_BUG_ON_MSG(
++ sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE,
++ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
++ BUILD_BUG_ON_MSG(
++ sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
++ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
++ pend->conn = conn;
++ pend->cursor.curs = conn->tx_curs_sent.curs;
++ pend->p_cursor.curs = conn->local_tx_ctrl.prod.curs;
++ pend->ctrl_seq = conn->tx_cdc_seq;
++}
++
++int smc_cdc_msg_send(struct smc_connection *conn,
++ struct smc_wr_buf *wr_buf,
++ struct smc_cdc_tx_pend *pend)
++{
++ struct smc_link *link;
++ int rc;
++
++ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
++
++ smc_cdc_add_pending_send(conn, pend);
++
++ conn->tx_cdc_seq++;
++ conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
++ smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, &conn->local_tx_ctrl);
++ rc = smc_wr_tx_send(link, conn, (struct smc_wr_tx_pend_priv *)pend);
++ if (rc)
++ goto out;
++ xchg(&conn->rx_curs_confirmed.acurs,
++ smc_curs_read(conn->local_tx_ctrl.cons.acurs));
++
++out:
++ return rc;
++}
++
++static inline bool smc_cdc_before(u16 seq1, u16 seq2)
++{
++ return (s16)(seq1 - seq2) < 0;
++}
++
++static void smc_cdc_msg_recv_action(struct smc_sock *smc,
++ struct smc_link *link,
++ struct smc_cdc_msg *cdc)
++{
++ union smc_host_cursor_ovl cons_old, prod_old;
++ struct smc_connection *conn = &smc->conn;
++ int diff_cons, diff_prod;
++
++ if (!cdc->prod_flags.failover_validation) {
++ if (smc_cdc_before(ntohs(cdc->seqno),
++ conn->local_rx_ctrl.seqno))
++ /* received seqno is old */
++ return;
++ }
++ prod_old.acurs = smc_curs_read(conn->local_rx_ctrl.prod.acurs);
++ cons_old.acurs = smc_curs_read(conn->local_rx_ctrl.cons.acurs);
++ smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc);
++
++ diff_cons = smc_curs_diff(conn->peer_rmbe_len, &cons_old,
++ &conn->local_rx_ctrl.cons);
++ if (diff_cons) {
++ smp_mb__before_atomic();
++ atomic_add(diff_cons, &conn->peer_rmbe_space);
++ smp_mb__after_atomic();
++ }
++
++ diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
++ &conn->local_rx_ctrl.prod);
++ if (diff_prod) {
++ smp_mb__before_atomic();
++ atomic_add(diff_prod, &conn->bytes_to_rcv);
++ smp_mb__after_atomic();
++ }
++
++ if (conn->local_rx_ctrl.conn_state_flags.abnormal_close)
++ smc->sk.sk_err = ECONNRESET;
++ if (smc_stop_received(conn)) {
++ smc->sk.sk_shutdown |= RCV_SHUTDOWN;
++ sock_set_flag(&smc->sk, SOCK_DONE);
++
++ /* subsequent patch: terminate connection */
++ }
++
++ /* piggy backed tx info */
++ /* subsequent patch: wake receivers if receive buffer space available */
++
++ /* subsequent patch: trigger socket release if connection closed */
++
++ /* socket connected but not accepted */
++ if (!smc->sk.sk_socket)
++ return;
++
++ /* data available */
++ /* subsequent patch: send delayed ack, wake receivers */
++}
++
++/* called under tasklet context */
++static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
++ struct smc_link *link, u64 wr_id)
++{
++ struct smc_link_group *lgr = container_of(link, struct smc_link_group,
++ lnk[SMC_SINGLE_LINK]);
++ struct smc_connection *connection;
++ struct smc_sock *smc;
++
++ /* lookup connection */
++ read_lock_bh(&lgr->conns_lock);
++ connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
++ if (!connection) {
++ read_unlock_bh(&lgr->conns_lock);
++ return;
++ }
++ smc = container_of(connection, struct smc_sock, conn);
++ if (smc->sk.sk_state == SMC_DESTRUCT) {
++ read_unlock_bh(&lgr->conns_lock);
++ return;
++ }
++ sock_hold(&smc->sk);
++ read_unlock_bh(&lgr->conns_lock);
++ bh_lock_sock(&smc->sk);
++ smc_cdc_msg_recv_action(smc, link, cdc);
++ bh_unlock_sock(&smc->sk);
++ sock_put(&smc->sk); /* no free sk in softirq-context */
++}
++
++static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
++{
++ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
++ struct smc_cdc_msg *cdc = buf;
++
++ if (wc->byte_len < sizeof(*cdc))
++ return; /* short message */
++ if (cdc->len != sizeof(*cdc))
++ return; /* invalid message */
++ smc_cdc_msg_recv(cdc, link, wc->wr_id);
++}
++
++static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
++ {
++ .handler = smc_cdc_rx_handler,
++ .type = SMC_CDC_MSG_TYPE
++ },
++ {
++ .handler = NULL,
++ }
++};
++
++int __init smc_cdc_init(void)
++{
++ struct smc_wr_rx_handler *handler;
++ int rc = 0;
++
++ for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
++ INIT_HLIST_NODE(&handler->list);
++ rc = smc_wr_rx_register_handler(handler);
++ if (rc)
++ break;
++ }
++ return rc;
++}
+--- /dev/null
++++ b/net/smc/smc_cdc.h
+@@ -0,0 +1,160 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Connection Data Control (CDC)
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
++ */
++
++#ifndef SMC_CDC_H
++#define SMC_CDC_H
++
++#include <linux/kernel.h> /* max_t */
++#include <linux/compiler.h> /* __packed */
++#include <linux/atomic.h> /* xchg */
++
++#include "smc.h"
++#include "smc_core.h"
++#include "smc_wr.h"
++
++#define SMC_CDC_MSG_TYPE 0xFE
++
++/* in network byte order */
++struct smc_cdc_cursor { /* SMC cursor */
++ __be16 reserved;
++ __be16 wrap;
++ __be32 count;
++} __packed __aligned(8);
++
++/* in network byte order */
++union smc_cdc_cursor_ovl {
++ struct smc_cdc_cursor curs;
++ __be64 acurs;
++} __packed __aligned(8);
++
++/* in network byte order */
++struct smc_cdc_msg {
++ struct smc_wr_rx_hdr common; /* .type = 0xFE */
++ u8 len; /* 44 */
++ __be16 seqno;
++ __be32 token;
++ union smc_cdc_cursor_ovl prod;
++ union smc_cdc_cursor_ovl cons; /* piggy backed "ack" */
++ struct smc_cdc_producer_flags prod_flags;
++ struct smc_cdc_conn_state_flags conn_state_flags;
++ u8 reserved[18];
++} __packed;
++
++static inline void smc_curs_add(int size, struct smc_host_cursor *curs,
++ int value)
++{
++ curs->wrap += (curs->count + value) / size;
++ curs->count = (curs->count + value) % size;
++}
++
++static inline u64 smc_curs_read(u64 c)
++{
++#if BITS_PER_LONG != 64
++ /* We must enforce atomic readout on 32bit, otherwise the
++	 * update on another cpu can hit in between the readout of
++ * the low 32bit and the high 32bit portion.
++ */
++ return cmpxchg64(&c, 0, 0);
++#else
++ /* On 64 bit the cursor read is atomic versus the update */
++ return c;
++#endif
++}
++
++static inline __be64 smc_curs_read_net(__be64 c)
++{
++#if BITS_PER_LONG != 64
++ /* We must enforce atomic readout on 32bit, otherwise the
++	 * update on another cpu can hit in between the readout of
++ * the low 32bit and the high 32bit portion.
++ */
++ return cmpxchg64(&c, 0, 0);
++#else
++ /* On 64 bit the cursor read is atomic versus the update */
++ return c;
++#endif
++}
++
++/* calculate cursor difference between old and new, where old <= new */
++static inline int smc_curs_diff(unsigned int size,
++ union smc_host_cursor_ovl *old,
++ union smc_host_cursor_ovl *new)
++{
++ if (old->curs.wrap != new->curs.wrap)
++ return max_t(int, 0,
++ ((size - old->curs.count) + new->curs.count));
++
++ return max_t(int, 0, (new->curs.count - old->curs.count));
++}
++
++static inline void smc_host_cursor_to_cdc(struct smc_cdc_cursor *peer,
++ union smc_host_cursor_ovl *local)
++{
++ union smc_host_cursor_ovl temp;
++
++ temp.acurs = smc_curs_read(local->acurs);
++ peer->count = htonl(temp.curs.count);
++ peer->wrap = htons(temp.curs.wrap);
++ /* peer->reserved = htons(0); must be ensured by caller */
++}
++
++static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
++ struct smc_host_cdc_msg *local)
++{
++ peer->common.type = local->common.type;
++ peer->len = local->len;
++ peer->seqno = htons(local->seqno);
++ peer->token = htonl(local->token);
++ smc_host_cursor_to_cdc(&peer->prod.curs, &local->prod);
++ smc_host_cursor_to_cdc(&peer->cons.curs, &local->cons);
++ peer->prod_flags = local->prod_flags;
++ peer->conn_state_flags = local->conn_state_flags;
++}
++
++static inline void smc_cdc_cursor_to_host(union smc_host_cursor_ovl *local,
++ union smc_cdc_cursor_ovl *peer)
++{
++ union smc_host_cursor_ovl temp, old;
++ union smc_cdc_cursor_ovl net;
++
++ old.acurs = smc_curs_read(local->acurs);
++ net.acurs = smc_curs_read_net(peer->acurs);
++ temp.curs.count = ntohl(net.curs.count);
++ temp.curs.wrap = ntohs(net.curs.wrap);
++ if ((old.curs.wrap > temp.curs.wrap) && temp.curs.wrap)
++ return;
++ if ((old.curs.wrap == temp.curs.wrap) &&
++ (old.curs.count > temp.curs.count))
++ return;
++ xchg(&local->acurs, temp.acurs);
++}
++
++static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
++ struct smc_cdc_msg *peer)
++{
++ local->common.type = peer->common.type;
++ local->len = peer->len;
++ local->seqno = ntohs(peer->seqno);
++ local->token = ntohl(peer->token);
++ smc_cdc_cursor_to_host(&local->prod, &peer->prod);
++ smc_cdc_cursor_to_host(&local->cons, &peer->cons);
++ local->prod_flags = peer->prod_flags;
++ local->conn_state_flags = peer->conn_state_flags;
++}
++
++struct smc_cdc_tx_pend;
++
++int smc_cdc_get_free_slot(struct smc_link *, struct smc_wr_buf **,
++ struct smc_cdc_tx_pend **);
++int smc_cdc_msg_send(struct smc_connection *, struct smc_wr_buf *,
++ struct smc_cdc_tx_pend *);
++int smc_cdc_init(void) __init;
++
++#endif /* SMC_CDC_H */
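The cursor helpers above implement wrap-aware ring arithmetic: smc_curs_add() advances (wrap, count) modulo the buffer size, and smc_curs_diff() measures how far a newer cursor is ahead of an older one even across a wrap boundary. A standalone illustration of the same math (plain C, assumed names, not part of the patch):

    #include <stdio.h>

    struct curs { unsigned short wrap; unsigned int count; };

    /* mirrors smc_curs_diff() for the documented case old <= new */
    static int curs_diff(unsigned int size, struct curs old, struct curs cur)
    {
            if (old.wrap != cur.wrap)
                    return (size - old.count) + cur.count;
            return cur.count - old.count;
    }

    int main(void)
    {
            struct curs old = { .wrap = 3, .count = 16000 };
            struct curs cur = { .wrap = 4, .count = 200 };

            /* prints 584: (16384 - 16000) + 200 bytes were produced
             * between the two snapshots of a 16 KB ring
             */
            printf("%d\n", curs_diff(16384, old, cur));
            return 0;
    }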
+--- a/net/smc/smc_core.c
++++ b/net/smc/smc_core.c
+@@ -21,6 +21,7 @@
+ #include "smc_ib.h"
+ #include "smc_wr.h"
+ #include "smc_llc.h"
++#include "smc_cdc.h"
+
+ #define SMC_LGR_NUM_INCR 256
+
+@@ -347,6 +348,8 @@ int smc_conn_create(struct smc_sock *smc
+ smc_lgr_register_conn(conn); /* add smc conn to lgr */
+ rc = smc_link_determine_gid(conn->lgr);
+ }
++ conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
++ conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg);
+
+ out:
+ return rc ? rc : local_contact;
+@@ -446,6 +449,7 @@ int smc_sndbuf_create(struct smc_sock *s
+ conn->sndbuf_desc = sndbuf_desc;
+ conn->sndbuf_size = tmp_bufsize;
+ smc->sk.sk_sndbuf = tmp_bufsize * 2;
++ atomic_set(&conn->sndbuf_space, tmp_bufsize);
+ return 0;
+ } else {
+ return -ENOMEM;
+@@ -521,6 +525,7 @@ int smc_rmb_create(struct smc_sock *smc)
+ conn->rmbe_size = tmp_bufsize;
+ conn->rmbe_size_short = tmp_bufsize_short;
+ smc->sk.sk_rcvbuf = tmp_bufsize * 2;
++ atomic_set(&conn->bytes_to_rcv, 0);
+ return 0;
+ } else {
+ return -ENOMEM;
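One detail of the CDC receive path above worth calling out: smc_cdc_msg_recv_action() discards stale messages using smc_cdc_before(), whose (s16)(seq1 - seq2) < 0 cast gives a serial-number comparison that stays correct when the 16-bit tx_cdc_seq counter wraps. A standalone check of that behaviour (illustrative, assumed names):

    #include <assert.h>
    #include <stdint.h>

    /* same idiom as smc_cdc_before() in smc_cdc.c above */
    static int cdc_before(uint16_t seq1, uint16_t seq2)
    {
            return (int16_t)(seq1 - seq2) < 0;
    }

    int main(void)
    {
            assert(cdc_before(1, 2));               /* plainly older */
            assert(cdc_before(0xfffe, 0x0003));     /* older across the wrap */
            assert(!cdc_before(0x0003, 0xfffe));    /* newer across the wrap */
            return 0;
    }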
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-12.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-12.patch
new file mode 100644
index 0000000000..72f6dcd7bf
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-12.patch
@@ -0,0 +1,601 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: send data (through RDMA)
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: send data (through RDMA)
+
+ copy data to kernel send buffer, and trigger RDMA write
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/Makefile | 2
+ net/smc/af_smc.c | 5
+ net/smc/smc.h | 1
+ net/smc/smc_cdc.c | 7
+ net/smc/smc_tx.c | 449 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_tx.h | 24 ++
+ 6 files changed, 484 insertions(+), 4 deletions(-)
+
+--- a/net/smc/Makefile
++++ b/net/smc/Makefile
+@@ -1,2 +1,2 @@
+ obj-$(CONFIG_SMC) += smc.o
+-smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc_cdc.o
++smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc_cdc.o smc_tx.o
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -35,6 +35,7 @@
+ #include "smc_core.h"
+ #include "smc_ib.h"
+ #include "smc_pnet.h"
++#include "smc_tx.h"
+
+ static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
+ * creation
+@@ -433,6 +434,7 @@ static int smc_connect_rdma(struct smc_s
+ out_connected:
+ smc_copy_sock_settings_to_clc(smc);
+ smc->sk.sk_state = SMC_ACTIVE;
++ smc_tx_init(smc);
+
+ return rc ? rc : local_contact;
+
+@@ -771,6 +773,7 @@ static void smc_listen_worker(struct wor
+ out_connected:
+ sk_refcnt_debug_inc(newsmcsk);
+ newsmcsk->sk_state = SMC_ACTIVE;
++ smc_tx_init(new_smc);
+ enqueue:
+ if (local_contact == SMC_FIRST_CONTACT)
+ mutex_unlock(&smc_create_lgr_pending);
+@@ -942,7 +945,7 @@ static int smc_sendmsg(struct socket *so
+ if (smc->use_fallback)
+ rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
+ else
+- rc = sock_no_sendmsg(sock, msg, len);
++ rc = smc_tx_sendmsg(smc, msg, len);
+ out:
+ release_sock(sk);
+ return rc;
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -135,6 +135,7 @@ struct smc_connection {
+ atomic_t sndbuf_space; /* remaining space in sndbuf */
+ u16 tx_cdc_seq; /* sequence # for CDC send */
+ spinlock_t send_lock; /* protect wr_sends */
++ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
+
+ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
+ * .prod cf. TCP rcv_nxt
+--- a/net/smc/smc_cdc.c
++++ b/net/smc/smc_cdc.c
+@@ -14,6 +14,7 @@
+ #include "smc.h"
+ #include "smc_wr.h"
+ #include "smc_cdc.h"
++#include "smc_tx.h"
+
+ struct smc_cdc_tx_pend {
+ struct smc_connection *conn; /* socket connection */
+@@ -45,7 +46,7 @@ static void smc_cdc_tx_handler(struct sm
+ xchg(&cdcpend->conn->tx_curs_fin.acurs,
+ cdcpend->cursor.acurs);
+ }
+- /* subsequent patch: wake if send buffer space available */
++ smc_tx_sndbuf_nonfull(smc);
+ bh_unlock_sock(&smc->sk);
+ }
+
+@@ -149,7 +150,9 @@ static void smc_cdc_msg_recv_action(stru
+ }
+
+ /* piggy backed tx info */
+- /* subsequent patch: wake receivers if receive buffer space available */
++ /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
++ if (diff_cons)
++ smc_tx_sndbuf_nonempty(conn);
+
+ /* subsequent patch: trigger socket release if connection closed */
+
+--- /dev/null
++++ b/net/smc/smc_tx.c
+@@ -0,0 +1,449 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Manage send buffer.
++ * Producer:
++ * Copy user space data into send buffer, if send buffer space available.
++ * Consumer:
++ * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
++ */
++
++#include <linux/net.h>
++#include <linux/rcupdate.h>
++#include <net/sock.h>
++
++#include "smc.h"
++#include "smc_wr.h"
++#include "smc_cdc.h"
++#include "smc_tx.h"
++
++/***************************** sndbuf producer *******************************/
++
++/* callback implementation for sk.sk_write_space()
++ * to wakeup sndbuf producers that blocked with smc_tx_wait_memory()
++ */
++static void smc_tx_write_space(struct sock *sk)
++{
++ struct socket *sock = sk->sk_socket;
++ struct smc_sock *smc = smc_sk(sk);
++ struct socket_wq *wq;
++
++ /* similar to sk_stream_write_space */
++ if (atomic_read(&smc->conn.sndbuf_space) && sock) {
++ clear_bit(SOCK_NOSPACE, &sock->flags);
++ rcu_read_lock();
++ wq = rcu_dereference(sk->sk_wq);
++ if (wq_has_sleeper(wq))
++ wake_up_interruptible_poll(&wq->wait,
++ POLLOUT | POLLWRNORM |
++ POLLWRBAND);
++ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
++ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
++ rcu_read_unlock();
++ }
++}
++
++/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
++ * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
++ */
++void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
++{
++ if (smc->sk.sk_socket &&
++ atomic_read(&smc->conn.sndbuf_space) &&
++ test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
++ smc->sk.sk_write_space(&smc->sk);
++}
++
++/* sndbuf producer */
++static inline int smc_tx_give_up_send(struct smc_sock *smc, int copied)
++{
++ struct smc_connection *conn = &smc->conn;
++
++ if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
++ conn->local_tx_ctrl.conn_state_flags.abnormal_close)
++ return -EPIPE;
++ if (conn->local_rx_ctrl.conn_state_flags.abnormal_close ||
++ conn->local_rx_ctrl.conn_state_flags.closed_conn)
++ return copied ? copied : -ECONNRESET;
++ return 0;
++}
++
++/* blocks sndbuf producer until at least one byte of free space available */
++static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
++{
++ struct smc_connection *conn = &smc->conn;
++ struct sock *sk = &smc->sk;
++ DEFINE_WAIT(wait);
++ long timeo;
++ int rc = 0;
++
++ /* similar to sk_stream_wait_memory */
++ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
++ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
++ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) ||
++ conn->local_tx_ctrl.conn_state_flags.sending_done) {
++ rc = -EPIPE;
++ goto out;
++ }
++ if (conn->local_rx_ctrl.conn_state_flags.abnormal_close) {
++ rc = -ECONNRESET;
++ goto out;
++ }
++ if (!timeo) {
++ rc = -EAGAIN;
++ goto out;
++ }
++ if (signal_pending(current)) {
++ rc = -EINTR;
++ goto out;
++ }
++ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
++ if (atomic_read(&conn->sndbuf_space))
++ goto out;
++ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
++ rc = sk_wait_event(sk, &timeo,
++ sk->sk_err ||
++ (sk->sk_shutdown & SEND_SHUTDOWN) ||
++ smc_stop_received(conn) ||
++ atomic_read(&conn->sndbuf_space));
++ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
++ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) ||
++ conn->local_tx_ctrl.conn_state_flags.sending_done) {
++ rc = -EPIPE;
++ }
++ if (conn->local_rx_ctrl.conn_state_flags.abnormal_close)
++ rc = -ECONNRESET;
++out:
++ finish_wait(sk_sleep(sk), &wait);
++ return rc;
++}
++
++/* sndbuf producer: main API called by socket layer */
++int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
++{
++ size_t chunk_len, send_done = 0, send_remaining = len;
++ struct smc_connection *conn = &smc->conn;
++ union smc_host_cursor_ovl prep;
++ int tx_top, tx_bot;
++ char *sndbuf_base;
++ int tx_cnt_prep;
++ int writespace;
++ int rc;
++
++again:
++ if (smc->sk.sk_state == SMC_INIT)
++ return -ENOTCONN;
++ if (smc->sk.sk_state != SMC_ACTIVE)
++ return -ECONNRESET;
++ rc = smc_tx_give_up_send(smc, send_done);
++ if (rc)
++ return rc;
++ /* what to do in case of smc->sk.sk_err ??? */
++
++ writespace = atomic_read(&conn->sndbuf_space);
++ if (!writespace) {
++ int wait_rc;
++
++ wait_rc = smc_tx_wait_memory(smc, msg->msg_flags);
++ if (wait_rc < 0)
++ return ((send_done && (wait_rc == -EAGAIN))
++ ? send_done : wait_rc);
++ if (!wait_rc) {
++ rc = smc_tx_give_up_send(smc, send_done);
++ if (rc)
++ return rc;
++ if (msg->msg_flags & MSG_DONTWAIT)
++ return send_done ? send_done : -EAGAIN;
++ if (smc->sk.sk_err)
++ return send_done ? send_done : -EPIPE;
++ goto again;
++ }
++ }
++ if (smc->sk.sk_err)
++ return -EPIPE;
++ rc = smc_tx_give_up_send(smc, send_done);
++ if (rc)
++ return rc;
++
++ /* re-calc, could be just 1 byte after smc_tx_wait_memory above */
++ writespace = atomic_read(&conn->sndbuf_space);
++ chunk_len = min_t(size_t, send_remaining, writespace);
++ /* determine start of sndbuf */
++ prep.acurs = smc_curs_read(conn->tx_curs_prep.acurs);
++ tx_cnt_prep = prep.curs.count;
++ sndbuf_base = conn->sndbuf_desc->cpu_addr;
++ /* determine sndbuf chunks - top and bottom of sndbuf */
++ if (tx_cnt_prep + chunk_len <= conn->sndbuf_size) {
++ tx_top = 0;
++ tx_bot = chunk_len;
++ if (memcpy_from_msg(sndbuf_base + tx_cnt_prep, msg, chunk_len))
++ return -EFAULT;
++ } else {
++ tx_bot = conn->sndbuf_size - tx_cnt_prep;
++ tx_top = chunk_len - tx_bot;
++ if (memcpy_from_msg(sndbuf_base + tx_cnt_prep, msg, tx_bot))
++ return -EFAULT;
++ if (memcpy_from_msg(sndbuf_base, msg, tx_top))
++ return -EFAULT;
++ }
++ smc_curs_add(conn->sndbuf_size, &prep.curs, chunk_len);
++ xchg(&conn->tx_curs_prep.acurs, prep.acurs);
++ smp_mb__before_atomic();
++ atomic_sub(chunk_len, &conn->sndbuf_space);
++ smp_mb__after_atomic();
++
++ /* since we just produced more new data into sndbuf,
++ * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
++ */
++ rc = smc_tx_sndbuf_nonempty(conn);
++ if (rc)
++ return rc;
++
++ send_done += chunk_len;
++ send_remaining -= chunk_len;
++ if (send_done < len)
++ goto again;
++
++ return send_done;
++}
++
++/***************************** sndbuf consumer *******************************/
++
++/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
++static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
++ int num_sges, struct ib_sge sges[])
++{
++ struct smc_link_group *lgr = conn->lgr;
++ struct ib_send_wr *failed_wr = NULL;
++ struct ib_rdma_wr rdma_wr;
++ struct smc_link *link;
++ int i, rc;
++
++ memset(&rdma_wr, 0, sizeof(rdma_wr));
++ link = &lgr->lnk[SMC_SINGLE_LINK];
++ for (i = 0; i < num_sges; i++) {
++ sges[i].addr =
++ conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
++ sges[i].addr;
++ sges[i].lkey = link->mr_tx->lkey;
++ }
++ rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
++ rdma_wr.wr.sg_list = sges;
++ rdma_wr.wr.num_sge = num_sges;
++ rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
++ rdma_wr.remote_addr =
++ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
++ peer_rmbe_offset +
++ ((conn->peer_conn_idx - 1) * (conn->peer_rmbe_len));
++ rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
++ rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
++ if (rc)
++ conn->local_tx_ctrl.conn_state_flags.abnormal_close = 1;
++ return rc;
++}
++
++/* sndbuf consumer */
++static inline void smc_tx_fill_sges(int *num_sges, struct ib_sge sges[],
++ u64 sge_offset1, u32 sge_len1,
++ u64 sge_offset2, u32 sge_len2)
++{
++ memset(sges, 0, SMC_IB_MAX_SEND_SGE * sizeof(sges[0]));
++ sges[0].addr = sge_offset1;
++ sges[0].length = sge_len1;
++ if (sge_len2) {
++ *num_sges = 2;
++ sges[1].addr = sge_offset2;
++ sges[1].length = sge_len2;
++ } else {
++ *num_sges = 1;
++ }
++}
++
++/* sndbuf consumer */
++static inline void smc_tx_advance_cursors(struct smc_connection *conn,
++ union smc_host_cursor_ovl *prod,
++ union smc_host_cursor_ovl *sent,
++ size_t len)
++{
++ smc_curs_add(conn->peer_rmbe_len, &prod->curs, len);
++ smp_mb__before_atomic();
++ /* data in flight reduces usable snd_wnd */
++ atomic_sub(len, &conn->peer_rmbe_space);
++ smp_mb__after_atomic();
++ smc_curs_add(conn->sndbuf_size, &sent->curs, len);
++}
++
++/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
++ * usable snd_wnd as max transmit
++ */
++static int smc_tx_rdma_writes(struct smc_connection *conn)
++{
++ union smc_host_cursor_ovl sent, prep, prod, cons;
++ size_t to_copy, space1, space2, send_len;
++ struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
++ size_t tx_top1, tx_top2;
++ size_t tx_bot1, tx_bot2;
++ size_t tx_top, tx_bot;
++ int to_send, rmbespace;
++ int num_sges;
++ int rc;
++
++ sent.acurs = smc_curs_read(conn->tx_curs_sent.acurs);
++ prep.acurs = smc_curs_read(conn->tx_curs_prep.acurs);
++
++ /* cf. wmem_alloc - (snd_max - snd_una) */
++ to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
++ if (to_send <= 0)
++ return 0;
++
++ /* cf. snd_wnd */
++ rmbespace = atomic_read(&conn->peer_rmbe_space);
++ if (rmbespace <= 0)
++ return 0;
++
++ if (to_send >= rmbespace)
++ conn->local_tx_ctrl.prod_flags.write_blocked = 1;
++ else
++ conn->local_tx_ctrl.prod_flags.write_blocked = 0;
++
++ /* cf. usable snd_wnd */
++ to_copy = min(to_send, rmbespace);
++
++ if (sent.curs.count + to_copy <= conn->peer_rmbe_len) {
++ tx_top = 0;
++ tx_bot = to_copy;
++ } else {
++ tx_bot = conn->sndbuf_size - sent.curs.count;
++ tx_top = to_copy - tx_bot;
++ }
++ prod.acurs = smc_curs_read(conn->local_tx_ctrl.prod.acurs);
++ cons.acurs = smc_curs_read(conn->local_rx_ctrl.cons.acurs);
++ if (prod.curs.wrap == cons.curs.wrap) {
++ space1 = conn->peer_rmbe_len - prod.curs.count;
++ space2 = cons.curs.count;
++
++ send_len = min(to_copy, space1);
++ if (send_len <= tx_bot) {
++ tx_bot1 = send_len;
++ tx_bot2 = tx_bot - tx_bot1;
++ tx_top1 = 0;
++ tx_top2 = tx_top;
++ } else {
++ tx_bot1 = tx_bot;
++ tx_bot2 = 0;
++ tx_top1 = send_len - tx_bot;
++ tx_top2 = tx_top - tx_top1;
++ }
++ smc_tx_fill_sges(&num_sges, sges, sent.curs.count, tx_bot1, 0,
++ tx_top1);
++ rc = smc_tx_rdma_write(conn, prod.curs.count, num_sges, sges);
++ if (rc)
++ return rc;
++ to_copy -= send_len;
++ smc_tx_advance_cursors(conn, &prod, &sent, send_len);
++
++ if (to_copy && space2 && (tx_bot2 + tx_top2 > 0)) {
++ send_len = min(to_copy, space2);
++ if (tx_bot2 > send_len) {
++ tx_bot2 = send_len;
++ tx_top2 = 0;
++ } else {
++ if (tx_bot2 + tx_top2 > send_len)
++ tx_top2 = send_len - tx_bot2;
++ }
++ if (tx_bot2)
++ smc_tx_fill_sges(&num_sges, sges,
++ sent.curs.count,
++ tx_bot2, tx_top1, tx_top2);
++ else if (tx_top2)
++ smc_tx_fill_sges(&num_sges, sges, tx_top1,
++ tx_top2, 0, 0);
++ rc = smc_tx_rdma_write(conn, 0, num_sges, sges);
++ if (rc)
++ return rc;
++ smc_tx_advance_cursors(conn, &prod, &sent,
++ tx_bot2 + tx_top2);
++ }
++ } else {
++ space1 = cons.curs.count - prod.curs.count;
++ send_len = min(to_copy, space1);
++ if (send_len <= tx_bot) {
++ tx_bot = send_len;
++ tx_top = 0;
++ } else {
++ if ((send_len - tx_bot) <= tx_top)
++ tx_top = send_len - tx_bot;
++ }
++ smc_tx_fill_sges(&num_sges, sges, sent.curs.count, tx_bot, 0,
++ tx_top);
++ rc = smc_tx_rdma_write(conn, prod.curs.count, num_sges, sges);
++ if (rc)
++ return rc;
++ smc_tx_advance_cursors(conn, &prod, &sent, send_len);
++ }
++ xchg(&conn->local_tx_ctrl.prod.acurs, prod.acurs);
++ xchg(&conn->tx_curs_sent.acurs, sent.acurs);
++
++ return 0;
++}
++
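The function above performs the classic two-chunk copy on a circular buffer: bytes from the sent cursor up to the wrap point form a bottom chunk, the remainder restarts at offset 0 as a top chunk, and each chunk becomes one scatter/gather entry for the RDMA write. A minimal userspace sketch of that split, with made-up names rather than kernel API:

    #include <stddef.h>
    #include <string.h>

    /* copy `len` bytes starting at cursor `count` of a ring of `size`
     * bytes in at most two memcpy()s, mirroring the tx_bot/tx_top
     * split in smc_tx_rdma_writes()
     */
    static void ring_copy(char *dst, const char *ring, size_t size,
                          size_t count, size_t len)
    {
            size_t bot = (count + len <= size) ? len : size - count;
            size_t top = len - bot;               /* 0 when unwrapped */

            memcpy(dst, ring + count, bot);
            if (top)
                    memcpy(dst + bot, ring, top); /* wrap to offset 0 */
    }
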
++/* Wakeup sndbuf consumers from any context (IRQ or process)
++ * since there is more data to transmit; usable snd_wnd as max transmit
++ */
++int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
++{
++ struct smc_cdc_tx_pend *pend;
++ struct smc_wr_buf *wr_buf;
++ int rc;
++
++ spin_lock_bh(&conn->send_lock);
++ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
++ &pend);
++ if (rc < 0) {
++ schedule_delayed_work(&conn->tx_work, HZ / 10);
++ goto out_unlock;
++ }
++
++ rc = smc_tx_rdma_writes(conn);
++ if (rc) {
++ smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
++ (struct smc_wr_tx_pend_priv *)pend);
++ goto out_unlock;
++ }
++
++ rc = smc_cdc_msg_send(conn, wr_buf, pend);
++
++out_unlock:
++ spin_unlock_bh(&conn->send_lock);
++ return rc;
++}
++
++/* Wakeup sndbuf consumers from process context
++ * since there is more data to transmit
++ */
++static void smc_tx_worker(struct work_struct *work)
++{
++ struct smc_connection *conn = container_of(to_delayed_work(work),
++ struct smc_connection,
++ tx_work);
++ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
++
++ lock_sock(&smc->sk);
++ smc_tx_sndbuf_nonempty(conn);
++ release_sock(&smc->sk);
++}
++
++/***************************** send initialize *******************************/
++
++/* Initialize send properties on connection establishment. NB: not __init! */
++void smc_tx_init(struct smc_sock *smc)
++{
++ smc->sk.sk_write_space = smc_tx_write_space;
++ INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_worker);
++ spin_lock_init(&smc->conn.send_lock);
++}
+--- /dev/null
++++ b/net/smc/smc_tx.h
+@@ -0,0 +1,24 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Manage send buffer
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
++ */
++
++#ifndef SMC_TX_H
++#define SMC_TX_H
++
++#include <linux/socket.h>
++#include <linux/types.h>
++
++#include "smc.h"
++
++void smc_tx_init(struct smc_sock *);
++int smc_tx_sendmsg(struct smc_sock *, struct msghdr *, size_t);
++int smc_tx_sndbuf_nonempty(struct smc_connection *);
++void smc_tx_sndbuf_nonfull(struct smc_sock *);
++
++#endif /* SMC_TX_H */
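All window bookkeeping in this file rides on (wrap, count) cursor pairs manipulated by smc_curs_add() and smc_curs_diff(). As a hedged sketch of the arithmetic those helpers are assumed to implement (stand-in types and names; only a single wrap between the two cursors is handled):

    struct curs { unsigned short wrap; unsigned int count; };

    /* advance a cursor by len bytes within a ring of `size` bytes */
    static void curs_add(unsigned int size, struct curs *c, unsigned int len)
    {
            if (c->count + len < size) {
                    c->count += len;
            } else {
                    c->wrap++;            /* crossed the wrap point */
                    c->count = c->count + len - size;
            }
    }

    /* bytes from an older cursor to a newer one, honoring one wrap */
    static int curs_diff(unsigned int size, const struct curs *old,
                         const struct curs *cur)
    {
            if (old->wrap != cur->wrap)
                    return size - old->count + cur->count;
            return cur->count - old->count;
    }
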
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-13.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-13.patch
new file mode 100644
index 0000000000..e8f74484a0
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-13.patch
@@ -0,0 +1,398 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: receive data from RMBE
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: receive data from RMBE
+
+ move RMBE data into user space buffer and update managing cursors
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/Makefile | 2
+ net/smc/af_smc.c | 5 +
+ net/smc/smc.h | 4 +
+ net/smc/smc_cdc.c | 9 ++
+ net/smc/smc_core.c | 3
+ net/smc/smc_rx.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ net/smc/smc_rx.h | 23 ++++++
+ net/smc/smc_tx.c | 32 +++++++++
+ net/smc/smc_tx.h | 1
+ 9 files changed, 260 insertions(+), 3 deletions(-)
+
+--- a/net/smc/Makefile
++++ b/net/smc/Makefile
+@@ -1,2 +1,2 @@
+ obj-$(CONFIG_SMC) += smc.o
+-smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc_cdc.o smc_tx.o
++smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc_cdc.o smc_tx.o smc_rx.o
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -36,6 +36,7 @@
+ #include "smc_ib.h"
+ #include "smc_pnet.h"
+ #include "smc_tx.h"
++#include "smc_rx.h"
+
+ static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
+ * creation
+@@ -435,6 +436,7 @@ out_connected:
+ smc_copy_sock_settings_to_clc(smc);
+ smc->sk.sk_state = SMC_ACTIVE;
+ smc_tx_init(smc);
++ smc_rx_init(smc);
+
+ return rc ? rc : local_contact;
+
+@@ -774,6 +776,7 @@ out_connected:
+ sk_refcnt_debug_inc(newsmcsk);
+ newsmcsk->sk_state = SMC_ACTIVE;
+ smc_tx_init(new_smc);
++ smc_rx_init(new_smc);
+ enqueue:
+ if (local_contact == SMC_FIRST_CONTACT)
+ mutex_unlock(&smc_create_lgr_pending);
+@@ -966,7 +969,7 @@ static int smc_recvmsg(struct socket *so
+ if (smc->use_fallback)
+ rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
+ else
+- rc = sock_no_recvmsg(sock, msg, len, flags);
++ rc = smc_rx_recvmsg(smc, msg, len, flags);
+ out:
+ release_sock(sk);
+ return rc;
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -117,6 +117,10 @@ struct smc_connection {
+ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
+ int rmbe_size; /* RMBE size <== sock rmem */
+ int rmbe_size_short;/* compressed notation */
++ int rmbe_update_limit;
++ /* lower limit for consumer
++ * cursor update
++ */
+
+ struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
+ * buffer for CDC msg send
+--- a/net/smc/smc_cdc.c
++++ b/net/smc/smc_cdc.c
+@@ -15,6 +15,7 @@
+ #include "smc_wr.h"
+ #include "smc_cdc.h"
+ #include "smc_tx.h"
++#include "smc_rx.h"
+
+ struct smc_cdc_tx_pend {
+ struct smc_connection *conn; /* socket connection */
+@@ -161,7 +162,13 @@ static void smc_cdc_msg_recv_action(stru
+ return;
+
+ /* data available */
+- /* subsequent patch: send delayed ack, wake receivers */
++ if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
++ (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
++ smc_tx_consumer_update(conn);
++ if (diff_prod ||
++ smc_stop_received(conn) ||
++ smc->sk.sk_shutdown & RCV_SHUTDOWN)
++ smc->sk.sk_data_ready(&smc->sk);
+ }
+
+ /* called under tasklet context */
+--- a/net/smc/smc_core.c
++++ b/net/smc/smc_core.c
+@@ -526,6 +526,9 @@ int smc_rmb_create(struct smc_sock *smc)
+ conn->rmbe_size_short = tmp_bufsize_short;
+ smc->sk.sk_rcvbuf = tmp_bufsize * 2;
+ atomic_set(&conn->bytes_to_rcv, 0);
++ conn->rmbe_update_limit =
++ min_t(int, conn->rmbe_size / 10,
++ SOCK_MIN_SNDBUF / 2);
+ return 0;
+ } else {
+ return -ENOMEM;
+--- /dev/null
++++ b/net/smc/smc_rx.c
+@@ -0,0 +1,184 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Manage RMBE
++ * copy new RMBE data into user space
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
++ */
++
++#include <linux/net.h>
++#include <linux/rcupdate.h>
++#include <net/sock.h>
++
++#include "smc.h"
++#include "smc_core.h"
++#include "smc_cdc.h"
++#include "smc_tx.h" /* smc_tx_consumer_update() */
++#include "smc_rx.h"
++
++/* callback implementation for sk.sk_data_ready()
++ * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
++ * indirectly called by smc_cdc_msg_recv_action().
++ */
++static void smc_rx_data_ready(struct sock *sk)
++{
++ struct socket_wq *wq;
++
++ /* derived from sock_def_readable() */
++ /* called already in smc_listen_worker() */
++ rcu_read_lock();
++ wq = rcu_dereference(sk->sk_wq);
++ if (wq_has_sleeper(wq))
++ wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
++ POLLRDNORM | POLLRDBAND);
++ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
++ (sk->sk_state == SMC_CLOSED))
++ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
++ else
++ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
++ rcu_read_unlock();
++}
++
++/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
++ * @smc smc socket
++ * @len num bytes to wait for
++ * @timeo max seconds to wait, 0 for no timeout
++ * Returns:
++ * 1 if at least len bytes available in rcvbuf.
++ * -EAGAIN in case timeout expired.
++ * 0 otherwise (neither enough bytes in rcvbuf nor timeout, e.g. interrupted).
++ */
++static int smc_rx_wait_data(struct smc_sock *smc, int len, long timeo)
++{
++ struct smc_connection *conn = &smc->conn;
++ struct sock *sk = &smc->sk;
++ DEFINE_WAIT(wait);
++ int rc;
++
++ if (atomic_read(&conn->bytes_to_rcv) >= len)
++ return 1;
++ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
++ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
++ rc = sk_wait_event(sk, &timeo,
++ sk->sk_err ||
++ sk->sk_shutdown & RCV_SHUTDOWN ||
++ sock_flag(sk, SOCK_DONE) ||
++ (atomic_read(&conn->bytes_to_rcv) >= len) ||
++ smc_stop_received(conn));
++ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
++ finish_wait(sk_sleep(sk), &wait);
++ return (rc || timeo) ? rc : -EAGAIN;
++}
++
++/* rcvbuf consumer: main API called by socket layer */
++int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
++ int flags)
++{
++ size_t read_done = 0, read_remaining = len;
++ struct smc_connection *conn = &smc->conn;
++ union smc_host_cursor_ovl prod, cons;
++ size_t readable, chunk;
++ char *rcvbuf_base;
++ int to_read;
++ long timeo;
++ int target; /* Read at least this many bytes */
++ int rc;
++
++ msg->msg_namelen = 0;
++ rcvbuf_base = conn->rmb_desc->cpu_addr;
++ read_remaining = min_t(size_t, len, conn->rmbe_size); /* cap rx len */
++
++again:
++ target = sock_rcvlowat(&smc->sk, flags & MSG_WAITALL, read_remaining);
++ timeo = sock_rcvtimeo(&smc->sk, flags & MSG_DONTWAIT);
++ if (signal_pending(current))
++ return timeo ? -EINTR : -EAGAIN;
++ rc = smc_rx_wait_data(smc, target, timeo);
++ if ((rc == -EAGAIN) || (rc == -EINTR))
++ return rc;
++ if (!rc)
++ goto again;
++ to_read = atomic_read(&conn->bytes_to_rcv);
++
++ if ((to_read <= 0) &&
++ (smc->sk.sk_err ||
++ smc->sk.sk_shutdown & RCV_SHUTDOWN ||
++ sock_flag(&smc->sk, SOCK_DONE) ||
++ smc_stop_received(conn)))
++ return sock_error(&smc->sk);
++
++ if (to_read <= 0)
++ goto check_repeat;
++
++ if ((to_read < target) && !smc_stop_received(conn))
++ goto check_repeat;
++
++ prod.acurs = smc_curs_read(conn->local_rx_ctrl.prod.acurs);
++ cons.acurs = smc_curs_read(conn->local_tx_ctrl.cons.acurs);
++ if (prod.curs.wrap == cons.curs.wrap) {
++ /* unwrapped case: copy 1 single chunk */
++ readable = prod.curs.count - cons.curs.count;
++ chunk = min(read_remaining, readable);
++ if (!(flags & MSG_TRUNC)) {
++ if (memcpy_to_msg(msg, rcvbuf_base + cons.curs.count,
++ chunk))
++ return -EFAULT;
++ }
++ read_remaining -= chunk;
++ read_done += chunk;
++ } else {
++ /* wrapped case: top chunk */
++ readable = conn->rmbe_size - cons.curs.count;
++ if (readable) {
++ chunk = min(read_remaining, readable);
++ if (!(flags & MSG_TRUNC)) {
++ if (memcpy_to_msg(msg,
++ rcvbuf_base + cons.curs.count,
++ chunk))
++ return -EFAULT;
++ }
++ read_remaining -= chunk;
++ read_done += chunk;
++ }
++ /* wrapped case: bottom chunk (if any) */
++ if (read_remaining) {
++ readable = prod.curs.count;
++ chunk = min(read_remaining, readable);
++ if (!(flags & MSG_TRUNC)) {
++ if (memcpy_to_msg(msg, rcvbuf_base, chunk))
++ return -EFAULT;
++ }
++ read_remaining -= chunk;
++ read_done += chunk;
++ }
++ }
++
++ /* update cursors */
++ if (!(flags & MSG_PEEK)) {
++ smc_curs_add(conn->rmbe_size, &cons.curs, read_done);
++ smp_mb__before_atomic();
++ atomic_sub(read_done, &conn->bytes_to_rcv);
++ smp_mb__after_atomic();
++ xchg(&conn->local_tx_ctrl.cons.acurs, cons.acurs);
++ /* send consumer cursor update if required */
++ /* analogous to advertising a new TCP rcv_wnd if required */
++ smc_tx_consumer_update(conn);
++ }
++check_repeat:
++ if ((to_read < target) &&
++ !smc_stop_received(conn) &&
++ !conn->local_tx_ctrl.conn_state_flags.abnormal_close) {
++ goto again;
++ }
++
++ return read_done;
++}
++
++/* Initialize receive properties on connection establishment. NB: not __init! */
++void smc_rx_init(struct smc_sock *smc)
++{
++ smc->sk.sk_data_ready = smc_rx_data_ready;
++}
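The wrapped read above is the consumer-side mirror of the transmit split in smc_tx.c: the readable bytes sit between the consumer and producer cursors and are drained in at most two copies. A userspace sketch under that assumption (illustrative names; the kernel path copies through memcpy_to_msg() into the user iovec):

    #include <stddef.h>
    #include <string.h>

    /* read up to `want` bytes between cons and prod in a ring of
     * `size` bytes; `wrapped` corresponds to prod.wrap != cons.wrap
     * in smc_rx_recvmsg()
     */
    static size_t ring_read(char *dst, const char *ring, size_t size,
                            size_t cons, size_t prod, int wrapped,
                            size_t want)
    {
            size_t avail = wrapped ? size - cons + prod : prod - cons;
            size_t len = want < avail ? want : avail;
            size_t bot = (cons + len <= size) ? len : size - cons;

            memcpy(dst, ring + cons, bot);
            if (len > bot)
                    memcpy(dst + bot, ring, len - bot); /* wrapped tail */
            return len;
    }
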
+--- /dev/null
++++ b/net/smc/smc_rx.h
+@@ -0,0 +1,23 @@
++/*
++ * Shared Memory Communications over RDMA (SMC-R) and RoCE
++ *
++ * Manage RMBE
++ *
++ * Copyright IBM Corp. 2016
++ *
++ * Author(s): Ursula Braun <ursula.braun@de.ibm.com>
++ */
++
++#ifndef SMC_RX_H
++#define SMC_RX_H
++
++#include <linux/socket.h>
++#include <linux/types.h>
++
++#include "smc.h"
++
++void smc_rx_init(struct smc_sock *);
++int smc_rx_to_read(struct smc_connection *);
++int smc_rx_recvmsg(struct smc_sock *, struct msghdr *, size_t, int);
++
++#endif /* SMC_RX_H */
+--- a/net/smc/smc_tx.c
++++ b/net/smc/smc_tx.c
+@@ -438,6 +438,38 @@ static void smc_tx_worker(struct work_st
+ release_sock(&smc->sk);
+ }
+
++void smc_tx_consumer_update(struct smc_connection *conn)
++{
++ union smc_host_cursor_ovl cfed, cons;
++ struct smc_cdc_tx_pend *pend;
++ struct smc_wr_buf *wr_buf;
++ int to_confirm, rc;
++
++ cons.acurs = smc_curs_read(conn->local_tx_ctrl.cons.acurs);
++ cfed.acurs = smc_curs_read(conn->rx_curs_confirmed.acurs);
++ to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
++
++ if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
++ ((to_confirm > conn->rmbe_update_limit) &&
++ ((to_confirm / (conn->rmbe_size / 2) > 0) ||
++ conn->local_rx_ctrl.prod_flags.write_blocked))) {
++ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
++ &wr_buf, &pend);
++ if (!rc)
++ rc = smc_cdc_msg_send(conn, wr_buf, pend);
++ if (rc < 0) {
++ schedule_delayed_work(&conn->tx_work, HZ / 10);
++ return;
++ }
++ xchg(&conn->rx_curs_confirmed.acurs,
++ smc_curs_read(conn->local_tx_ctrl.cons.acurs));
++ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
++ }
++ if (conn->local_rx_ctrl.prod_flags.write_blocked &&
++ !atomic_read(&conn->bytes_to_rcv))
++ conn->local_rx_ctrl.prod_flags.write_blocked = 0;
++}
++
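Unpacked, the condition in smc_tx_consumer_update() sends a consumer cursor update when the peer explicitly requested one, or when more than rmbe_update_limit bytes are consumed but unconfirmed and either at least half the RMBE is unconfirmed or the peer reported itself write-blocked; it plays the role of a TCP receive-window update. A hedged restatement with stand-in names:

    /* to_confirm / (rmbe_size / 2) > 0 in the kernel condition is
     * integer shorthand for to_confirm >= rmbe_size / 2
     */
    static int should_send_cursor_update(int to_confirm, int rmbe_size,
                                         int update_limit, int upd_req,
                                         int peer_write_blocked)
    {
            return upd_req ||
                   (to_confirm > update_limit &&
                    (to_confirm >= rmbe_size / 2 || peer_write_blocked));
    }
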
+ /***************************** send initialize *******************************/
+
+ /* Initialize send properties on connection establishment. NB: not __init! */
+--- a/net/smc/smc_tx.h
++++ b/net/smc/smc_tx.h
+@@ -20,5 +20,6 @@ void smc_tx_init(struct smc_sock *);
+ int smc_tx_sendmsg(struct smc_sock *, struct msghdr *, size_t);
+ int smc_tx_sndbuf_nonempty(struct smc_connection *);
+ void smc_tx_sndbuf_nonfull(struct smc_sock *);
++void smc_tx_consumer_update(struct smc_connection *);
+
+ #endif /* SMC_TX_H */
diff --git a/patches.arch/s390-sles12sp2-00-05-net-smc-r-14.patch b/patches.arch/s390-sles12sp2-00-05-net-smc-r-14.patch
new file mode 100644
index 0000000000..8179f91817
--- /dev/null
+++ b/patches.arch/s390-sles12sp2-00-05-net-smc-r-14.patch
@@ -0,0 +1,903 @@
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Subject: smc: socket closing and linkgroup cleanup
+Patch-mainline: not yet, IBM pushing upstream
+References: bsc#978258,FATE#319593,LTC#131290
+
+Summary: net/smc: Shared Memory Communications - RDMA
+Description: Initial part of the implementation of the "Shared Memory
+ Communications-RDMA" (SMC-R) protocol. The protocol is defined
+ in RFC7609 [1]. It allows transparent transformation of TCP
+ connections using the "Remote Direct Memory Access over
+ Converged Ethernet" (RoCE) feature of certain communication
+ hardware for data center environments. Tested on s390 and x86
+ using Mellanox ConnectX-3 cards.
+
+ A new socket protocol family PF_SMC is being introduced. A
+ preload shared library will be offered to enable TCP-based
+ applications to use SMC-R without changes or recompilation.
+
+ References:
+ [1] SMC-R Informational RFC:
+ https://tools.ietf.org/rfc/rfc7609
+
+Upstream-Description:
+
+ smc: socket closing and linkgroup cleanup
+
+ smc_shutdown() and smc_release() handling
+ delayed linkgroup cleanup for linkgroups without connections
+
+ Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Acked-by: John Jolly <jjolly@suse.de>
+---
+ net/smc/af_smc.c | 412 +++++++++++++++++++++++++++++++++++++++++++++++++++--
+ net/smc/smc.h | 13 +
+ net/smc/smc_cdc.c | 48 ++++--
+ net/smc/smc_cdc.h | 2
+ net/smc/smc_core.h | 1
+ net/smc/smc_ib.c | 11 +
+ net/smc/smc_rx.c | 5
+ net/smc/smc_rx.h | 1
+ net/smc/smc_tx.c | 35 ++++
+ net/smc/smc_tx.h | 11 +
+ net/smc/smc_wr.c | 10 -
+ net/smc/smc_wr.h | 9 +
+ 12 files changed, 521 insertions(+), 37 deletions(-)
+
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -38,9 +38,11 @@
+ #include "smc_tx.h"
+ #include "smc_rx.h"
+
+-static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
+- * creation
+- */
++#define SMC_LISTEN_WORK_WAIT 20
++#define SMC_WAIT_TX_PENDS_TIME (5 * HZ)
++#define SMC_TIMEWAIT_LEN TCP_TIMEWAIT_LEN
++
++DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group creation */
+
+ struct smc_lgr_list smc_lgr_list = { /* established link groups */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+@@ -64,19 +66,220 @@ static struct proto smc_proto = {
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ };
+
++static void smc_destruct_non_accepted(struct sock *sk);
++static struct sock *smc_accept_dequeue(struct sock *, struct socket *);
++
++static void smc_sock_cleanup_listen(struct sock *parent)
++{
++ struct sock *sk;
++
++ /* Close non-accepted connections */
++ while ((sk = smc_accept_dequeue(parent, NULL)))
++ smc_destruct_non_accepted(sk);
++}
++
++static int smc_wait_tx_pends(struct smc_sock *smc)
++{
++ struct smc_connection *conn = &smc->conn;
++ struct sock *sk = &smc->sk;
++ signed long timeout;
++ DEFINE_WAIT(wait);
++ int rc = 0;
++
++ timeout = SMC_WAIT_TX_PENDS_TIME;
++ if (smc_cdc_wr_tx_pends(conn) && !(current->flags & PF_EXITING)) {
++ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
++ do {
++ prepare_to_wait(sk_sleep(sk), &wait,
++ TASK_INTERRUPTIBLE);
++ if (sk_wait_event(sk, &timeout,
++ !smc_cdc_wr_tx_pends(conn)))
++ break;
++ } while (!signal_pending(current) && timeout);
++ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
++ finish_wait(sk_sleep(sk), &wait);
++ }
++ if (!timeout) { /* timeout reached, kill tx_pends */
++ smc_cdc_put_conn_slots(conn);
++ rc = -ETIME;
++ }
++ return rc;
++}
++
++static void smc_wait_close_tx_prepared(struct smc_sock *smc, long timeout)
++{
++ struct sock *sk = &smc->sk;
++
++ if (timeout) {
++ DEFINE_WAIT(wait);
++
++ do {
++ prepare_to_wait(sk_sleep(sk), &wait,
++ TASK_INTERRUPTIBLE);
++ if (sk_wait_event(sk, &timeout,
++ !smc_tx_prepared_sends(&smc->conn)))
++ break;
++ } while (!signal_pending(current) && timeout);
++
++ finish_wait(sk_sleep(sk), &wait);
++ }
++}
++
++void smc_wake_close_tx_prepared(struct smc_sock *smc)
++{
++ if (smc->sk.sk_state == SMC_PEERCLW1)
++ /* wake up socket closing */
++ smc->sk.sk_state_change(&smc->sk);
++}
++
++static inline int smc_stream_closing(struct smc_connection *conn)
++{
++ return (!smc_cdc_wr_tx_pends(conn) &&
++ smc_close_received(conn));
++}
++
++static void smc_stream_wait_close(struct smc_sock *smc, long lingertime)
++{
++ struct sock *sk = &smc->sk;
++
++ if (lingertime) {
++ DEFINE_WAIT(wait);
++
++ do {
++ prepare_to_wait(sk_sleep(sk), &wait,
++ TASK_INTERRUPTIBLE);
++ if (sk_wait_event(sk, &lingertime,
++ smc_stream_closing(&smc->conn)))
++ break;
++ } while (!signal_pending(current) && lingertime);
++
++ finish_wait(sk_sleep(sk), &wait);
++ }
++}
++
++static int smc_conn_release(struct smc_sock *smc)
++{
++ struct smc_connection *conn = &smc->conn;
++ long timeout = MAX_SCHEDULE_TIMEOUT;
++ struct sock *sk = &smc->sk;
++ long lingertime = 0;
++ int old_state;
++ int rc = 0;
++
++ if (sock_flag(sk, SOCK_LINGER) &&
++ !(current->flags & PF_EXITING)) {
++ lingertime = sk->sk_lingertime;
++ timeout = sk->sk_lingertime;
++ }
++
++ old_state = sk->sk_state;
++ switch (old_state) {
++ case SMC_INIT:
++ sk->sk_state = SMC_CLOSED;
++ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
++ break;
++ case SMC_LISTEN:
++ sk->sk_state = SMC_CLOSED;
++ sk->sk_state_change(sk);
++ old_state = SMC_CLOSED;
++ if (smc->clcsock && smc->clcsock->sk) {
++ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
++ /* wake up kernel_accept of smc_tcp_listen_worker */
++ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
++ }
++ release_sock(sk);
++ smc_sock_cleanup_listen(sk);
++ flush_work(&smc->tcp_listen_work);
++ flush_work(&smc->smc_listen_work);
++ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
++ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
++ break;
++ case SMC_ACTIVE:
++ /* active close */
++ /* wait for sndbuf data being posted */
++ /* SLD: postpone smc_tx_close, return immediately, no wait ???*/
++ smc_wait_close_tx_prepared(smc, timeout);
++ /* wait for confirmation of previous postings */
++ smc_wait_tx_pends(smc);
++ /* send close request */
++ rc = smc_tx_close(conn);
++ if (conn->local_rx_ctrl.conn_state_flags.sending_done)
++ sk->sk_state = SMC_PEERCLW2;
++ else
++ sk->sk_state = SMC_PEERCLW1;
++ /* fall through */
++ case SMC_PEERCLW1:
++ case SMC_PEERCLW2:
++ /* wait for confirmation of close request posting */
++ smc_wait_tx_pends(smc);
++ /* wait for close request from peer - comparable to
++ * sk_stream_wait_close call of tcp
++ */
++ smc_stream_wait_close(smc, lingertime);
++ if (smc_close_received(conn)) {
++ sk->sk_state = SMC_CLOSED;
++ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
++ }
++ break;
++ case SMC_APPLFINCLW:
++ /* socket already shutdown wr or both (active close) */
++ sk->sk_state = SMC_CLOSED;
++ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
++ break;
++ case SMC_APPLCLW1:
++ case SMC_APPLCLW2:
++ /* passive close */
++ if (!smc_close_received(conn))
++ /* wait for sndbuf data being posted */
++ smc_wait_close_tx_prepared(smc, timeout);
++ /* wait for confirmation of previous postings */
++ smc_wait_tx_pends(smc);
++ /* confirm close from peer */
++ rc = smc_tx_close(conn);
++ /* wait for confirmation of close request posting */
++ smc_wait_tx_pends(smc);
++ if (smc_close_received(conn)) {
++ sk->sk_state = SMC_CLOSED;
++ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
++ } else {
++ sk->sk_state = SMC_PEERFINCLW;
++ }
++ break;
++ case SMC_PEERFINCLW:
++ case SMC_CLOSED:
++ default:
++ break;
++ }
++
++ if (old_state != sk->sk_state)
++ sk->sk_state_change(&smc->sk);
++ return rc;
++}
++
+ static int smc_release(struct socket *sock)
+ {
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
++ int rc = 0;
+
+ if (!sk || (sk->sk_state == SMC_DESTRUCT))
+ goto out;
+
+ smc = smc_sk(sk);
+ sock_hold(sk);
+- lock_sock(sk);
++ if (sk->sk_state == SMC_LISTEN)
++ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
++ else
++ lock_sock(sk);
+
+- sk->sk_state = SMC_CLOSED;
++ if (smc->use_fallback) {
++ sk->sk_state = SMC_CLOSED;
++ sk->sk_state_change(sk);
++ } else {
++ sock_set_flag(sk, SOCK_DEAD);
++ sk->sk_shutdown = SHUTDOWN_MASK;
++ rc = smc_conn_release(smc);
++ }
+ if (smc->clcsock) {
+ sock_release(smc->clcsock);
+ smc->clcsock = NULL;
+@@ -90,7 +293,80 @@ static int smc_release(struct socket *so
+
+ sock_put(sk);
+ out:
+- return 0;
++ return rc;
++}
++
++static void smc_accept_unlink(struct sock *);
++
++/* some kind of closing has been received - normal, abnormal, or sending_done */
++void smc_conn_release_handler(struct smc_sock *smc)
++{
++ struct smc_connection *conn = &smc->conn;
++ struct sock *sk = &smc->sk;
++ int old_state;
++
++ old_state = sk->sk_state;
++ switch (sk->sk_state) {
++ /* Normal termination - Passive close part */
++ case SMC_INIT:
++ case SMC_ACTIVE:
++ if (conn->local_rx_ctrl.conn_state_flags.sending_done ||
++ conn->local_rx_ctrl.conn_state_flags.closed_conn) {
++ /* complete any outstanding recv with zero-length
++ * if peerclosedconn and pending data to be written
++ * then reset conn
++ */
++ sk->sk_state = SMC_APPLCLW1;
++ }
++ break;
++ case SMC_PEERFINCLW:
++ if (conn->local_rx_ctrl.conn_state_flags.closed_conn)
++ sk->sk_state = SMC_CLOSED;
++ break;
++ /* Normal termination - Active close part */
++ case SMC_PEERCLW1:
++ if (conn->local_rx_ctrl.conn_state_flags.sending_done) {
++ /* complete any outstanding recv with zero-length */
++ sk->sk_state = SMC_PEERCLW2;
++ } /* fall through */
++ case SMC_PEERCLW2:
++ if (conn->local_rx_ctrl.conn_state_flags.closed_conn) {
++ struct smc_host_cdc_msg *tx_ctrl = &conn->local_tx_ctrl;
++ /* complete any outstanding recv with zero-length */
++ if (sk->sk_shutdown == SHUTDOWN_MASK &&
++ (tx_ctrl->conn_state_flags.closed_conn ||
++ tx_ctrl->conn_state_flags.abnormal_close)) {
++ sk->sk_state = SMC_CLOSED;
++ } else {
++ sk->sk_state = SMC_APPLFINCLW;
++ }
++ }
++ break;
++ default:
++ break;
++ }
++
++ sock_set_flag(&smc->sk, SOCK_DONE);
++ if (smc_stop_received(conn)) {
++ sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN;
++ if (smc->clcsock && smc->clcsock->sk) {
++ struct sock *tcpsk;
++
++ tcpsk = smc->clcsock->sk;
++ tcpsk->sk_shutdown = tcpsk->sk_shutdown | RCV_SHUTDOWN;
++ }
++ }
++ if (smc_close_received(conn) &&
++ (sk->sk_state == SMC_CLOSED) &&
++ sock_flag(sk, SOCK_DEAD) &&
++ !smc_cdc_wr_tx_pends(conn)) /* make sure socket is freed */
++ schedule_delayed_work(&smc->fin_work, SMC_TIMEWAIT_LEN);
++ if ((old_state != sk->sk_state) &&
++ (old_state != SMC_INIT))
++ sk->sk_state_change(sk);
++
++ smc->sk.sk_data_ready(&smc->sk);
++ smc->sk.sk_write_space(&smc->sk);
+ }
+
+ static void smc_destruct(struct sock *sk)
+@@ -108,11 +384,21 @@ static void smc_destruct(struct sock *sk
+ }
+
+ sk->sk_state = SMC_DESTRUCT;
+- smc_conn_free(&smc->conn);
++ if (smc->conn.lgr)
++ smc_conn_free(&smc->conn);
+
+ sk_refcnt_debug_dec(sk);
+ }
+
++static void smc_fin_worker(struct work_struct *work)
++{
++ struct smc_sock *smc =
++ container_of(work, struct smc_sock, fin_work.work);
++
++ cancel_delayed_work(&smc->fin_work);
++ sock_put(&smc->sk);
++}
++
+ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
+ {
+ struct smc_sock *smc;
+@@ -137,6 +423,7 @@ static struct sock *smc_sock_alloc(struc
+ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_worker);
+ INIT_LIST_HEAD(&smc->accept_q);
+ spin_lock_init(&smc->accept_q_lock);
++ INIT_DELAYED_WORK(&smc->fin_work, smc_fin_worker);
+
+ return sk;
+ }
+@@ -529,6 +816,8 @@ static int smc_clcsock_accept(struct smc
+ lock_sock(&lsmc->sk);
+ if (rc < 0) {
+ lsmc->sk.sk_err = -rc;
++ new_sk->sk_state = SMC_CLOSED;
++ sock_set_flag(new_sk, SOCK_DEAD);
+ sock_put(new_sk);
+ *new_smc = NULL;
+ goto out;
+@@ -536,6 +825,8 @@ static int smc_clcsock_accept(struct smc
+ if (lsmc->sk.sk_state == SMC_CLOSED) {
+ if (new_clcsock)
+ sock_release(new_clcsock);
++ new_sk->sk_state = SMC_CLOSED;
++ sock_set_flag(new_sk, SOCK_DEAD);
+ sock_put(new_sk);
+ *new_smc = NULL;
+ goto out;
+@@ -602,6 +893,11 @@ static void smc_destruct_non_accepted(st
+ struct smc_sock *smc = smc_sk(sk);
+
+ sock_hold(sk);
++ lock_sock(sk);
++ if (!sk->sk_lingertime)
++ /* wait long for peer closing */
++ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
++ smc_conn_release(smc);
+ if (smc->clcsock) {
+ struct socket *tcp;
+
+@@ -609,7 +905,9 @@ static void smc_destruct_non_accepted(st
+ smc->clcsock = NULL;
+ sock_release(tcp);
+ }
+- /* more closing stuff to be added with socket closing patch */
++ release_sock(sk);
++ sock_set_flag(sk, SOCK_ZAPPED);
++ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(sk);
+ }
+
+@@ -806,6 +1104,7 @@ decline_rdma:
+
+ out_err:
+ newsmcsk->sk_state = SMC_CLOSED;
++ schedule_delayed_work(&new_smc->fin_work, TCP_TIMEWAIT_LEN);
+ goto enqueue; /* queue new sock with sk_err set */
+ }
+
+@@ -963,7 +1262,13 @@ static int smc_recvmsg(struct socket *so
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+- if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
++ if ((sk->sk_state != SMC_ACTIVE) &&
++ (sk->sk_state != SMC_PEERCLW1) &&
++ (sk->sk_state != SMC_PEERCLW2) &&
++ (sk->sk_state != SMC_APPLCLW1) &&
++ (sk->sk_state != SMC_APPLCLW2) &&
++ (sk->sk_state != SMC_PEERABORTW) &&
++ (sk->sk_state != SMC_PROCESSABORT))
+ goto out;
+
+ if (smc->use_fallback)
+@@ -1029,12 +1334,72 @@ static unsigned int smc_poll(struct file
+ mask |= smc_accept_poll(sk);
+ if (sk->sk_err)
+ mask |= POLLERR;
+- /* for now - to be enhanced in follow-on patch */
++ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
++ (sk->sk_state == SMC_CLOSED))
++ mask |= POLLHUP;
++ if (sk->sk_shutdown & RCV_SHUTDOWN)
++ mask |= POLLIN | POLLRDNORM | POLLRDHUP;
++ if (atomic_read(&smc->conn.bytes_to_rcv))
++ mask |= POLLIN | POLLRDNORM; /* in earlier patch */
++ if (sk->sk_state == SMC_APPLCLW1)
++ mask |= POLLIN;
++ if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { /* in earlier patch */
++ if (atomic_read(&smc->conn.sndbuf_space)) {
++ mask |= POLLOUT | POLLWRNORM;
++ } else {
++ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
++ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
++ }
++ } else {
++ mask |= POLLOUT | POLLWRNORM;
++ }
+ }
+
+ return mask;
+ }
+
++static int smc_conn_shutdown_write(struct smc_sock *smc)
++{
++ struct smc_connection *conn = &smc->conn;
++ long timeout = MAX_SCHEDULE_TIMEOUT;
++ struct sock *sk = &smc->sk;
++ int old_state;
++ int rc = 0;
++
++ if (sock_flag(sk, SOCK_LINGER))
++ timeout = sk->sk_lingertime;
++
++ old_state = sk->sk_state;
++ switch (old_state) {
++ case SMC_ACTIVE:
++ /* active close */
++ /* wait for sndbuf data being posted */
++ smc_wait_close_tx_prepared(smc, timeout);
++ rc = smc_tx_close_wr(conn);
++ if (conn->local_rx_ctrl.conn_state_flags.sending_done)
++ sk->sk_state = SMC_PEERCLW2;
++ else
++ sk->sk_state = SMC_PEERCLW1;
++ sk->sk_state_change(sk);
++ break;
++ case SMC_APPLCLW1:
++ /* passive close */
++ if (!smc_close_received(conn))
++ /* wait for sndbuf data being posted */
++ smc_wait_close_tx_prepared(smc, timeout);
++ /* confirm close from peer */
++ rc = smc_tx_close_wr(conn);
++ sk->sk_state = SMC_APPLCLW2;
++ break;
++ default:
++ break;
++ }
++
++ if (old_state != sk->sk_state)
++ sk->sk_state_change(&smc->sk);
++ return rc;
++}
++
+ static int smc_shutdown(struct socket *sock, int how)
+ {
+ struct sock *sk = sock->sk;
+@@ -1049,7 +1414,11 @@ static int smc_shutdown(struct socket *s
+ lock_sock(sk);
+
+ rc = -ENOTCONN;
+- if (sk->sk_state == SMC_CLOSED)
++ if ((sk->sk_state != SMC_ACTIVE) &&
++ (sk->sk_state != SMC_PEERCLW1) &&
++ (sk->sk_state != SMC_PEERCLW2) &&
++ (sk->sk_state != SMC_APPLCLW1) &&
++ (sk->sk_state != SMC_APPLCLW2))
+ goto out;
+ if (smc->use_fallback) {
+ rc = kernel_sock_shutdown(smc->clcsock, how);
+@@ -1057,7 +1426,18 @@ static int smc_shutdown(struct socket *s
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ sk->sk_state = SMC_CLOSED;
+ } else {
+- rc = sock_no_shutdown(sock, how);
++ switch (how) {
++ case SHUT_RDWR: /* shutdown in both directions */
++ rc = smc_conn_release(smc);
++ break;
++ case SHUT_WR:
++ rc = smc_conn_shutdown_write(smc);
++ break;
++ case SHUT_RD:
++ break;
++ }
++ rc = kernel_sock_shutdown(smc->clcsock, how);
++ sk->sk_shutdown |= ++how;
+ }
+
+ out:
+@@ -1265,14 +1645,18 @@ out_pnet:
+
+ static void __exit smc_exit(void)
+ {
++ LIST_HEAD(lgr_freeing_list);
+ struct smc_link_group *lgr, *lg;
+
+ spin_lock(&smc_lgr_list.lock);
+- list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
++ if (!list_empty(&smc_lgr_list.list))
++ list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
++ spin_unlock(&smc_lgr_list.lock);
++ list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
++ cancel_delayed_work_sync(&lgr->free_work);
+ list_del_init(&lgr->list);
+ smc_lgr_free(lgr); /* free link group */
+ }
+- spin_unlock(&smc_lgr_list.lock);
+ smc_ib_unregister_client();
+ sock_unregister(PF_SMC);
+ proto_unregister(&smc_proto);
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -32,6 +32,14 @@ enum smc_state { /* possible states of
+ SMC_INIT = 2,
+ SMC_CLOSED = 7,
+ SMC_LISTEN = 10,
++ SMC_PEERCLW1 = 20,
++ SMC_PEERCLW2 = 21,
++ SMC_APPLCLW1 = 22,
++ SMC_APPLCLW2 = 23,
++ SMC_APPLFINCLW = 24,
++ SMC_PEERFINCLW = 25,
++ SMC_PEERABORTW = 26,
++ SMC_PROCESSABORT = 27,
+ SMC_DESTRUCT = 32
+ };
+
+@@ -163,6 +171,7 @@ struct smc_sock { /* smc sock contain
+ struct work_struct smc_listen_work;/* prepare new accept socket */
+ struct list_head accept_q; /* sockets to be accepted */
+ spinlock_t accept_q_lock; /* protects accept_q */
++ struct delayed_work fin_work; /* final socket freeing */
+ u8 use_fallback : 1, /* fallback to tcp */
+ clc_started : 1;/* smc_connect_rdma ran */
+ };
+@@ -176,6 +185,8 @@ static inline struct smc_sock *smc_sk(co
+
+ extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
++extern struct mutex smc_create_lgr_pending;
++
+ /* convert an u32 value into network byte order, store it into a 3 byte field */
+ static inline void hton24(u8 *net, u32 host)
+ {
+@@ -236,5 +247,7 @@ int smc_netinfo_by_tcpsk(struct socket *
+ void smc_conn_free(struct smc_connection *);
+ int smc_conn_create(struct smc_sock *, __be32, struct smc_ib_device *, u8,
+ struct smc_clc_msg_local *, int);
++void smc_conn_release_handler(struct smc_sock *);
++void smc_wake_close_tx_prepared(struct smc_sock *);
+
+ #endif /* _SMC_H */
+--- a/net/smc/smc_cdc.c
++++ b/net/smc/smc_cdc.c
+@@ -102,6 +102,35 @@ out:
+ return rc;
+ }
+
++int smc_cdc_wr_tx_pends(struct smc_connection *conn)
++{
++ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
++ int i;
++
++ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
++ struct smc_cdc_tx_pend *tx_pend;
++
++ tx_pend = (struct smc_cdc_tx_pend *)&link->wr_tx_pends[i].priv;
++ if (tx_pend->conn == conn)
++ return 1;
++ }
++ return 0;
++}
++
++void smc_cdc_put_conn_slots(struct smc_connection *conn)
++{
++ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
++ int i;
++
++ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
++ struct smc_wr_tx_pend_priv *tx_pend;
++
++ tx_pend = &link->wr_tx_pends[i].priv;
++ if (((struct smc_cdc_tx_pend *)tx_pend)->conn == conn)
++ smc_wr_tx_put_slot(link, tx_pend);
++ }
++}
++
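Both helpers above walk the link's wr_tx_mask with for_each_set_bit() and compare each pending slot's connection pointer. The same walk over a single-word bitmap can be sketched in userspace C (GCC-style builtin; illustrative names):

    /* return 1 if any set bit i of `mask` has slot_conn[i] == conn_id */
    static int conn_has_tx_pends(unsigned long mask, const int *slot_conn,
                                 int conn_id)
    {
            while (mask) {
                    int i = __builtin_ctzl(mask); /* lowest set bit */

                    if (slot_conn[i] == conn_id)
                            return 1;
                    mask &= mask - 1;             /* clear that bit */
            }
            return 0;
    }
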
+ static inline bool smc_cdc_before(u16 seq1, u16 seq2)
+ {
+ return (s16)(seq1 - seq2) < 0;
+@@ -131,6 +160,7 @@ static void smc_cdc_msg_recv_action(stru
+ smp_mb__before_atomic();
+ atomic_add(diff_cons, &conn->peer_rmbe_space);
+ smp_mb__after_atomic();
++ smc_rx_handler(smc);
+ }
+
+ diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
+@@ -143,19 +173,15 @@ static void smc_cdc_msg_recv_action(stru
+
+ if (conn->local_rx_ctrl.conn_state_flags.abnormal_close)
+ smc->sk.sk_err = ECONNRESET;
+- if (smc_stop_received(conn)) {
+- smc->sk.sk_shutdown |= RCV_SHUTDOWN;
+- sock_set_flag(&smc->sk, SOCK_DONE);
+-
+- /* subsequent patch: terminate connection */
+- }
++ if (smc_stop_received(conn))
++ smc_conn_release_handler(smc);
+
+ /* piggy backed tx info */
+ /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+- if (diff_cons)
++ if (diff_cons && smc_tx_prepared_sends(conn)) {
+ smc_tx_sndbuf_nonempty(conn);
+-
+- /* subsequent patch: trigger socket release if connection closed */
++ smc_wake_close_tx_prepared(smc);
++ }
+
+ /* socket connected but not accepted */
+ if (!smc->sk.sk_socket)
+@@ -165,10 +191,6 @@ static void smc_cdc_msg_recv_action(stru
+ if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+ (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
+ smc_tx_consumer_update(conn);
+- if (diff_prod ||
+- smc_stop_received(conn) ||
+- smc->sk.sk_shutdown & RCV_SHUTDOWN)
+- smc->sk.sk_data_ready(&smc->sk);
+ }
+
+ /* called under tasklet context */
+--- a/net/smc/smc_cdc.h
++++ b/net/smc/smc_cdc.h
+@@ -153,8 +153,10 @@ struct smc_cdc_tx_pend;
+
+ int smc_cdc_get_free_slot(struct smc_link *, struct smc_wr_buf **,
+ struct smc_cdc_tx_pend **);
++void smc_cdc_put_conn_slots(struct smc_connection *conn);
+ int smc_cdc_msg_send(struct smc_connection *, struct smc_wr_buf *,
+ struct smc_cdc_tx_pend *);
++int smc_cdc_wr_tx_pends(struct smc_connection *);
+ int smc_cdc_init(void) __init;
+
+ #endif /* SMC_CDC_H */
+--- a/net/smc/smc_core.h
++++ b/net/smc/smc_core.h
+@@ -131,6 +131,7 @@ struct smc_link_group {
+ /* used rtoken elements */
+
+ u32 id; /* unique lgr id */
++ struct delayed_work free_work; /* delayed freeing of an lgr */
+ };
+
+ /* Find the connection associated with the given alert token in the link group.
+--- a/net/smc/smc_ib.c
++++ b/net/smc/smc_ib.c
+@@ -390,6 +390,14 @@ out:
+ return rc;
+ }
+
++static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
++{
++ ib_destroy_cq(smcibdev->roce_cq_send);
++ ib_destroy_cq(smcibdev->roce_cq_recv);
++ ib_unregister_event_handler(&smcibdev->event_handler);
++ smc_wr_remove_dev(smcibdev);
++}
++
+ static struct ib_client smc_ib_client;
+
+ /* callback function for ib_register_client() */
+@@ -436,8 +444,9 @@ static void smc_ib_remove_dev(struct ib_
+ struct smc_ib_device *smcibdev;
+
+ smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
+- smc_wr_remove_dev(smcibdev);
+ ib_set_client_data(ibdev, &smc_ib_client, NULL);
++ if (smcibdev->initialized)
++ smc_ib_cleanup_per_ibdev(smcibdev);
+ spin_lock(&smc_ib_devices.lock);
+ list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
+ spin_unlock(&smc_ib_devices.lock);
+--- a/net/smc/smc_rx.c
++++ b/net/smc/smc_rx.c
+@@ -177,6 +177,11 @@ check_repeat:
+ return read_done;
+ }
+
++void smc_rx_handler(struct smc_sock *smc)
++{
++ smc->sk.sk_data_ready(&smc->sk);
++}
++
+ /* Initialize receive properties on connection establishment. NB: not __init! */
+ void smc_rx_init(struct smc_sock *smc)
+ {
+--- a/net/smc/smc_rx.h
++++ b/net/smc/smc_rx.h
+@@ -19,5 +19,6 @@
+ void smc_rx_init(struct smc_sock *);
+ int smc_rx_to_read(struct smc_connection *);
+ int smc_rx_recvmsg(struct smc_sock *, struct msghdr *, size_t, int);
++void smc_rx_handler(struct smc_sock *);
+
+ #endif /* SMC_RX_H */
+--- a/net/smc/smc_tx.c
++++ b/net/smc/smc_tx.c
+@@ -423,6 +423,41 @@ out_unlock:
+ return rc;
+ }
+
++int smc_tx_close_wr(struct smc_connection *conn)
++{
++ struct smc_cdc_tx_pend *pend;
++ struct smc_wr_buf *wr_buf;
++ int rc;
++
++ conn->local_tx_ctrl.conn_state_flags.sending_done = 1;
++
++ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
++ &pend);
++ if (!rc)
++  rc = smc_cdc_msg_send(conn, wr_buf, pend);
++
++ return rc;
++}
++
++int smc_tx_close(struct smc_connection *conn)
++{
++ struct smc_cdc_tx_pend *pend;
++ struct smc_wr_buf *wr_buf;
++ int rc;
++
++ if (atomic_read(&conn->bytes_to_rcv))
++ conn->local_tx_ctrl.conn_state_flags.abnormal_close = 1;
++ else
++ conn->local_tx_ctrl.conn_state_flags.closed_conn = 1;
++
++ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
++ &pend);
++ if (!rc)
++  rc = smc_cdc_msg_send(conn, wr_buf, pend);
++
++ return rc;
++}
++
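The two helpers above differ only in which connection-state flag they raise before pushing a CDC message: smc_tx_close_wr() marks a half-close (sending_done, the SMC counterpart of shutdown(SHUT_WR)), while smc_tx_close() marks the connection cleanly closed, or abnormally closed when unread receive data would be discarded. A compact restatement of that choice (illustrative struct, not kernel code):

    struct close_flags { int sending_done, closed_conn, abnormal_close; };

    static void set_close_flags(struct close_flags *f, int half_close,
                                int unread_bytes)
    {
            if (half_close)
                    f->sending_done = 1;    /* cf. smc_tx_close_wr() */
            else if (unread_bytes)
                    f->abnormal_close = 1;  /* peer data would be lost */
            else
                    f->closed_conn = 1;     /* clean full close */
    }
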
+ /* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit
+ */
+--- a/net/smc/smc_tx.h
++++ b/net/smc/smc_tx.h
+@@ -16,10 +16,21 @@
+
+ #include "smc.h"
+
++static inline int smc_tx_prepared_sends(struct smc_connection *conn)
++{
++ union smc_host_cursor_ovl sent, prep;
++
++ sent.acurs = smc_curs_read(conn->tx_curs_sent.acurs);
++ prep.acurs = smc_curs_read(conn->tx_curs_prep.acurs);
++ return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
++}
++
+ void smc_tx_init(struct smc_sock *);
+ int smc_tx_sendmsg(struct smc_sock *, struct msghdr *, size_t);
+ int smc_tx_sndbuf_nonempty(struct smc_connection *);
+ void smc_tx_sndbuf_nonfull(struct smc_sock *);
+ void smc_tx_consumer_update(struct smc_connection *);
++int smc_tx_close(struct smc_connection *);
++int smc_tx_close_wr(struct smc_connection *);
+
+ #endif /* SMC_TX_H */
+--- a/net/smc/smc_wr.c
++++ b/net/smc/smc_wr.c
+@@ -32,15 +32,6 @@
+ static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
+ static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
+
+-struct smc_wr_tx_pend { /* control data for a pending send request */
+- u64 wr_id; /* work request id sent */
+- smc_wr_tx_handler handler;
+- enum ib_wc_status wc_status; /* CQE status */
+- struct smc_link *link;
+- u32 idx;
+- struct smc_wr_tx_pend_priv priv;
+-};
+-
+ static bool smc_wr_tx_pending_on_link(struct smc_link *link)
+ {
+ return find_first_bit(link->wr_tx_mask, link->wr_tx_cnt)
+@@ -214,6 +205,7 @@ int smc_wr_tx_put_slot(struct smc_link *
+ pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
+ if (pend->idx < link->wr_tx_cnt) {
+ test_and_clear_bit(pend->idx, link->wr_tx_mask);
++ memset(pend, 0, sizeof(*pend));
+ return 1;
+ }
+
+--- a/net/smc/smc_wr.h
++++ b/net/smc/smc_wr.h
+@@ -40,6 +40,15 @@ struct smc_wr_rx_handler {
+ u8 type;
+ };
+
++struct smc_wr_tx_pend { /* control data for a pending send request */
++ u64 wr_id; /* work request id sent */
++ smc_wr_tx_handler handler;
++ enum ib_wc_status wc_status; /* CQE status */
++ struct smc_link *link;
++ u32 idx;
++ struct smc_wr_tx_pend_priv priv;
++};
++
+ /* Only used by RDMA write WRs.
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
+ */
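Moving struct smc_wr_tx_pend into the header is what lets smc_cdc.c iterate the per-link pend array; the enclosing structure is then recovered from the embedded priv member via container_of(), as smc_wr_tx_put_slot() does. A self-contained userspace illustration of that recovery (the macro is re-derived from offsetof(); struct names are stand-ins):

    #include <stddef.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct priv { int dummy; };
    struct pend { unsigned long wr_id; struct priv priv; };

    /* given a pointer to the embedded priv, recover the pend */
    static struct pend *pend_of(struct priv *p)
    {
            return container_of(p, struct pend, priv);
    }
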
diff --git a/series.conf b/series.conf
index 2cfcbbc2a7..dd33662177 100644
--- a/series.conf
+++ b/series.conf
@@ -869,6 +869,21 @@
patches.arch/s390-sles12sp2-00-04-s390-pci-fmb-enhancements-03.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-01.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-02.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-03.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-04.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-05.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-06.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-07.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-08.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-09.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-10.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-11.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-12.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-13.patch
+ patches.arch/s390-sles12sp2-00-05-net-smc-r-14.patch
+
########################################################
# VM/FS patches
########################################################
diff --git a/supported.conf b/supported.conf
index 9fecb5ffcc..b4db387cdd 100644
--- a/supported.conf
+++ b/supported.conf
@@ -2204,6 +2204,7 @@
net/sched/sch_teql
net/sctp/sctp # Support for the SCTP protocol (RFC2960)
net/sctp/sctp_probe
+ net/smc/smc # Shared Memory Communications - RDMA (fate#319593)
net/sunrpc/auth_gss/auth_rpcgss
net/sunrpc/auth_gss/rpcsec_gss_krb5
net/sunrpc/sunrpc