diff --git a/Makefile b/Makefile
index e3c81f470b0da925cb3970d6342a36a25e88bc9e..8d18b901c554095f155698e9e52c6acef8d7dcb0 100644
--- a/Makefile
+++ b/Makefile
@@ -92,7 +92,7 @@ CC_basicprof = $(DEFAULT_CC)
 CXX_basicprof = $(DEFAULT_CXX)
 LD_basicprof = $(DEFAULT_CC)
 LDXX_basicprof = $(DEFAULT_CXX)
-CPPFLAGS_basicprof = -O2 -DGRPC_BASIC_PROFILER
+CPPFLAGS_basicprof = -O2 -DGRPC_BASIC_PROFILER -DGRPC_TIMERS_RDTSC
 LDFLAGS_basicprof =
 DEFINES_basicprof = NDEBUG
 
@@ -2471,7 +2471,7 @@ endif
 
 
 ifeq ($(SYSTEM),MINGW32)
-$(LIBDIR)/$(CONFIG)/grpc.$(SHARED_EXT): $(LIBGRPC_OBJS)  $(ZLIB_DEP)$(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT) $(OPENSSL_DEP)
+$(LIBDIR)/$(CONFIG)/grpc.$(SHARED_EXT): $(LIBGRPC_OBJS)  $(ZLIB_DEP) $(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT) $(OPENSSL_DEP)
 	$(E) "[LD]      Linking $@"
 	$(Q) mkdir -p `dirname $@`
 	$(Q) $(LD) $(LDFLAGS) -L$(LIBDIR)/$(CONFIG) -shared -Wl,--output-def=$(LIBDIR)/$(CONFIG)/grpc.def -Wl,--out-implib=$(LIBDIR)/$(CONFIG)/libgrpc-imp.a -o $(LIBDIR)/$(CONFIG)/grpc.$(SHARED_EXT) $(LIBGRPC_OBJS) $(LDLIBS) $(LDLIBS_SECURE) $(OPENSSL_MERGE_LIBS) -lgpr-imp
@@ -2697,7 +2697,7 @@ endif
 
 
 ifeq ($(SYSTEM),MINGW32)
-$(LIBDIR)/$(CONFIG)/grpc_unsecure.$(SHARED_EXT): $(LIBGRPC_UNSECURE_OBJS)  $(ZLIB_DEP)$(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT)
+$(LIBDIR)/$(CONFIG)/grpc_unsecure.$(SHARED_EXT): $(LIBGRPC_UNSECURE_OBJS)  $(ZLIB_DEP) $(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT)
 	$(E) "[LD]      Linking $@"
 	$(Q) mkdir -p `dirname $@`
 	$(Q) $(LD) $(LDFLAGS) -L$(LIBDIR)/$(CONFIG) -shared -Wl,--output-def=$(LIBDIR)/$(CONFIG)/grpc_unsecure.def -Wl,--out-implib=$(LIBDIR)/$(CONFIG)/libgrpc_unsecure-imp.a -o $(LIBDIR)/$(CONFIG)/grpc_unsecure.$(SHARED_EXT) $(LIBGRPC_UNSECURE_OBJS) $(LDLIBS) -lgpr-imp
@@ -2825,12 +2825,12 @@ endif
 
 
 ifeq ($(SYSTEM),MINGW32)
-$(LIBDIR)/$(CONFIG)/grpc++.$(SHARED_EXT): $(LIBGRPC++_OBJS)  $(ZLIB_DEP)$(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT)$(LIBDIR)/$(CONFIG)/grpc.$(SHARED_EXT) $(OPENSSL_DEP)
+$(LIBDIR)/$(CONFIG)/grpc++.$(SHARED_EXT): $(LIBGRPC++_OBJS)  $(ZLIB_DEP) $(PROTOBUF_DEP) $(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT) $(LIBDIR)/$(CONFIG)/grpc.$(SHARED_EXT) $(OPENSSL_DEP)
 	$(E) "[LD]      Linking $@"
 	$(Q) mkdir -p `dirname $@`
 	$(Q) $(LDXX) $(LDFLAGS) -L$(LIBDIR)/$(CONFIG) -shared -Wl,--output-def=$(LIBDIR)/$(CONFIG)/grpc++.def -Wl,--out-implib=$(LIBDIR)/$(CONFIG)/libgrpc++-imp.a -o $(LIBDIR)/$(CONFIG)/grpc++.$(SHARED_EXT) $(LIBGRPC++_OBJS) $(LDLIBS) $(LDLIBSXX) $(LDLIBS_PROTOBUF) -lgpr-imp -lgrpc-imp
 else
-$(LIBDIR)/$(CONFIG)/libgrpc++.$(SHARED_EXT): $(LIBGRPC++_OBJS)  $(ZLIB_DEP) $(LIBDIR)/$(CONFIG)/libgpr.$(SHARED_EXT) $(LIBDIR)/$(CONFIG)/libgrpc.$(SHARED_EXT) $(OPENSSL_DEP)
+$(LIBDIR)/$(CONFIG)/libgrpc++.$(SHARED_EXT): $(LIBGRPC++_OBJS)  $(ZLIB_DEP) $(PROTOBUF_DEP) $(LIBDIR)/$(CONFIG)/libgpr.$(SHARED_EXT) $(LIBDIR)/$(CONFIG)/libgrpc.$(SHARED_EXT) $(OPENSSL_DEP)
 	$(E) "[LD]      Linking $@"
 	$(Q) mkdir -p `dirname $@`
 ifeq ($(SYSTEM),Darwin)
@@ -3043,12 +3043,12 @@ endif
 
 
 ifeq ($(SYSTEM),MINGW32)
-$(LIBDIR)/$(CONFIG)/grpc++_unsecure.$(SHARED_EXT): $(LIBGRPC++_UNSECURE_OBJS)  $(ZLIB_DEP)$(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT)$(LIBDIR)/$(CONFIG)/grpc_unsecure.$(SHARED_EXT)
+$(LIBDIR)/$(CONFIG)/grpc++_unsecure.$(SHARED_EXT): $(LIBGRPC++_UNSECURE_OBJS)  $(ZLIB_DEP) $(PROTOBUF_DEP) $(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT) $(LIBDIR)/$(CONFIG)/grpc_unsecure.$(SHARED_EXT)
 	$(E) "[LD]      Linking $@"
 	$(Q) mkdir -p `dirname $@`
 	$(Q) $(LDXX) $(LDFLAGS) -L$(LIBDIR)/$(CONFIG) -shared -Wl,--output-def=$(LIBDIR)/$(CONFIG)/grpc++_unsecure.def -Wl,--out-implib=$(LIBDIR)/$(CONFIG)/libgrpc++_unsecure-imp.a -o $(LIBDIR)/$(CONFIG)/grpc++_unsecure.$(SHARED_EXT) $(LIBGRPC++_UNSECURE_OBJS) $(LDLIBS) $(LDLIBSXX) $(LDLIBS_PROTOBUF) -lgpr-imp -lgrpc_unsecure-imp
 else
-$(LIBDIR)/$(CONFIG)/libgrpc++_unsecure.$(SHARED_EXT): $(LIBGRPC++_UNSECURE_OBJS)  $(ZLIB_DEP) $(LIBDIR)/$(CONFIG)/libgpr.$(SHARED_EXT) $(LIBDIR)/$(CONFIG)/libgrpc_unsecure.$(SHARED_EXT)
+$(LIBDIR)/$(CONFIG)/libgrpc++_unsecure.$(SHARED_EXT): $(LIBGRPC++_UNSECURE_OBJS)  $(ZLIB_DEP) $(PROTOBUF_DEP) $(LIBDIR)/$(CONFIG)/libgpr.$(SHARED_EXT) $(LIBDIR)/$(CONFIG)/libgrpc_unsecure.$(SHARED_EXT)
 	$(E) "[LD]      Linking $@"
 	$(Q) mkdir -p `dirname $@`
 ifeq ($(SYSTEM),Darwin)
@@ -3451,7 +3451,7 @@ endif
 
 
 ifeq ($(SYSTEM),MINGW32)
-$(LIBDIR)/$(CONFIG)/grpc_csharp_ext.$(SHARED_EXT): $(LIBGRPC_CSHARP_EXT_OBJS)  $(ZLIB_DEP)$(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT)$(LIBDIR)/$(CONFIG)/grpc.$(SHARED_EXT) $(OPENSSL_DEP)
+$(LIBDIR)/$(CONFIG)/grpc_csharp_ext.$(SHARED_EXT): $(LIBGRPC_CSHARP_EXT_OBJS)  $(ZLIB_DEP) $(LIBDIR)/$(CONFIG)/gpr.$(SHARED_EXT) $(LIBDIR)/$(CONFIG)/grpc.$(SHARED_EXT) $(OPENSSL_DEP)
 	$(E) "[LD]      Linking $@"
 	$(Q) mkdir -p `dirname $@`
 	$(Q) $(LD) $(LDFLAGS) -L$(LIBDIR)/$(CONFIG) -shared -Wl,--output-def=$(LIBDIR)/$(CONFIG)/grpc_csharp_ext.def -Wl,--out-implib=$(LIBDIR)/$(CONFIG)/libgrpc_csharp_ext-imp.a -o $(LIBDIR)/$(CONFIG)/grpc_csharp_ext.$(SHARED_EXT) $(LIBGRPC_CSHARP_EXT_OBJS) $(LDLIBS) -lgpr-imp -lgrpc-imp
diff --git a/src/compiler/cpp_generator.cc b/src/compiler/cpp_generator.cc
index 1324198847b0090168fcf82bee4b4e69a20646af..735e7e58a82194fddf0d8e967c41a971c1e35570 100644
--- a/src/compiler/cpp_generator.cc
+++ b/src/compiler/cpp_generator.cc
@@ -828,9 +828,7 @@ void PrintSourceService(grpc::protobuf::io::Printer *printer,
           "    new ::grpc::RpcMethodHandler< $ns$$Service$::Service, "
           "$Request$, "
           "$Response$>(\n"
-          "        std::function< ::grpc::Status($ns$$Service$::Service*, "
-          "::grpc::ServerContext*, const $Request$*, $Response$*)>("
-          "&$ns$$Service$::Service::$Method$), this),\n"
+          "        std::mem_fn(&$ns$$Service$::Service::$Method$), this),\n"
           "    new $Request$, new $Response$));\n");
     } else if (ClientOnlyStreaming(method)) {
       printer->Print(
@@ -840,10 +838,7 @@ void PrintSourceService(grpc::protobuf::io::Printer *printer,
           "    ::grpc::RpcMethod::CLIENT_STREAMING,\n"
           "    new ::grpc::ClientStreamingHandler< "
           "$ns$$Service$::Service, $Request$, $Response$>(\n"
-          "        std::function< ::grpc::Status($ns$$Service$::Service*, "
-          "::grpc::ServerContext*, "
-          "::grpc::ServerReader< $Request$>*, $Response$*)>("
-          "&$ns$$Service$::Service::$Method$), this),\n"
+          "        std::mem_fn(&$ns$$Service$::Service::$Method$), this),\n"
           "    new $Request$, new $Response$));\n");
     } else if (ServerOnlyStreaming(method)) {
       printer->Print(
@@ -853,10 +848,7 @@ void PrintSourceService(grpc::protobuf::io::Printer *printer,
           "    ::grpc::RpcMethod::SERVER_STREAMING,\n"
           "    new ::grpc::ServerStreamingHandler< "
           "$ns$$Service$::Service, $Request$, $Response$>(\n"
-          "        std::function< ::grpc::Status($ns$$Service$::Service*, "
-          "::grpc::ServerContext*, "
-          "const $Request$*, ::grpc::ServerWriter< $Response$>*)>("
-          "&$ns$$Service$::Service::$Method$), this),\n"
+          "        std::mem_fn(&$ns$$Service$::Service::$Method$), this),\n"
           "    new $Request$, new $Response$));\n");
     } else if (BidiStreaming(method)) {
       printer->Print(
@@ -866,10 +858,7 @@ void PrintSourceService(grpc::protobuf::io::Printer *printer,
           "    ::grpc::RpcMethod::BIDI_STREAMING,\n"
           "    new ::grpc::BidiStreamingHandler< "
           "$ns$$Service$::Service, $Request$, $Response$>(\n"
-          "        std::function< ::grpc::Status($ns$$Service$::Service*, "
-          "::grpc::ServerContext*, "
-          "::grpc::ServerReaderWriter< $Response$, $Request$>*)>("
-          "&$ns$$Service$::Service::$Method$), this),\n"
+          "        std::mem_fn(&$ns$$Service$::Service::$Method$), this),\n"
           "    new $Request$, new $Response$));\n");
     }
   }
diff --git a/src/core/iomgr/endpoint_pair_windows.c b/src/core/iomgr/endpoint_pair_windows.c
index d78b6ea957e35b0441a014e62ca9f716b6cd522c..58960b6028ae559b8e413d99276d2a554e538811 100644
--- a/src/core/iomgr/endpoint_pair_windows.c
+++ b/src/core/iomgr/endpoint_pair_windows.c
@@ -50,7 +50,7 @@ static void create_sockets(SOCKET sv[2]) {
   SOCKET lst_sock = INVALID_SOCKET;
   SOCKET cli_sock = INVALID_SOCKET;
   SOCKADDR_IN addr;
-  int addr_len;
+  int addr_len = sizeof(addr);
 
   lst_sock = WSASocket(AF_INET, SOCK_STREAM, IPPROTO_TCP, NULL, 0, WSA_FLAG_OVERLAPPED);
   GPR_ASSERT(lst_sock != INVALID_SOCKET);
diff --git a/src/core/iomgr/iocp_windows.c b/src/core/iomgr/iocp_windows.c
index 7968729353a1ec1185b3fd61454ecb3629b8a841..1cdf3da0d6258338449a32477ec2ecd610adb67f 100644
--- a/src/core/iomgr/iocp_windows.c
+++ b/src/core/iomgr/iocp_windows.c
@@ -172,9 +172,15 @@ void grpc_iocp_add_socket(grpc_winsocket *socket) {
 }
 
 void grpc_iocp_socket_orphan(grpc_winsocket *socket) {
+  GPR_ASSERT(!socket->orphan);
   gpr_atm_full_fetch_add(&g_orphans, 1);
+  socket->orphan = 1;
 }
 
+/* Calling notify_on_read or write means either of two things:
+   -) The IOCP already completed in the background, and we need to call
+   the callback now.
+   -) The IOCP hasn't completed yet, and we're queuing it for later. */
 static void socket_notify_on_iocp(grpc_winsocket *socket,
                                   void(*cb)(void *, int), void *opaque,
                                   grpc_winsocket_callback_info *info) {
diff --git a/src/core/iomgr/iomgr_windows.c b/src/core/iomgr/iomgr_windows.c
index f130ab9a078ead0ef81fd3ef68b2ef53d2b4eee0..74cd5a829b28d4be850cbc50fadba9cb333d5d18 100644
--- a/src/core/iomgr/iomgr_windows.c
+++ b/src/core/iomgr/iomgr_windows.c
@@ -43,6 +43,10 @@
 #include "src/core/iomgr/iocp_windows.h"
 #include "src/core/iomgr/iomgr.h"
 
+/* Windows' io manager is going to be fully designed using IO completion
+   ports. All of what we're doing here is basically make sure that
+   Windows sockets are initialized in and out. */
+
 static void winsock_init(void) {
   WSADATA wsaData;
   int status = WSAStartup(MAKEWORD(2, 0), &wsaData);
diff --git a/src/core/iomgr/pollset_kick_windows.h b/src/core/iomgr/pollset_kick_windows.h
index 3836aa00820a046d966a073c2bf2636dce68b3ef..c675c119aba0185026e2bb72259658ee73110222 100644
--- a/src/core/iomgr/pollset_kick_windows.h
+++ b/src/core/iomgr/pollset_kick_windows.h
@@ -36,6 +36,9 @@
 
 #include <grpc/support/sync.h>
 
+/* There isn't really any such thing as a pollset under Windows, due to the
+   nature of the IO completion ports. */
+
 struct grpc_kick_fd_info;
 
 typedef struct grpc_pollset_kick_state {
diff --git a/src/core/iomgr/pollset_windows.c b/src/core/iomgr/pollset_windows.c
index bea67116116feccd94a9101ea6fc06f66ccf2ea8..5af0685f9d97818dee8255028af3db176155174f 100644
--- a/src/core/iomgr/pollset_windows.c
+++ b/src/core/iomgr/pollset_windows.c
@@ -41,6 +41,11 @@
 #include "src/core/iomgr/iomgr_internal.h"
 #include "src/core/iomgr/pollset_windows.h"
 
+/* There isn't really any such thing as a pollset under Windows, due to the
+   nature of the IO completion ports. We're still going to provide a minimal
+   set of features for the sake of the rest of grpc. But grpc_pollset_work
+   won't actually do any polling, and return as quickly as possible. */
+
 void grpc_pollset_init(grpc_pollset *pollset) {
   gpr_mu_init(&pollset->mu);
   gpr_cv_init(&pollset->cv);
diff --git a/src/core/iomgr/pollset_windows.h b/src/core/iomgr/pollset_windows.h
index 266175abfb91816954dd30f87c2fd88eac67f119..e1115bac4ff0fc74425c451f59caa8f07f761aa3 100644
--- a/src/core/iomgr/pollset_windows.h
+++ b/src/core/iomgr/pollset_windows.h
@@ -40,10 +40,10 @@
 #include "src/core/iomgr/pollset_kick.h"
 #include "src/core/iomgr/socket_windows.h"
 
-/* forward declare only in this file to avoid leaking impl details via
-   pollset.h; real users of grpc_fd should always include 'fd_posix.h' and not
-   use the struct tag */
-struct grpc_fd;
+/* There isn't really any such thing as a pollset under Windows, due to the
+   nature of the IO completion ports. A Windows "pollset" is merely a mutex
+   and a condition variable, as this is the minimal set of features we need
+   implemented for the rest of grpc. But we won't use them directly. */
 
 typedef struct grpc_pollset {
   gpr_mu mu;
diff --git a/src/core/iomgr/socket_windows.c b/src/core/iomgr/socket_windows.c
index 91268c04e6dba7cd4e05db8aaefef42ced0c87e8..9306310d435ba5f5b6e5aaf2f2c2415a4f8aca96 100644
--- a/src/core/iomgr/socket_windows.c
+++ b/src/core/iomgr/socket_windows.c
@@ -32,17 +32,18 @@
  */
 
 #include <grpc/support/port_platform.h>
-#include <grpc/support/alloc.h>
-#include <grpc/support/log.h>
 
 #ifdef GPR_WINSOCK_SOCKET
 
+#include <grpc/support/alloc.h>
+#include <grpc/support/log.h>
+
 #include "src/core/iomgr/iocp_windows.h"
 #include "src/core/iomgr/iomgr.h"
 #include "src/core/iomgr/iomgr_internal.h"
-#include "src/core/iomgr/socket_windows.h"
 #include "src/core/iomgr/pollset.h"
 #include "src/core/iomgr/pollset_windows.h"
+#include "src/core/iomgr/socket_windows.h"
 
 grpc_winsocket *grpc_winsocket_create(SOCKET socket) {
   grpc_winsocket *r = gpr_malloc(sizeof(grpc_winsocket));
@@ -54,26 +55,44 @@ grpc_winsocket *grpc_winsocket_create(SOCKET socket) {
   return r;
 }
 
-static void shutdown_op(grpc_winsocket_callback_info *info) {
-  if (!info->cb) return;
-  grpc_iomgr_add_delayed_callback(info->cb, info->opaque, 0);
-}
-
+/* Schedule a shutdown of the socket operations. Will call the pending
+   operations to abort them. We need to do that this way because of the
+   various callsites of that function, which happens to be in various
+   mutex hold states, and that'd be unsafe to call them directly. */
 void grpc_winsocket_shutdown(grpc_winsocket *socket) {
-  shutdown_op(&socket->read_info);
-  shutdown_op(&socket->write_info);
+  gpr_mu_lock(&socket->state_mu);
+  if (socket->read_info.cb) {
+    grpc_iomgr_add_delayed_callback(socket->read_info.cb,
+                                    socket->read_info.opaque, 0);
+  }
+  if (socket->write_info.cb) {
+    grpc_iomgr_add_delayed_callback(socket->write_info.cb,
+                                    socket->write_info.opaque, 0);
+  }
+  gpr_mu_unlock(&socket->state_mu);
 }
 
-void grpc_winsocket_orphan(grpc_winsocket *socket) {
-  grpc_iocp_socket_orphan(socket);
-  socket->orphan = 1;
+/* Abandons a socket. Either we're going to queue it up for garbage collecting
+   from the IO Completion Port thread, or destroy it immediately. Note that this
+   mechanisms assumes that we're either always waiting for an operation, or we
+   explicitely know that we don't. If there is a future case where we can have
+   an "idle" socket which is neither trying to read or write, we'd start leaking
+   both memory and sockets. */
+void grpc_winsocket_orphan(grpc_winsocket *winsocket) {
+  SOCKET socket = winsocket->socket;
+  if (!winsocket->closed_early) {
+    grpc_iocp_socket_orphan(winsocket);
+  }
+  if (winsocket->closed_early) {
+    grpc_winsocket_destroy(winsocket);
+  }
+  closesocket(socket);
   grpc_iomgr_unref();
-  closesocket(socket->socket);
 }
 
-void grpc_winsocket_destroy(grpc_winsocket *socket) {
-  gpr_mu_destroy(&socket->state_mu);
-  gpr_free(socket);
+void grpc_winsocket_destroy(grpc_winsocket *winsocket) {
+  gpr_mu_destroy(&winsocket->state_mu);
+  gpr_free(winsocket);
 }
 
 #endif  /* GPR_WINSOCK_SOCKET */
diff --git a/src/core/iomgr/socket_windows.h b/src/core/iomgr/socket_windows.h
index cbae91692cbba76aba65bcb0544011f96633a209..6e778a776a3bbc43003e3b199a0bc0ec78e2e8fd 100644
--- a/src/core/iomgr/socket_windows.h
+++ b/src/core/iomgr/socket_windows.h
@@ -39,21 +39,43 @@
 #include <grpc/support/sync.h>
 #include <grpc/support/atm.h>
 
+/* This holds the data for an outstanding read or write on a socket.
+   The mutex to protect the concurrent access to that data is the one
+   inside the winsocket wrapper. */
 typedef struct grpc_winsocket_callback_info {
   /* This is supposed to be a WSAOVERLAPPED, but in order to get that
-   * definition, we need to include ws2tcpip.h, which needs to be included
-   * from the top, otherwise it'll clash with a previous inclusion of
-   * windows.h that in turns includes winsock.h. If anyone knows a way
-   * to do it properly, feel free to send a patch.
-   */
+     definition, we need to include ws2tcpip.h, which needs to be included
+     from the top, otherwise it'll clash with a previous inclusion of
+     windows.h that in turns includes winsock.h. If anyone knows a way
+     to do it properly, feel free to send a patch. */
   OVERLAPPED overlapped;
+  /* The callback information for the pending operation. May be empty if the
+     caller hasn't registered a callback yet. */
   void(*cb)(void *opaque, int success);
   void *opaque;
+  /* A boolean to describe if the IO Completion Port got a notification for
+     that operation. This will happen if the operation completed before the
+     called had time to register a callback. We could avoid that behavior
+     altogether by forcing the caller to always register its callback before
+     proceeding queue an operation, but it is frequent for an IO Completion
+     Port to trigger quickly. This way we avoid a context switch for calling
+     the callback. We also simplify the read / write operations to avoid having
+     to hold a mutex for a long amount of time. */
   int has_pending_iocp;
+  /* The results of the overlapped operation. */
   DWORD bytes_transfered;
   int wsa_error;
 } grpc_winsocket_callback_info;
 
+/* This is a wrapper to a Windows socket. A socket can have one outstanding
+   read, and one outstanding write. Doing an asynchronous accept means waiting
+   for a read operation. Doing an asynchronous connect means waiting for a
+   write operation. These are completely abitrary ties between the operation
+   and the kind of event, because we can have one overlapped per pending
+   operation, whichever its nature is. So we could have more dedicated pending
+   operation callbacks for connect and listen. But given the scope of listen
+   and accept, we don't need to go to that extent and waste memory. Also, this
+   is closer to what happens in posix world. */
 typedef struct grpc_winsocket {
   SOCKET socket;
 
@@ -62,16 +84,35 @@ typedef struct grpc_winsocket {
 
   gpr_mu state_mu;
 
+  /* You can't add the same socket twice to the same IO Completion Port.
+     This prevents that. */
   int added_to_iocp;
+  /* A boolean to indicate that the caller has abandonned that socket, but
+     there is a pending operation that the IO Completion Port will have to
+     wait for. The socket will be collected at that time. */
   int orphan;
+  /* A boolean to indicate that the socket was already closed somehow, and
+     that no operation is going to be pending. Trying to abandon a socket in
+     that state won't result in an orphan, but will instead be destroyed
+     without further delay. We could avoid that boolean by adding one into
+     grpc_winsocket_callback_info describing that the operation is pending,
+     but that 1) waste memory more and 2) obfuscate the intent a bit more. */
+  int closed_early;
 } grpc_winsocket;
 
-/* Create a wrapped windows handle.
-This takes ownership of closing it. */
+/* Create a wrapped windows handle. This takes ownership of it, meaning that
+   it will be responsible for closing it. */
 grpc_winsocket *grpc_winsocket_create(SOCKET socket);
 
+/* Initiate an asynchronous shutdown of the socket. Will call off any pending
+   operation to cancel them. */
 void grpc_winsocket_shutdown(grpc_winsocket *socket);
+
+/* Abandon a socket. */
 void grpc_winsocket_orphan(grpc_winsocket *socket);
+
+/* Destroy a socket. Should only be called by the IO Completion Port thread,
+   or by grpc_winsocket_orphan if there's no pending operation. */
 void grpc_winsocket_destroy(grpc_winsocket *socket);
 
 #endif  /* GRPC_INTERNAL_CORE_IOMGR_SOCKET_WINDOWS_H */
diff --git a/src/core/iomgr/tcp_client_windows.c b/src/core/iomgr/tcp_client_windows.c
index 00c8da601b4e42f907403a6309b033cef3d0bc2b..653c0c65c5157c08208bc1292f75b0966c855ca4 100644
--- a/src/core/iomgr/tcp_client_windows.c
+++ b/src/core/iomgr/tcp_client_windows.c
@@ -59,6 +59,7 @@ typedef struct {
   gpr_timespec deadline;
   grpc_alarm alarm;
   int refs;
+  int aborted;
 } async_connect;
 
 static void async_connect_cleanup(async_connect *ac) {
@@ -70,26 +71,31 @@ static void async_connect_cleanup(async_connect *ac) {
   }
 }
 
-static void on_alarm(void *acp, int success) {
+static void on_alarm(void *acp, int occured) {
   async_connect *ac = acp;
   gpr_mu_lock(&ac->mu);
-  if (ac->socket != NULL && success) {
+  /* If the alarm didn't occor, it got cancelled. */
+  if (ac->socket != NULL && occured) {
     grpc_winsocket_shutdown(ac->socket);
   }
   async_connect_cleanup(ac);
 }
 
-static void on_connect(void *acp, int success) {
+static void on_connect(void *acp, int from_iocp) {
   async_connect *ac = acp;
   SOCKET sock = ac->socket->socket;
   grpc_endpoint *ep = NULL;
   grpc_winsocket_callback_info *info = &ac->socket->write_info;
   void(*cb)(void *arg, grpc_endpoint *tcp) = ac->cb;
   void *cb_arg = ac->cb_arg;
+  int aborted;
 
   grpc_alarm_cancel(&ac->alarm);
 
-  if (success) {
+  gpr_mu_lock(&ac->mu);
+  aborted = ac->aborted;
+
+  if (from_iocp) {
     DWORD transfered_bytes = 0;
     DWORD flags;
     BOOL wsa_success = WSAGetOverlappedResult(sock, &info->overlapped,
@@ -107,20 +113,40 @@ static void on_connect(void *acp, int success) {
     }
   } else {
     gpr_log(GPR_ERROR, "on_connect is shutting down");
-    goto finish;
+    /* If the connection timeouts, we will still get a notification from
+       the IOCP whatever happens. So we're just going to flag that connection
+       as being in the process of being aborted, and wait for the IOCP. We
+       can't just orphan the socket now, because the IOCP might already have
+       gotten a successful connection, which is our worst-case scenario.
+       We need to call our callback now to respect the deadline. */
+    ac->aborted = 1;
+    gpr_mu_unlock(&ac->mu);
+    cb(cb_arg, NULL);
+    return;
   }
 
   abort();
 
 finish:
-  gpr_mu_lock(&ac->mu);
-  if (!ep) {
+  /* If we don't have an endpoint, it means the connection failed,
+     so it doesn't matter if it aborted or failed. We need to orphan
+     that socket. */
+  if (!ep || aborted) {
+    /* If the connection failed, it means we won't get an IOCP notification,
+       so let's flag it as already closed. But if the connection was aborted,
+       while we still got an endpoint, we have to wait for the IOCP to collect
+       that socket. So let's properly flag that. */
+    ac->socket->closed_early = !ep;
     grpc_winsocket_orphan(ac->socket);
   }
   async_connect_cleanup(ac);
-  cb(cb_arg, ep);
+  /* If the connection was aborted, the callback was already called when
+     the deadline was met. */
+  if (!aborted) cb(cb_arg, ep);
 }
 
+/* Tries to issue one async connection, then schedules both an IOCP
+   notification request for the connection, and one timeout alert. */
 void grpc_tcp_client_connect(void(*cb)(void *arg, grpc_endpoint *tcp),
                              void *arg, const struct sockaddr *addr,
                              int addr_len, gpr_timespec deadline) {
@@ -156,6 +182,8 @@ void grpc_tcp_client_connect(void(*cb)(void *arg, grpc_endpoint *tcp),
     goto failure;
   }
 
+  /* Grab the function pointer for ConnectEx for that specific socket.
+     It may change depending on the interface. */
   status = WSAIoctl(sock, SIO_GET_EXTENSION_FUNCTION_POINTER,
                     &guid, sizeof(guid), &ConnectEx, sizeof(ConnectEx),
                     &ioctl_num_bytes, NULL, NULL);
@@ -178,6 +206,8 @@ void grpc_tcp_client_connect(void(*cb)(void *arg, grpc_endpoint *tcp),
   info = &socket->write_info;
   success = ConnectEx(sock, addr, addr_len, NULL, 0, NULL, &info->overlapped);
 
+  /* It wouldn't be unusual to get a success immediately. But we'll still get
+     an IOCP notification, so let's ignore it. */
   if (!success) {
     int error = WSAGetLastError();
     if (error != ERROR_IO_PENDING) {
@@ -192,6 +222,7 @@ void grpc_tcp_client_connect(void(*cb)(void *arg, grpc_endpoint *tcp),
   ac->socket = socket;
   gpr_mu_init(&ac->mu);
   ac->refs = 2;
+  ac->aborted = 0;
 
   grpc_alarm_init(&ac->alarm, deadline, on_alarm, ac, gpr_now());
   grpc_socket_notify_on_write(socket, on_connect, ac);
@@ -202,6 +233,7 @@ failure:
   gpr_log(GPR_ERROR, message, utf8_message);
   gpr_free(utf8_message);
   if (socket) {
+    socket->closed_early = 1;
     grpc_winsocket_orphan(socket);
   } else if (sock != INVALID_SOCKET) {
     closesocket(sock);
diff --git a/src/core/iomgr/tcp_server_windows.c b/src/core/iomgr/tcp_server_windows.c
index fe92846a71622739523f151b4fd27b5c350d9157..c6137e1e1d635fd653bcd1e52516126285a0d599 100644
--- a/src/core/iomgr/tcp_server_windows.c
+++ b/src/core/iomgr/tcp_server_windows.c
@@ -55,11 +55,17 @@
 
 /* one listening port */
 typedef struct server_port {
-  gpr_uint8 addresses[sizeof(struct sockaddr_in6) * 2 + 32];
+  /* This seemingly magic number comes from AcceptEx's documentation. each
+     address buffer needs to have at least 16 more bytes at their end. */
+  gpr_uint8 addresses[(sizeof(struct sockaddr_in6) + 16) * 2];
+  /* This will hold the socket for the next accept. */
   SOCKET new_socket;
+  /* The listener winsocked. */
   grpc_winsocket *socket;
   grpc_tcp_server *server;
+  /* The cached AcceptEx for that port. */
   LPFN_ACCEPTEX AcceptEx;
+  int shutting_down;
 } server_port;
 
 /* the overall server */
@@ -79,6 +85,8 @@ struct grpc_tcp_server {
   size_t port_capacity;
 };
 
+/* Public function. Allocates the proper data structures to hold a
+   grpc_tcp_server. */
 grpc_tcp_server *grpc_tcp_server_create(void) {
   grpc_tcp_server *s = gpr_malloc(sizeof(grpc_tcp_server));
   gpr_mu_init(&s->mu);
@@ -92,24 +100,30 @@ grpc_tcp_server *grpc_tcp_server_create(void) {
   return s;
 }
 
+/* Public function. Stops and destroys a grpc_tcp_server. */
 void grpc_tcp_server_destroy(grpc_tcp_server *s,
                              void (*shutdown_done)(void *shutdown_done_arg),
                              void *shutdown_done_arg) {
   size_t i;
   gpr_mu_lock(&s->mu);
-  /* shutdown all fd's */
+  /* First, shutdown all fd's. This will queue abortion calls for all
+     of the pending accepts. */
   for (i = 0; i < s->nports; i++) {
-    grpc_winsocket_shutdown(s->ports[i].socket);
+    server_port *sp = &s->ports[i];
+    grpc_winsocket_shutdown(sp->socket);
   }
-  /* wait while that happens */
+  /* This happens asynchronously. Wait while that happens. */
   while (s->active_ports) {
     gpr_cv_wait(&s->cv, &s->mu, gpr_inf_future);
   }
   gpr_mu_unlock(&s->mu);
 
-  /* delete ALL the things */
+  /* Now that the accepts have been aborted, we can destroy the sockets.
+     The IOCP won't get notified on these, so we can flag them as already
+     closed by the system. */
   for (i = 0; i < s->nports; i++) {
     server_port *sp = &s->ports[i];
+    sp->socket->closed_early = 1;
     grpc_winsocket_orphan(sp->socket);
   }
   gpr_free(s->ports);
@@ -120,7 +134,7 @@ void grpc_tcp_server_destroy(grpc_tcp_server *s,
   }
 }
 
-/* Prepare a recently-created socket for listening. */
+/* Prepare (bind) a recently-created socket for listening. */
 static int prepare_socket(SOCKET sock, const struct sockaddr *addr,
                           int addr_len) {
   struct sockaddr_storage sockname_temp;
@@ -168,8 +182,11 @@ error:
   return -1;
 }
 
-static void on_accept(void *arg, int success);
+/* start_accept will reference that for the IOCP notification request. */
+static void on_accept(void *arg, int from_iocp);
 
+/* In order to do an async accept, we need to create a socket first which
+   will be the one assigned to the new incoming connection. */
 static void start_accept(server_port *port) {
   SOCKET sock = INVALID_SOCKET;
   char *message;
@@ -191,12 +208,13 @@ static void start_accept(server_port *port) {
     goto failure;
   }
 
-  /* TODO(jtattermusch): probably a race here, we regularly get use-after-free on server shutdown */
-  GPR_ASSERT(port->socket != (grpc_winsocket*)0xfeeefeee);
+  /* Start the "accept" asynchronously. */
   success = port->AcceptEx(port->socket->socket, sock, port->addresses, 0,
                            addrlen, addrlen, &bytes_received,
                            &port->socket->read_info.overlapped);
 
+  /* It is possible to get an accept immediately without delay. However, we
+     will still get an IOCP notification for it. So let's just ignore it. */
   if (!success) {
     int error = WSAGetLastError();
     if (error != ERROR_IO_PENDING) {
@@ -205,6 +223,8 @@ static void start_accept(server_port *port) {
     }
   }
 
+  /* We're ready to do the accept. Calling grpc_socket_notify_on_read may
+     immediately process an accept that happened in the meantime. */
   port->new_socket = sock;
   grpc_socket_notify_on_read(port->socket, on_accept, port);
   return;
@@ -216,14 +236,30 @@ failure:
   if (sock != INVALID_SOCKET) closesocket(sock);
 }
 
-/* event manager callback when reads are ready */
-static void on_accept(void *arg, int success) {
+/* Event manager callback when reads are ready. */
+static void on_accept(void *arg, int from_iocp) {
   server_port *sp = arg;
   SOCKET sock = sp->new_socket;
   grpc_winsocket_callback_info *info = &sp->socket->read_info;
   grpc_endpoint *ep = NULL;
 
-  if (success) {
+  /* The shutdown sequence is done in two parts. This is the second
+     part here, acknowledging the IOCP notification, and doing nothing
+     else, especially not queuing a new accept. */
+  if (sp->shutting_down) {
+    GPR_ASSERT(from_iocp);
+    sp->shutting_down = 0;
+    gpr_mu_lock(&sp->server->mu);
+    if (0 == --sp->server->active_ports) {
+      gpr_cv_broadcast(&sp->server->cv);
+    }
+    gpr_mu_unlock(&sp->server->mu);
+    return;
+  }
+
+  if (from_iocp) {
+    /* The IOCP notified us of a completed operation. Let's grab the results,
+       and act accordingly. */
     DWORD transfered_bytes = 0;
     DWORD flags;
     BOOL wsa_success = WSAGetOverlappedResult(sock, &info->overlapped,
@@ -237,18 +273,31 @@ static void on_accept(void *arg, int success) {
       ep = grpc_tcp_create(grpc_winsocket_create(sock));
     }
   } else {
-    closesocket(sock);
-    gpr_mu_lock(&sp->server->mu);
-    if (0 == --sp->server->active_ports) {
-      gpr_cv_broadcast(&sp->server->cv);
+    /* If we're not notified from the IOCP, it means we are asked to shutdown.
+       This will initiate that shutdown. Calling closesocket will trigger an
+       IOCP notification, that will call this function a second time, from
+       the IOCP thread. Of course, this only works if the socket was, in fact,
+       listening. If that's not the case, we'd wait indefinitely. That's a bit
+       of a degenerate case, but it can happen if you create a server, but
+       don't start it. So let's support that by recursing once. */
+    sp->shutting_down = 1;
+    sp->new_socket = INVALID_SOCKET;
+    if (sock != INVALID_SOCKET) {
+      closesocket(sock);
+    } else {
+      on_accept(sp, 1);
     }
-    gpr_mu_unlock(&sp->server->mu);
+    return;
   }
 
+  /* The only time we should call our callback, is where we successfully
+     managed to accept a connection, and created an endpoint. */
   if (ep) sp->server->cb(sp->server->cb_arg, ep);
-  if (success) {
-    start_accept(sp);
-  }
+  /* As we were notified from the IOCP of one and exactly one accept,
+      the former socked we created has now either been destroy or assigned
+      to the new connection. We need to create a new one for the next
+      connection. */
+  start_accept(sp);
 }
 
 static int add_socket_to_server(grpc_tcp_server *s, SOCKET sock,
@@ -262,6 +311,8 @@ static int add_socket_to_server(grpc_tcp_server *s, SOCKET sock,
 
   if (sock == INVALID_SOCKET) return -1;
 
+  /* We need to grab the AcceptEx pointer for that port, as it may be
+     interface-dependent. We'll cache it to avoid doing that again. */
   status =
       WSAIoctl(sock, SIO_GET_EXTENSION_FUNCTION_POINTER, &guid, sizeof(guid),
                &AcceptEx, sizeof(AcceptEx), &ioctl_num_bytes, NULL, NULL);
@@ -286,7 +337,9 @@ static int add_socket_to_server(grpc_tcp_server *s, SOCKET sock,
     sp = &s->ports[s->nports++];
     sp->server = s;
     sp->socket = grpc_winsocket_create(sock);
+    sp->shutting_down = 0;
     sp->AcceptEx = AcceptEx;
+    sp->new_socket = INVALID_SOCKET;
     GPR_ASSERT(sp->socket);
     gpr_mu_unlock(&s->mu);
   }
diff --git a/src/core/iomgr/tcp_windows.c b/src/core/iomgr/tcp_windows.c
index 940cd5bcde17c1ea2c0f0e3a069111173eac483e..c8483bd891cbe73aac0a1faa9e0fe6fe04413b51 100644
--- a/src/core/iomgr/tcp_windows.c
+++ b/src/core/iomgr/tcp_windows.c
@@ -76,8 +76,11 @@ int grpc_tcp_prepare_socket(SOCKET sock) {
 }
 
 typedef struct grpc_tcp {
+  /* This is our C++ class derivation emulation. */
   grpc_endpoint base;
+  /* The one socket this endpoint is using. */
   grpc_winsocket *socket;
+  /* Refcounting how many operations are in progress. */
   gpr_refcount refcount;
 
   grpc_endpoint_read_cb read_cb;
@@ -90,6 +93,10 @@ typedef struct grpc_tcp {
   gpr_slice_buffer write_slices;
   int outstanding_write;
 
+  /* The IO Completion Port runs from another thread. We need some mechanism
+     to protect ourselves when requesting a shutdown. */
+  gpr_mu mu;
+  int shutting_down;
 } grpc_tcp;
 
 static void tcp_ref(grpc_tcp *tcp) {
@@ -100,11 +107,13 @@ static void tcp_unref(grpc_tcp *tcp) {
   if (gpr_unref(&tcp->refcount)) {
     gpr_slice_buffer_destroy(&tcp->write_slices);
     grpc_winsocket_orphan(tcp->socket);
+    gpr_mu_destroy(&tcp->mu);
     gpr_free(tcp);
   }
 }
 
-static void on_read(void *tcpp, int success) {
+/* Asynchronous callback from the IOCP, or the background thread. */
+static void on_read(void *tcpp, int from_iocp) {
   grpc_tcp *tcp = (grpc_tcp *) tcpp;
   grpc_winsocket *socket = tcp->socket;
   gpr_slice sub;
@@ -114,22 +123,32 @@ static void on_read(void *tcpp, int success) {
   grpc_endpoint_read_cb cb = tcp->read_cb;
   grpc_winsocket_callback_info *info = &socket->read_info;
   void *opaque = tcp->read_user_data;
+  int do_abort = 0;
+
+  gpr_mu_lock(&tcp->mu);
+  if (!from_iocp || tcp->shutting_down) {
+    /* If we are here with from_iocp set to true, it means we got raced to
+    shutting down the endpoint. No actual abort callback will happen
+    though, so we're going to do it from here. */
+    do_abort = 1;
+  }
+  gpr_mu_unlock(&tcp->mu);
 
-  GPR_ASSERT(tcp->outstanding_read);
-
-  if (!success) {
+  if (do_abort) {
+    if (from_iocp) gpr_slice_unref(tcp->read_slice);
     tcp_unref(tcp);
     cb(opaque, NULL, 0, GRPC_ENDPOINT_CB_SHUTDOWN);
     return;
   }
 
-  tcp->outstanding_read = 0;
+  GPR_ASSERT(tcp->outstanding_read);
 
   if (socket->read_info.wsa_error != 0) {
     char *utf8_message = gpr_format_message(info->wsa_error);
     gpr_log(GPR_ERROR, "ReadFile overlapped error: %s", utf8_message);
     gpr_free(utf8_message);
     status = GRPC_ENDPOINT_CB_ERROR;
+    socket->closed_early = 1;
   } else {
     if (info->bytes_transfered != 0) {
       sub = gpr_slice_sub(tcp->read_slice, 0, info->bytes_transfered);
@@ -141,6 +160,9 @@ static void on_read(void *tcpp, int success) {
       status = GRPC_ENDPOINT_CB_EOF;
     }
   }
+
+  tcp->outstanding_read = 0;
+
   tcp_unref(tcp);
   cb(opaque, slice, nslices, status);
 }
@@ -157,6 +179,7 @@ static void win_notify_on_read(grpc_endpoint *ep,
   WSABUF buffer;
 
   GPR_ASSERT(!tcp->outstanding_read);
+  GPR_ASSERT(!tcp->shutting_down);
   tcp_ref(tcp);
   tcp->outstanding_read = 1;
   tcp->read_cb = cb;
@@ -167,10 +190,12 @@ static void win_notify_on_read(grpc_endpoint *ep,
   buffer.len = GPR_SLICE_LENGTH(tcp->read_slice);
   buffer.buf = (char *)GPR_SLICE_START_PTR(tcp->read_slice);
 
+  /* First let's try a synchronous, non-blocking read. */
   status = WSARecv(tcp->socket->socket, &buffer, 1, &bytes_read, &flags,
                    NULL, NULL);
   info->wsa_error = status == 0 ? 0 : WSAGetLastError();
 
+  /* Did we get data immediately ? Yay. */
   if (info->wsa_error != WSAEWOULDBLOCK) {
     info->bytes_transfered = bytes_read;
     /* This might heavily recurse. */
@@ -178,6 +203,7 @@ static void win_notify_on_read(grpc_endpoint *ep,
     return;
   }
 
+  /* Otherwise, let's retry, by queuing a read. */
   memset(&tcp->socket->read_info.overlapped, 0, sizeof(OVERLAPPED));
   status = WSARecv(tcp->socket->socket, &buffer, 1, &bytes_read, &flags,
                    &info->overlapped, NULL);
@@ -191,30 +217,53 @@ static void win_notify_on_read(grpc_endpoint *ep,
 
   if (error != WSA_IO_PENDING) {
     char *utf8_message = gpr_format_message(WSAGetLastError());
-    __debugbreak();
-    gpr_log(GPR_ERROR, "WSARecv error: %s", utf8_message);
+    gpr_log(GPR_ERROR, "WSARecv error: %s - this means we're going to leak.",
+            utf8_message);
     gpr_free(utf8_message);
-    /* would the IO completion port be called anyway... ? Let's assume not. */
+    /* I'm pretty sure this is a very bad situation there. Hence the log.
+       What will happen now is that the socket will neither wait for read
+       or write, unless the caller retry, which is unlikely, but I am not
+       sure if that's guaranteed. And there might also be a write pending.
+       This means that the future orphanage of that socket will be in limbo,
+       and we're going to leak it. I have no idea what could cause this
+       specific case however, aside from a parameter error from our call.
+       Normal read errors would actually happen during the overlapped
+       operation, which is the supported way to go for that. */
     tcp->outstanding_read = 0;
     tcp_unref(tcp);
     cb(arg, NULL, 0, GRPC_ENDPOINT_CB_ERROR);
+    /* Per the comment above, I'm going to treat that case as a hard failure
+       for now, and leave the option to catch that and debug. */
+    __debugbreak();
     return;
   }
 
   grpc_socket_notify_on_read(tcp->socket, on_read, tcp);
 }
 
-static void on_write(void *tcpp, int success) {
+/* Asynchronous callback from the IOCP, or the background thread. */
+static void on_write(void *tcpp, int from_iocp) {
   grpc_tcp *tcp = (grpc_tcp *) tcpp;
   grpc_winsocket *handle = tcp->socket;
   grpc_winsocket_callback_info *info = &handle->write_info;
   grpc_endpoint_cb_status status = GRPC_ENDPOINT_CB_OK;
   grpc_endpoint_write_cb cb = tcp->write_cb;
   void *opaque = tcp->write_user_data;
+  int do_abort = 0;
+
+  gpr_mu_lock(&tcp->mu);
+  if (!from_iocp || tcp->shutting_down) {
+    /* If we are here with from_iocp set to true, it means we got raced to
+        shutting down the endpoint. No actual abort callback will happen
+        though, so we're going to do it from here. */
+    do_abort = 1;
+  }
+  gpr_mu_unlock(&tcp->mu);
 
   GPR_ASSERT(tcp->outstanding_write);
 
-  if (!success) {
+  if (do_abort) {
+    if (from_iocp) gpr_slice_buffer_reset_and_unref(&tcp->write_slices);
     tcp_unref(tcp);
     cb(opaque, GRPC_ENDPOINT_CB_SHUTDOWN);
     return;
@@ -225,6 +274,7 @@ static void on_write(void *tcpp, int success) {
     gpr_log(GPR_ERROR, "WSASend overlapped error: %s", utf8_message);
     gpr_free(utf8_message);
     status = GRPC_ENDPOINT_CB_ERROR;
+    tcp->socket->closed_early = 1;
   } else {
     GPR_ASSERT(info->bytes_transfered == tcp->write_slices.length);
   }
@@ -236,6 +286,7 @@ static void on_write(void *tcpp, int success) {
   cb(opaque, status);
 }
 
+/* Initiates a write. */
 static grpc_endpoint_write_status win_write(grpc_endpoint *ep,
                                             gpr_slice *slices, size_t nslices,
                                             grpc_endpoint_write_cb cb,
@@ -251,11 +302,13 @@ static grpc_endpoint_write_status win_write(grpc_endpoint *ep,
   WSABUF *buffers = local_buffers;
 
   GPR_ASSERT(!tcp->outstanding_write);
+  GPR_ASSERT(!tcp->shutting_down);
   tcp_ref(tcp);
 
   tcp->outstanding_write = 1;
   tcp->write_cb = cb;
   tcp->write_user_data = arg;
+
   gpr_slice_buffer_addn(&tcp->write_slices, slices, nslices);
 
   if (tcp->write_slices.count > GPR_ARRAY_SIZE(local_buffers)) {
@@ -268,10 +321,14 @@ static grpc_endpoint_write_status win_write(grpc_endpoint *ep,
     buffers[i].buf = (char *)GPR_SLICE_START_PTR(tcp->write_slices.slices[i]);
   }
 
+  /* First, let's try a synchronous, non-blocking write. */
   status = WSASend(socket->socket, buffers, tcp->write_slices.count,
                    &bytes_sent, 0, NULL, NULL);
   info->wsa_error = status == 0 ? 0 : WSAGetLastError();
 
+  /* We would kind of expect to get a WSAEWOULDBLOCK here, especially on a busy
+     connection that has its send queue filled up. But if we don't, then we can
+     avoid doing an async write operation at all. */
   if (info->wsa_error != WSAEWOULDBLOCK) {
     grpc_endpoint_write_status ret = GRPC_ENDPOINT_WRITE_ERROR;
     if (status == 0) {
@@ -289,25 +346,42 @@ static grpc_endpoint_write_status win_write(grpc_endpoint *ep,
     return ret;
   }
 
+  /* If we got a WSAEWOULDBLOCK earlier, then we need to re-do the same
+     operation, this time asynchronously. */
   memset(&socket->write_info.overlapped, 0, sizeof(OVERLAPPED));
   status = WSASend(socket->socket, buffers, tcp->write_slices.count,
                    &bytes_sent, 0, &socket->write_info.overlapped, NULL);
   if (allocated) gpr_free(allocated);
 
+  /* It is possible the operation completed then. But we'd still get an IOCP
+     notification. So let's ignore it and wait for the IOCP. */
   if (status != 0) {
     int error = WSAGetLastError();
     if (error != WSA_IO_PENDING) {
       char *utf8_message = gpr_format_message(WSAGetLastError());
-      __debugbreak();
-      gpr_log(GPR_ERROR, "WSASend error: %s", utf8_message);
+      gpr_log(GPR_ERROR, "WSASend error: %s - this means we're going to leak.",
+              utf8_message);
       gpr_free(utf8_message);
-      /* would the IO completion port be called anyway ? Let's assume not. */
+    /* I'm pretty sure this is a very bad situation there. Hence the log.
+       What will happen now is that the socket will neither wait for read
+       or write, unless the caller retry, which is unlikely, but I am not
+       sure if that's guaranteed. And there might also be a read pending.
+       This means that the future orphanage of that socket will be in limbo,
+       and we're going to leak it. I have no idea what could cause this
+       specific case however, aside from a parameter error from our call.
+       Normal read errors would actually happen during the overlapped
+       operation, which is the supported way to go for that. */
       tcp->outstanding_write = 0;
       tcp_unref(tcp);
+      /* Per the comment above, I'm going to treat that case as a hard failure
+         for now, and leave the option to catch that and debug. */
+      __debugbreak();
       return GRPC_ENDPOINT_WRITE_ERROR;
     }
   }
 
+  /* As all is now setup, we can now ask for the IOCP notification. It may
+     trigger the callback immediately however, but no matter. */
   grpc_socket_notify_on_write(socket, on_write, tcp);
   return GRPC_ENDPOINT_WRITE_PENDING;
 }
@@ -317,9 +391,20 @@ static void win_add_to_pollset(grpc_endpoint *ep, grpc_pollset *pollset) {
   grpc_iocp_add_socket(tcp->socket);
 }
 
+/* Initiates a shutdown of the TCP endpoint. This will queue abort callbacks
+   for the potential read and write operations. It is up to the caller to
+   guarantee this isn't called in parallel to a read or write request, so
+   we're not going to protect against these. However the IO Completion Port
+   callback will happen from another thread, so we need to protect against
+   concurrent access of the data structure in that regard. */
 static void win_shutdown(grpc_endpoint *ep) {
   grpc_tcp *tcp = (grpc_tcp *) ep;
+  gpr_mu_lock(&tcp->mu);
+  /* At that point, what may happen is that we're already inside the IOCP
+     callback. See the comments in on_read and on_write. */
+  tcp->shutting_down = 1;
   grpc_winsocket_shutdown(tcp->socket);
+  gpr_mu_unlock(&tcp->mu);
 }
 
 static void win_destroy(grpc_endpoint *ep) {
@@ -336,6 +421,7 @@ grpc_endpoint *grpc_tcp_create(grpc_winsocket *socket) {
   memset(tcp, 0, sizeof(grpc_tcp));
   tcp->base.vtable = &vtable;
   tcp->socket = socket;
+  gpr_mu_init(&tcp->mu);
   gpr_slice_buffer_init(&tcp->write_slices);
   gpr_ref_init(&tcp->refcount, 1);
   return &tcp->base;
diff --git a/src/core/profiling/basic_timers.c b/src/core/profiling/basic_timers.c
index d89bba7b8788283cca4c3b1fc6f05eabbbfe1253..866833f22576d01bf130c961bad65a71ff4c6ab9 100644
--- a/src/core/profiling/basic_timers.c
+++ b/src/core/profiling/basic_timers.c
@@ -45,108 +45,78 @@
 #include <grpc/support/thd.h>
 #include <stdio.h>
 
+typedef enum { BEGIN = '{', END = '}', MARK = '.' } marker_type;
+
 typedef struct grpc_timer_entry {
   grpc_precise_clock tm;
-  gpr_thd_id thd;
   int tag;
+  marker_type type;
   void* id;
   const char* file;
   int line;
 } grpc_timer_entry;
 
-struct grpc_timers_log {
-  gpr_mu mu;
-  grpc_timer_entry* log;
-  int num_entries;
-  int capacity;
-  int capacity_limit;
-  FILE* fp;
-};
-
-grpc_timers_log* grpc_timers_log_global = NULL;
-
-static grpc_timers_log* grpc_timers_log_create(int capacity_limit, FILE* dump) {
-  grpc_timers_log* log = gpr_malloc(sizeof(*log));
-
-  /* TODO (vpai): Allow allocation below limit */
-  log->log = gpr_malloc(capacity_limit * sizeof(*log->log));
-
-  /* TODO (vpai): Improve concurrency, do per-thread logging? */
-  gpr_mu_init(&log->mu);
-
-  log->num_entries = 0;
-  log->capacity = log->capacity_limit = capacity_limit;
+#define MAX_COUNT (1024 * 1024 / sizeof(grpc_timer_entry))
 
-  log->fp = dump;
+static __thread grpc_timer_entry log[MAX_COUNT];
+static __thread int count;
 
-  return log;
-}
-
-static void log_report_locked(grpc_timers_log* log) {
-  FILE* fp = log->fp;
+static void log_report() {
   int i;
-  for (i = 0; i < log->num_entries; i++) {
-    grpc_timer_entry* entry = &(log->log[i]);
-    fprintf(fp, "GRPC_LAT_PROF ");
-    grpc_precise_clock_print(&entry->tm, fp);
-    fprintf(fp, " %p %d %p %s %d\n", (void*)(gpr_intptr)entry->thd, entry->tag,
-            entry->id, entry->file, entry->line);
+  for (i = 0; i < count; i++) {
+    grpc_timer_entry* entry = &(log[i]);
+    printf("GRPC_LAT_PROF " GRPC_PRECISE_CLOCK_FORMAT " %p %c %d %p %s %d\n",
+           GRPC_PRECISE_CLOCK_PRINTF_ARGS(&entry->tm),
+           (void*)(gpr_intptr)gpr_thd_currentid(), entry->type, entry->tag,
+           entry->id, entry->file, entry->line);
   }
 
   /* Now clear out the log */
-  log->num_entries = 0;
-}
-
-static void grpc_timers_log_destroy(grpc_timers_log* log) {
-  gpr_mu_lock(&log->mu);
-  log_report_locked(log);
-  gpr_mu_unlock(&log->mu);
-
-  gpr_free(log->log);
-  gpr_mu_destroy(&log->mu);
-
-  gpr_free(log);
+  count = 0;
 }
 
-static void grpc_timers_log_add(grpc_timers_log* log, int tag, void* id,
+static void grpc_timers_log_add(int tag, marker_type type, void* id,
                                 const char* file, int line) {
   grpc_timer_entry* entry;
 
   /* TODO (vpai) : Improve concurrency */
-  gpr_mu_lock(&log->mu);
-  if (log->num_entries == log->capacity_limit) {
-    log_report_locked(log);
+  if (count == MAX_COUNT) {
+    log_report();
   }
 
-  entry = &log->log[log->num_entries++];
+  entry = &log[count++];
 
   grpc_precise_clock_now(&entry->tm);
   entry->tag = tag;
+  entry->type = type;
   entry->id = id;
   entry->file = file;
   entry->line = line;
-  entry->thd = gpr_thd_currentid();
-
-  gpr_mu_unlock(&log->mu);
 }
 
 /* Latency profiler API implementation. */
 void grpc_timer_add_mark(int tag, void* id, const char* file, int line) {
-  grpc_timers_log_add(grpc_timers_log_global, tag, id, file, line);
+  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {
+    grpc_timers_log_add(tag, MARK, id, file, line);
+  }
 }
 
-void grpc_timer_begin(int tag, void* id, const char *file, int line) {}
-void grpc_timer_end(int tag, void* id, const char *file, int line) {}
-
-/* Basic profiler specific API functions. */
-void grpc_timers_global_init(void) {
-  grpc_timers_log_global = grpc_timers_log_create(100000, stdout);
+void grpc_timer_begin(int tag, void* id, const char* file, int line) {
+  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {
+    grpc_timers_log_add(tag, BEGIN, id, file, line);
+  }
 }
 
-void grpc_timers_global_destroy(void) {
-  grpc_timers_log_destroy(grpc_timers_log_global);
+void grpc_timer_end(int tag, void* id, const char* file, int line) {
+  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {
+    grpc_timers_log_add(tag, END, id, file, line);
+  }
 }
 
+/* Basic profiler specific API functions. */
+void grpc_timers_global_init(void) {}
+
+void grpc_timers_global_destroy(void) {}
 
 #else  /* !GRPC_BASIC_PROFILER */
 void grpc_timers_global_init(void) {}
diff --git a/src/core/profiling/timers.h b/src/core/profiling/timers.h
index d0b8286c03eceb77f7a546b3c6d355193d2d1443..ccde6acd5857b8a59786d436e22ff4fce93700e8 100644
--- a/src/core/profiling/timers.h
+++ b/src/core/profiling/timers.h
@@ -41,9 +41,9 @@ extern "C" {
 void grpc_timers_global_init(void);
 void grpc_timers_global_destroy(void);
 
-void grpc_timer_add_mark(int tag, void* id, const char *file, int line);
-void grpc_timer_begin(int tag, void* id, const char *file, int line);
-void grpc_timer_end(int tag, void* id, const char *file, int line);
+void grpc_timer_add_mark(int tag, void *id, const char *file, int line);
+void grpc_timer_begin(int tag, void *id, const char *file, int line);
+void grpc_timer_end(int tag, void *id, const char *file, int line);
 
 enum grpc_profiling_tags {
   /* Any GRPC_PTAG_* >= than the threshold won't generate any profiling mark. */
@@ -74,13 +74,16 @@ enum grpc_profiling_tags {
 #if !(defined(GRPC_STAP_PROFILER) + defined(GRPC_BASIC_PROFILER))
 /* No profiling. No-op all the things. */
 #define GRPC_TIMER_MARK(tag, id) \
-  do {} while(0)
+  do {                           \
+  } while (0)
 
 #define GRPC_TIMER_BEGIN(tag, id) \
-  do {} while(0)
+  do {                            \
+  } while (0)
 
 #define GRPC_TIMER_END(tag, id) \
-  do {} while(0)
+  do {                          \
+  } while (0)
 
 #else /* at least one profiler requested... */
 /* ... hopefully only one. */
@@ -94,14 +97,14 @@ enum grpc_profiling_tags {
     grpc_timer_add_mark(tag, ((void *)(gpr_intptr)(id)), __FILE__, __LINE__); \
   }
 
-#define GRPC_TIMER_BEGIN(tag, id)                                             \
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {                                     \
-    grpc_timer_begin(tag, ((void *)(gpr_intptr)(id)), __FILE__, __LINE__);    \
+#define GRPC_TIMER_BEGIN(tag, id)                                          \
+  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {                                  \
+    grpc_timer_begin(tag, ((void *)(gpr_intptr)(id)), __FILE__, __LINE__); \
   }
 
-#define GRPC_TIMER_END(tag, id)                                               \
-  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {                                     \
-    grpc_timer_end(tag, ((void *)(gpr_intptr)(id)), __FILE__, __LINE__);      \
+#define GRPC_TIMER_END(tag, id)                                          \
+  if (tag < GRPC_PTAG_IGNORE_THRESHOLD) {                                \
+    grpc_timer_end(tag, ((void *)(gpr_intptr)(id)), __FILE__, __LINE__); \
   }
 
 #ifdef GRPC_STAP_PROFILER
@@ -109,9 +112,7 @@ enum grpc_profiling_tags {
 #endif /* GRPC_STAP_PROFILER */
 
 #ifdef GRPC_BASIC_PROFILER
-typedef struct grpc_timers_log grpc_timers_log;
-
-extern grpc_timers_log *grpc_timers_log_global;
+/* Empty placeholder for now. */
 #endif /* GRPC_BASIC_PROFILER */
 
 #endif /* at least one profiler requested. */
diff --git a/src/core/profiling/timers_preciseclock.h b/src/core/profiling/timers_preciseclock.h
index bf4a0eab8a4a209b393389c2c3c18eae1c0b10d7..163d52b7974bb2b0478ec9482c537c527c3d57a3 100644
--- a/src/core/profiling/timers_preciseclock.h
+++ b/src/core/profiling/timers_preciseclock.h
@@ -34,20 +34,59 @@
 #ifndef GRPC_CORE_PROFILING_TIMERS_PRECISECLOCK_H
 #define GRPC_CORE_PROFILING_TIMERS_PRECISECLOCK_H
 
+#include <grpc/support/sync.h>
 #include <grpc/support/time.h>
 #include <stdio.h>
 
-typedef struct grpc_precise_clock grpc_precise_clock;
-
 #ifdef GRPC_TIMERS_RDTSC
-#error RDTSC timers not currently supported
+typedef long long int grpc_precise_clock;
+#if defined(__i386__)
+static void grpc_precise_clock_now(grpc_precise_clock *clk) {
+  grpc_precise_clock ret;
+  __asm__ volatile("rdtsc" : "=A"(ret));
+  *clk = ret;
+}
+
+// ----------------------------------------------------------------
+#elif defined(__x86_64__) || defined(__amd64__)
+static void grpc_precise_clock_now(grpc_precise_clock *clk) {
+  unsigned long long low, high;
+  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
+  *clk = (high << 32) | low;
+}
+#endif
+static gpr_once precise_clock_init = GPR_ONCE_INIT;
+static double cycles_per_second = 0.0;
+static void grpc_precise_clock_init() {
+  time_t start = time(NULL);
+  grpc_precise_clock start_time;
+  grpc_precise_clock end_time;
+  while (time(NULL) == start)
+    ;
+  grpc_precise_clock_now(&start_time);
+  while (time(NULL) == start + 1)
+    ;
+  grpc_precise_clock_now(&end_time);
+  cycles_per_second = end_time - start_time;
+}
+static double grpc_precise_clock_scaling_factor() {
+  gpr_once_init(&precise_clock_init, grpc_precise_clock_init);
+  return 1e6 / cycles_per_second;
+}
+#define GRPC_PRECISE_CLOCK_FORMAT "%f"
+#define GRPC_PRECISE_CLOCK_PRINTF_ARGS(clk) \
+  (*(clk)*grpc_precise_clock_scaling_factor())
 #else
+typedef struct grpc_precise_clock grpc_precise_clock;
 struct grpc_precise_clock {
   gpr_timespec clock;
 };
 static void grpc_precise_clock_now(grpc_precise_clock* clk) {
   clk->clock = gpr_now();
 }
+#define GRPC_PRECISE_CLOCK_FORMAT "%ld.%09d"
+#define GRPC_PRECISE_CLOCK_PRINTF_ARGS(clk) \
+  (clk)->clock.tv_sec, (clk)->clock.tv_nsec
 static void grpc_precise_clock_print(const grpc_precise_clock* clk, FILE* fp) {
   fprintf(fp, "%ld.%09d", clk->clock.tv_sec, clk->clock.tv_nsec);
 }
diff --git a/templates/Makefile.template b/templates/Makefile.template
index 1e7a2a49bdde0332b94e771e6a8b55972b405a55..e446f2b40c20e16af0db9d699c11fd83134184a2 100644
--- a/templates/Makefile.template
+++ b/templates/Makefile.template
@@ -106,7 +106,7 @@ CC_basicprof = $(DEFAULT_CC)
 CXX_basicprof = $(DEFAULT_CXX)
 LD_basicprof = $(DEFAULT_CC)
 LDXX_basicprof = $(DEFAULT_CXX)
-CPPFLAGS_basicprof = -O2 -DGRPC_BASIC_PROFILER
+CPPFLAGS_basicprof = -O2 -DGRPC_BASIC_PROFILER -DGRPC_TIMERS_RDTSC
 LDFLAGS_basicprof =
 DEFINES_basicprof = NDEBUG
 
@@ -1191,11 +1191,14 @@ endif
   lib_deps = ' $(ZLIB_DEP)'
   mingw_libs = ''
   mingw_lib_deps = ' $(ZLIB_DEP)'
+  if lib.language == 'c++':
+    lib_deps += ' $(PROTOBUF_DEP)'
+    mingw_lib_deps += ' $(PROTOBUF_DEP)'
   for dep in lib.get('deps', []):
     libs = libs + ' -l' + dep
     lib_deps = lib_deps + ' $(LIBDIR)/$(CONFIG)/lib' + dep + '.$(SHARED_EXT)'
     mingw_libs = mingw_libs + ' -l' + dep + '-imp'
-    mingw_lib_deps = mingw_lib_deps + '$(LIBDIR)/$(CONFIG)/' + dep + '.$(SHARED_EXT)'
+    mingw_lib_deps = mingw_lib_deps + ' $(LIBDIR)/$(CONFIG)/' + dep + '.$(SHARED_EXT)'
 
   if lib.get('secure', 'check') == 'yes':
     common = common + ' $(LDLIBS_SECURE) $(OPENSSL_MERGE_LIBS)'
diff --git a/tools/run_tests/run_sanity.sh b/tools/run_tests/run_sanity.sh
index e915af93d4b3600a0490fa944a1e6d58ab12b7a9..959197eb3d94836c3750b1db7d209f8e25e368ff 100755
--- a/tools/run_tests/run_sanity.sh
+++ b/tools/run_tests/run_sanity.sh
@@ -37,3 +37,14 @@ export TEST=true
 cd `dirname $0`/../..
 
 ./tools/buildgen/generate_projects.sh
+
+submodules=`mktemp`
+
+git submodule > $submodules
+
+diff -u $submodules - << EOF
+ 05b155ff59114735ec8cd089f669c4c3d8f59029 third_party/gflags (v2.1.0-45-g05b155f)
+ 3df69d3aefde7671053d4e3c242b228e5d79c83f third_party/openssl (OpenSSL_1_0_2a)
+ 644a6a1da71385e9d7a7a26b3476c93fdd71788c third_party/protobuf (v3.0.0-alpha-1-35-g644a6a1)
+ 50893291621658f355bc5b4d450a8d06a563053d third_party/zlib (v1.2.8)
+EOF
diff --git a/vsprojects/openssl.props b/vsprojects/openssl.props
new file mode 100644
index 0000000000000000000000000000000000000000..94e3ff98fffc9dfb9a1db994077bff465ccdbf4d
--- /dev/null
+++ b/vsprojects/openssl.props
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ImportGroup Label="PropertySheets" />
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup />
+  <ItemDefinitionGroup>
+    <Link>
+      <AdditionalDependencies>ssleay32.lib;libeay32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(MSBuildProjectDirectory)\..\packages\grpc.dependencies.openssl.1.0.2.2\build\native\lib\$(PlatformToolset)\$(Platform)\$(Configuration)\static;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup />
+</Project>
\ No newline at end of file
diff --git a/vsprojects/zlib.props b/vsprojects/zlib.props
new file mode 100644
index 0000000000000000000000000000000000000000..ac881a5904836664a3df9191515e06e54b89963b
--- /dev/null
+++ b/vsprojects/zlib.props
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ImportGroup Label="PropertySheets" />
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup />
+  <ItemDefinitionGroup>
+    <Link>
+      <AdditionalDependencies>zlib.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(MSBuildProjectDirectory)\..\packages\grpc.dependencies.zlib.1.2.8.9\build\native\lib\$(PlatformToolset)\$(Platform)\$(Configuration)\static\cdecl;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemGroup />
+</Project>
\ No newline at end of file