From 0a06cd7b6839f8fad89b3976199901a0150c3734 Mon Sep 17 00:00:00 2001
From: Craig Tiller <ctiller@google.com>
Date: Thu, 14 Jul 2016 13:21:24 -0700
Subject: [PATCH] Cleanup from code review

---
 src/core/lib/iomgr/ev_epoll_linux.c | 24 ++++++++++++++++++------
 src/core/lib/surface/server.c       |  2 +-
 test/cpp/qps/client_async.cc        |  2 ++
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/core/lib/iomgr/ev_epoll_linux.c b/src/core/lib/iomgr/ev_epoll_linux.c
index 1d04ef1ef0..6a63c4d1d1 100644
--- a/src/core/lib/iomgr/ev_epoll_linux.c
+++ b/src/core/lib/iomgr/ev_epoll_linux.c
@@ -517,14 +517,10 @@ static polling_island *polling_island_create(grpc_exec_ctx *exec_ctx,
 
 done:
   if (*error != GRPC_ERROR_NONE) {
-    if (pi->epoll_fd >= 0) {
-      close(pi->epoll_fd);
-    }
     if (pi->workqueue != NULL) {
       GRPC_WORKQUEUE_UNREF(exec_ctx, pi->workqueue, "polling_island");
     }
-    gpr_mu_destroy(&pi->mu);
-    gpr_free(pi);
+    polling_island_delete(exec_ctx, pi);
     pi = NULL;
   }
   return pi;
@@ -533,7 +529,9 @@ done:
 static void polling_island_delete(grpc_exec_ctx *exec_ctx, polling_island *pi) {
   GPR_ASSERT(pi->fd_cnt == 0);
 
-  close(pi->epoll_fd);
+  if (pi->epoll_fd >= 0) {
+    close(pi->epoll_fd);
+  }
   gpr_mu_destroy(&pi->mu);
   gpr_free(pi->fds);
   gpr_free(pi);
@@ -934,6 +932,10 @@ static void fd_orphan(grpc_exec_ctx *exec_ctx, grpc_fd *fd,
   gpr_mu_unlock(&fd->mu);
   UNREF_BY(fd, 2, reason); /* Drop the reference */
   if (unref_pi != NULL) {
+    /* Unref stale polling island here, outside the fd lock above.
+       The polling island owns a workqueue which owns an fd, and unreffing
+       inside the lock can cause an eventual lock loop that makes TSAN very
+       unhappy. */
     PI_UNREF(exec_ctx, unref_pi, "fd_orphan");
   }
   GRPC_LOG_IF_ERROR("fd_orphan", GRPC_ERROR_REF(error));
@@ -1557,9 +1559,19 @@ retry:
   if (fd->polling_island == pollset->polling_island) {
     pi_new = fd->polling_island;
     if (pi_new == NULL) {
+      /* Unlock before creating a new polling island: the polling island will
+         create a workqueue which creates a file descriptor, and holding an fd
+         lock here can eventually cause a loop to appear to TSAN (making it
+         unhappy). We don't think it's a real loop (there's an epoch point where
+         that loop possibility disappears), but the advantages of keeping TSAN
+         happy outweigh any performance advantage we might have by keeping the
+         lock held. */
       gpr_mu_unlock(&fd->mu);
       pi_new = polling_island_create(exec_ctx, fd, &error);
       gpr_mu_lock(&fd->mu);
+      /* Need to reverify any assumptions made between the initial lock and
+         getting to this branch: if they've changed, we need to throw away our
+         work and figure things out again. */
       if (fd->polling_island != NULL) {
         GRPC_POLLING_TRACE(
             "pollset_add_fd: Raced creating new polling island. pi_new: %p "
diff --git a/src/core/lib/surface/server.c b/src/core/lib/surface/server.c
index d4421be936..2f108af48a 100644
--- a/src/core/lib/surface/server.c
+++ b/src/core/lib/surface/server.c
@@ -207,7 +207,7 @@ struct grpc_server {
   registered_method *registered_methods;
   /** one request matcher for unregistered methods */
   request_matcher unregistered_request_matcher;
-  /** free list of available requested_calls indices */
+  /** free list of available requested_calls_per_cq indices */
   gpr_stack_lockfree **request_freelist_per_cq;
   /** requested call backing data */
   requested_call **requested_calls_per_cq;
diff --git a/test/cpp/qps/client_async.cc b/test/cpp/qps/client_async.cc
index 24bc0eb5f4..5dd0bd8533 100644
--- a/test/cpp/qps/client_async.cc
+++ b/test/cpp/qps/client_async.cc
@@ -225,6 +225,8 @@ class AsyncClient : public ClientImpl<StubType, RequestType> {
         return true;
       }
       case CompletionQueue::TIMEOUT:
+        // TODO(ctiller): do something here to track how frequently we pass
+        // through this codepath.
         return true;
     }
     GPR_UNREACHABLE_CODE(return false);
-- 
GitLab