From 1e55bd455d5fa901e19c1ccf64996c0adc7a4584 Mon Sep 17 00:00:00 2001
From: Craig Tiller <ctiller@google.com>
Date: Fri, 11 Mar 2016 09:47:43 -0800
Subject: [PATCH] Add retry for dns resolution

---
 src/core/client_config/client_config.c        |  4 +-
 .../client_config/lb_policies/pick_first.c    |  2 +-
 .../client_config/resolvers/dns_resolver.c    | 53 ++++++++++++++++---
 .../dns_resolver_connectivity_test.c          | 21 ++++++--
 4 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/src/core/client_config/client_config.c b/src/core/client_config/client_config.c
index 6ecffb3854..25f4ac2517 100644
--- a/src/core/client_config/client_config.c
+++ b/src/core/client_config/client_config.c
@@ -53,7 +53,9 @@ void grpc_client_config_ref(grpc_client_config *c) { gpr_ref(&c->refs); }
 
 void grpc_client_config_unref(grpc_exec_ctx *exec_ctx, grpc_client_config *c) {
   if (gpr_unref(&c->refs)) {
-    GRPC_LB_POLICY_UNREF(exec_ctx, c->lb_policy, "client_config");
+    if (c->lb_policy != NULL) {
+      GRPC_LB_POLICY_UNREF(exec_ctx, c->lb_policy, "client_config");
+    }
     gpr_free(c);
   }
 }
diff --git a/src/core/client_config/lb_policies/pick_first.c b/src/core/client_config/lb_policies/pick_first.c
index 81167b31c8..8ed1223d39 100644
--- a/src/core/client_config/lb_policies/pick_first.c
+++ b/src/core/client_config/lb_policies/pick_first.c
@@ -387,8 +387,8 @@ static void pick_first_factory_unref(grpc_lb_policy_factory *factory) {}
 
 static grpc_lb_policy *create_pick_first(grpc_lb_policy_factory *factory,
                                          grpc_lb_policy_args *args) {
+  if (args->num_subchannels == 0) return NULL;
   pick_first_lb_policy *p = gpr_malloc(sizeof(*p));
-  GPR_ASSERT(args->num_subchannels > 0);
   memset(p, 0, sizeof(*p));
   grpc_lb_policy_init(&p->base, &pick_first_lb_policy_vtable);
   p->subchannels =
diff --git a/src/core/client_config/resolvers/dns_resolver.c b/src/core/client_config/resolvers/dns_resolver.c
index 376b6b3d76..55457647b3 100644
--- a/src/core/client_config/resolvers/dns_resolver.c
+++ b/src/core/client_config/resolvers/dns_resolver.c
@@ -41,6 +41,7 @@
 
 #include "src/core/client_config/lb_policy_registry.h"
 #include "src/core/iomgr/resolve_address.h"
+#include "src/core/iomgr/timer.h"
 #include "src/core/support/string.h"
 
 typedef struct {
@@ -71,6 +72,9 @@ typedef struct {
   grpc_client_config **target_config;
   /** current (fully resolved) config */
   grpc_client_config *resolved_config;
+  /** retry timer */
+  bool have_retry_timer;
+  grpc_timer retry_timer;
 } dns_resolver;
 
 static void dns_destroy(grpc_exec_ctx *exec_ctx, grpc_resolver *r);
@@ -125,6 +129,21 @@ static void dns_next(grpc_exec_ctx *exec_ctx, grpc_resolver *resolver,
   gpr_mu_unlock(&r->mu);
 }
 
+static void dns_on_retry_timer(grpc_exec_ctx *exec_ctx, void *arg,
+                               bool success) {
+  dns_resolver *r = arg;
+
+  if (success) {
+    gpr_mu_lock(&r->mu);
+    if (!r->resolving) {
+      dns_start_resolving_locked(r);
+    }
+    gpr_mu_unlock(&r->mu);
+  }
+
+  GRPC_RESOLVER_UNREF(exec_ctx, &r->base, "retry-timer");
+}
+
 static void dns_on_resolved(grpc_exec_ctx *exec_ctx, void *arg,
                             grpc_resolved_addresses *addresses) {
   dns_resolver *r = arg;
@@ -133,29 +152,47 @@ static void dns_on_resolved(grpc_exec_ctx *exec_ctx, void *arg,
   grpc_subchannel_args args;
   grpc_lb_policy *lb_policy;
   size_t i;
-  if (addresses) {
+  gpr_mu_lock(&r->mu);
+  GPR_ASSERT(r->resolving);
+  r->resolving = 0;
+  if (addresses != NULL) {
     grpc_lb_policy_args lb_policy_args;
     config = grpc_client_config_create();
     subchannels = gpr_malloc(sizeof(grpc_subchannel *) * addresses->naddrs);
+    size_t naddrs = 0;
     for (i = 0; i < addresses->naddrs; i++) {
       memset(&args, 0, sizeof(args));
       args.addr = (struct sockaddr *)(addresses->addrs[i].addr);
       args.addr_len = (size_t)addresses->addrs[i].len;
-      subchannels[i] = grpc_subchannel_factory_create_subchannel(
+      grpc_subchannel *subchannel = grpc_subchannel_factory_create_subchannel(
           exec_ctx, r->subchannel_factory, &args);
+      if (subchannel != NULL) {
+        subchannels[naddrs++] = subchannel;
+      }
     }
     memset(&lb_policy_args, 0, sizeof(lb_policy_args));
     lb_policy_args.subchannels = subchannels;
-    lb_policy_args.num_subchannels = addresses->naddrs;
+    lb_policy_args.num_subchannels = naddrs;
     lb_policy = grpc_lb_policy_create(r->lb_policy_name, &lb_policy_args);
-    grpc_client_config_set_lb_policy(config, lb_policy);
-    GRPC_LB_POLICY_UNREF(exec_ctx, lb_policy, "construction");
+    if (lb_policy != NULL) {
+      grpc_client_config_set_lb_policy(config, lb_policy);
+      GRPC_LB_POLICY_UNREF(exec_ctx, lb_policy, "construction");
+    }
     grpc_resolved_addresses_destroy(addresses);
     gpr_free(subchannels);
+  } else {
+    int retry_seconds = 15;
+    gpr_log(GPR_DEBUG, "dns resolution failed: retrying in %d seconds",
+            retry_seconds);
+    GPR_ASSERT(!r->have_retry_timer);
+    r->have_retry_timer = true;
+    gpr_timespec now = gpr_now(GPR_CLOCK_MONOTONIC);
+    GRPC_RESOLVER_REF(&r->base, "retry-timer");
+    grpc_timer_init(
+        exec_ctx, &r->retry_timer,
+        gpr_time_add(now, gpr_time_from_seconds(retry_seconds, GPR_TIMESPAN)),
+        dns_on_retry_timer, r, now);
   }
-  gpr_mu_lock(&r->mu);
-  GPR_ASSERT(r->resolving);
-  r->resolving = 0;
   if (r->resolved_config) {
     grpc_client_config_unref(exec_ctx, r->resolved_config);
   }
diff --git a/test/core/client_config/resolvers/dns_resolver_connectivity_test.c b/test/core/client_config/resolvers/dns_resolver_connectivity_test.c
index e4d83539fb..75d1eb674f 100644
--- a/test/core/client_config/resolvers/dns_resolver_connectivity_test.c
+++ b/test/core/client_config/resolvers/dns_resolver_connectivity_test.c
@@ -39,6 +39,7 @@
 #include <grpc/support/alloc.h>
 
 #include "src/core/iomgr/resolve_address.h"
+#include "src/core/iomgr/timer.h"
 #include "test/core/util/test_config.h"
 
 static void subchannel_factory_ref(grpc_subchannel_factory *scv) {}
@@ -47,7 +48,7 @@ static void subchannel_factory_unref(grpc_exec_ctx *exec_ctx,
 static grpc_subchannel *subchannel_factory_create_subchannel(
     grpc_exec_ctx *exec_ctx, grpc_subchannel_factory *factory,
     grpc_subchannel_args *args) {
-  GPR_UNREACHABLE_CODE(return NULL);
+  return NULL;
 }
 
 static const grpc_subchannel_factory_vtable sc_vtable = {
@@ -96,6 +97,20 @@ static void on_done(grpc_exec_ctx *exec_ctx, void *ev, bool success) {
   gpr_event_set(ev, (void *)1);
 }
 
+// interleave waiting for an event with a timer check
+static bool wait_loop(int deadline_seconds, gpr_event *ev) {
+  while (deadline_seconds) {
+    gpr_log(GPR_DEBUG, "Test: waiting for %d more seconds", deadline_seconds);
+    if (gpr_event_wait(ev, GRPC_TIMEOUT_SECONDS_TO_DEADLINE(1))) return true;
+    deadline_seconds--;
+
+    grpc_exec_ctx exec_ctx = GRPC_EXEC_CTX_INIT;
+    grpc_timer_check(&exec_ctx, gpr_now(GPR_CLOCK_MONOTONIC), NULL);
+    grpc_exec_ctx_finish(&exec_ctx);
+  }
+  return false;
+}
+
 int main(int argc, char **argv) {
   grpc_test_init(argc, argv);
 
@@ -113,7 +128,7 @@ int main(int argc, char **argv) {
   grpc_resolver_next(&exec_ctx, resolver, &config,
                      grpc_closure_create(on_done, &ev1));
   grpc_exec_ctx_flush(&exec_ctx);
-  GPR_ASSERT(gpr_event_wait(&ev1, GRPC_TIMEOUT_SECONDS_TO_DEADLINE(5)));
+  GPR_ASSERT(wait_loop(5, &ev1));
   GPR_ASSERT(config == NULL);
 
   gpr_event ev2;
@@ -121,7 +136,7 @@ int main(int argc, char **argv) {
   grpc_resolver_next(&exec_ctx, resolver, &config,
                      grpc_closure_create(on_done, &ev2));
   grpc_exec_ctx_flush(&exec_ctx);
-  GPR_ASSERT(gpr_event_wait(&ev2, GRPC_TIMEOUT_SECONDS_TO_DEADLINE(5)));
+  GPR_ASSERT(wait_loop(30, &ev2));
   GPR_ASSERT(config != NULL);
 
   grpc_client_config_unref(&exec_ctx, config);
-- 
GitLab