Skip to content
Snippets Groups Projects
  • Craig Tiller's avatar
    0fcd53c7
    Fix a TSAN reported error · 0fcd53c7
    Craig Tiller authored
    We now pass down pointers to closures instead of (callback, arg) pair
    elements separately. This allows us to store one word atomically, fixing
    a race condition.
    
    All call sites have been updated to the new API. No new allocations are
    incurred. grpc_fd_state is deleted to avoid any temptation to ever add
    anything there again.
    0fcd53c7
    History
    Fix a TSAN reported error
    Craig Tiller authored
    We now pass down pointers to closures instead of (callback, arg) pair
    elements separately. This allows us to store one word atomically, fixing
    a race condition.
    
    All call sites have been updated to the new API. No new allocations are
    incurred. grpc_fd_state is deleted to avoid any temptation to ever add
    anything there again.
tcp_posix.c 17.87 KiB
/*
 *
 * Copyright 2015, Google Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <grpc/support/port_platform.h>

#ifdef GPR_POSIX_SOCKET

#include "src/core/iomgr/tcp_posix.h"

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

#include "src/core/support/string.h"
#include <grpc/support/alloc.h>
#include <grpc/support/log.h>
#include <grpc/support/slice.h>
#include <grpc/support/sync.h>
#include <grpc/support/time.h>

/* Holds a slice array and associated state. */
typedef struct grpc_tcp_slice_state {
  gpr_slice *slices;       /* Array of slices */
  size_t nslices;          /* Size of slices array. */
  ssize_t first_slice;     /* First valid slice in array */
  ssize_t last_slice;      /* Last valid slice in array */
  gpr_slice working_slice; /* pointer to original final slice */
  int working_slice_valid; /* True if there is a working slice */
  int memory_owned;        /* True if slices array is owned */
} grpc_tcp_slice_state;

static void slice_state_init(grpc_tcp_slice_state *state, gpr_slice *slices,
                             size_t nslices, size_t valid_slices) {
  state->slices = slices;
  state->nslices = nslices;
  if (valid_slices == 0) {
    state->first_slice = -1;
  } else {
    state->first_slice = 0;
  }
  state->last_slice = valid_slices - 1;
  state->working_slice_valid = 0;
  state->memory_owned = 0;
}

/* Returns true if there is still available data */
static int slice_state_has_available(grpc_tcp_slice_state *state) {
  return state->first_slice != -1 && state->last_slice >= state->first_slice;
}

static ssize_t slice_state_slices_allocated(grpc_tcp_slice_state *state) {
  if (state->first_slice == -1) {
    return 0;
  } else {
    return state->last_slice - state->first_slice + 1;
  }
}

static void slice_state_realloc(grpc_tcp_slice_state *state, size_t new_size) {
  /* TODO(klempner): use realloc instead when first_slice is 0 */
  /* TODO(klempner): Avoid a realloc in cases where it is unnecessary */
  gpr_slice *slices = state->slices;
  size_t original_size = slice_state_slices_allocated(state);
  size_t i;
  gpr_slice *new_slices = gpr_malloc(sizeof(gpr_slice) * new_size);

  for (i = 0; i < original_size; ++i) {
    new_slices[i] = slices[i + state->first_slice];
  }

  state->slices = new_slices;
  state->last_slice = original_size - 1;
  if (original_size > 0) {
    state->first_slice = 0;
  } else {
    state->first_slice = -1;
  }
  state->nslices = new_size;

  if (state->memory_owned) {
    gpr_free(slices);
  }
  state->memory_owned = 1;
}

static void slice_state_remove_prefix(grpc_tcp_slice_state *state,
                                      size_t prefix_bytes) {
  gpr_slice *current_slice = &state->slices[state->first_slice];
  size_t current_slice_size;

  while (slice_state_has_available(state)) {
    current_slice_size = GPR_SLICE_LENGTH(*current_slice);
    if (current_slice_size > prefix_bytes) {
      /* TODO(klempner): Get rid of the extra refcount created here by adding a
         native "trim the first N bytes" operation to splice */
      /* TODO(klempner): This really shouldn't be modifying the current slice
         unless we own the slices array. */
      *current_slice = gpr_slice_split_tail(current_slice, prefix_bytes);
      gpr_slice_unref(*current_slice);
      return;
    } else {
      gpr_slice_unref(*current_slice);
      ++state->first_slice;
      ++current_slice;
      prefix_bytes -= current_slice_size;
    }
  }
}

static void slice_state_destroy(grpc_tcp_slice_state *state) {
  while (slice_state_has_available(state)) {
    gpr_slice_unref(state->slices[state->first_slice]);
    ++state->first_slice;
  }

  if (state->memory_owned) {
    gpr_free(state->slices);
    state->memory_owned = 0;
  }
}

void slice_state_transfer_ownership(grpc_tcp_slice_state *state,
                                    gpr_slice **slices, size_t *nslices) {
  *slices = state->slices + state->first_slice;
  *nslices = state->last_slice - state->first_slice + 1;

  state->first_slice = -1;
  state->last_slice = -1;
}

/* Fills iov with the first min(iov_size, available) slices, returns number
   filled */
static size_t slice_state_to_iovec(grpc_tcp_slice_state *state,
                                   struct iovec *iov, size_t iov_size) {
  size_t nslices = state->last_slice - state->first_slice + 1;
  gpr_slice *slices = state->slices + state->first_slice;
  size_t i;
  if (nslices < iov_size) {
    iov_size = nslices;
  }

  for (i = 0; i < iov_size; ++i) {
    iov[i].iov_base = GPR_SLICE_START_PTR(slices[i]);
    iov[i].iov_len = GPR_SLICE_LENGTH(slices[i]);
  }
  return iov_size;
}

/* Makes n blocks available at the end of state, writes them into iov, and
   returns the number of bytes allocated */
static size_t slice_state_append_blocks_into_iovec(grpc_tcp_slice_state *state,
                                                   struct iovec *iov, size_t n,
                                                   size_t slice_size) {
  size_t target_size;
  size_t i;
  size_t allocated_bytes;
  ssize_t allocated_slices = slice_state_slices_allocated(state);

  if (n - state->working_slice_valid >= state->nslices - state->last_slice) {
    /* Need to grow the slice array */
    target_size = state->nslices;
    do {
      target_size = target_size * 2;
    } while (target_size < allocated_slices + n - state->working_slice_valid);
    /* TODO(klempner): If this ever needs to support both prefix removal and
       append, we should be smarter about the growth logic here */
    slice_state_realloc(state, target_size);
  }

  i = 0;
  allocated_bytes = 0;

  if (state->working_slice_valid) {
    iov[0].iov_base = GPR_SLICE_END_PTR(state->slices[state->last_slice]);
    iov[0].iov_len = GPR_SLICE_LENGTH(state->working_slice) -
                     GPR_SLICE_LENGTH(state->slices[state->last_slice]);
    allocated_bytes += iov[0].iov_len;
    ++i;
    state->slices[state->last_slice] = state->working_slice;
    state->working_slice_valid = 0;
  }

  for (; i < n; ++i) {
    ++state->last_slice;
    state->slices[state->last_slice] = gpr_slice_malloc(slice_size);
    iov[i].iov_base = GPR_SLICE_START_PTR(state->slices[state->last_slice]);
    iov[i].iov_len = slice_size;
    allocated_bytes += slice_size;
  }
  if (state->first_slice == -1) {
    state->first_slice = 0;
  }
  return allocated_bytes;
}

/* Remove the last n bytes from state */
/* TODO(klempner): Consider having this defer actual deletion until later */
static void slice_state_remove_last(grpc_tcp_slice_state *state, size_t bytes) {
  while (bytes > 0 && slice_state_has_available(state)) {
    if (GPR_SLICE_LENGTH(state->slices[state->last_slice]) > bytes) {
      state->working_slice = state->slices[state->last_slice];
      state->working_slice_valid = 1;
      /* TODO(klempner): Combine these into a single operation that doesn't need
         to refcount */
      gpr_slice_unref(gpr_slice_split_tail(
          &state->slices[state->last_slice],
          GPR_SLICE_LENGTH(state->slices[state->last_slice]) - bytes));
      bytes = 0;
    } else {
      bytes -= GPR_SLICE_LENGTH(state->slices[state->last_slice]);
      gpr_slice_unref(state->slices[state->last_slice]);
      --state->last_slice;
      if (state->last_slice == -1) {
        state->first_slice = -1;
      }
    }
  }
}

typedef struct {
  grpc_endpoint base;
  grpc_fd *em_fd;
  int fd;
  size_t slice_size;
  gpr_refcount refcount;

  grpc_endpoint_read_cb read_cb;
  void *read_user_data;
  grpc_endpoint_write_cb write_cb;
  void *write_user_data;

  grpc_tcp_slice_state write_state;

  grpc_iomgr_closure read_closure;
  grpc_iomgr_closure write_closure;
} grpc_tcp;

static void grpc_tcp_handle_read(void *arg /* grpc_tcp */, int success);
static void grpc_tcp_handle_write(void *arg /* grpc_tcp */, int success);

static void grpc_tcp_shutdown(grpc_endpoint *ep) {
  grpc_tcp *tcp = (grpc_tcp *)ep;
  grpc_fd_shutdown(tcp->em_fd);
}

static void grpc_tcp_unref(grpc_tcp *tcp) {
  int refcount_zero = gpr_unref(&tcp->refcount);
  if (refcount_zero) {
    grpc_fd_orphan(tcp->em_fd, NULL, NULL);
    gpr_free(tcp);
  }
}

static void grpc_tcp_destroy(grpc_endpoint *ep) {
  grpc_tcp *tcp = (grpc_tcp *)ep;
  grpc_tcp_unref(tcp);
}

static void call_read_cb(grpc_tcp *tcp, gpr_slice *slices, size_t nslices,
                         grpc_endpoint_cb_status status) {
  grpc_endpoint_read_cb cb = tcp->read_cb;

#ifdef GRPC_TRACE_TCP
  size_t i;
  gpr_log(GPR_DEBUG, "read: status=%d", status);
  for (i = 0; i < nslices; i++) {
    char *dump =
        gpr_hexdump((char *)GPR_SLICE_START_PTR(slices[i]),
                    GPR_SLICE_LENGTH(slices[i]), GPR_HEXDUMP_PLAINTEXT);
    gpr_log(GPR_DEBUG, "READ: %s", dump);
    gpr_free(dump);
  }
#endif

  tcp->read_cb = NULL;
  cb(tcp->read_user_data, slices, nslices, status);
}

#define INLINE_SLICE_BUFFER_SIZE 8
#define MAX_READ_IOVEC 4
static void grpc_tcp_handle_read(void *arg /* grpc_tcp */, int success) {
  grpc_tcp *tcp = (grpc_tcp *)arg;
  int iov_size = 1;
  gpr_slice static_read_slices[INLINE_SLICE_BUFFER_SIZE];
  struct msghdr msg;
  struct iovec iov[MAX_READ_IOVEC];
  ssize_t read_bytes;
  ssize_t allocated_bytes;
  struct grpc_tcp_slice_state read_state;
  gpr_slice *final_slices;
  size_t final_nslices;

  slice_state_init(&read_state, static_read_slices, INLINE_SLICE_BUFFER_SIZE,
                   0);

  if (!success) {
    call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_SHUTDOWN);
    grpc_tcp_unref(tcp);
    return;
  }

  /* TODO(klempner): Limit the amount we read at once. */
  for (;;) {
    allocated_bytes = slice_state_append_blocks_into_iovec(
        &read_state, iov, iov_size, tcp->slice_size);

    msg.msg_name = NULL;
    msg.msg_namelen = 0;
    msg.msg_iov = iov;
    msg.msg_iovlen = iov_size;
    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_flags = 0;

    do {
      read_bytes = recvmsg(tcp->fd, &msg, 0);
    } while (read_bytes < 0 && errno == EINTR);

    if (read_bytes < allocated_bytes) {
      /* TODO(klempner): Consider a second read first, in hopes of getting a
       * quick EAGAIN and saving a bunch of allocations. */
      slice_state_remove_last(&read_state, read_bytes < 0
                                               ? allocated_bytes
                                               : allocated_bytes - read_bytes);
    }

    if (read_bytes < 0) {
      /* NB: After calling the user_cb a parallel call of the read handler may
       * be running. */
      if (errno == EAGAIN) {
        if (slice_state_has_available(&read_state)) {
          /* TODO(klempner): We should probably do the call into the application
             without all this junk on the stack */
          /* FIXME(klempner): Refcount properly */
          slice_state_transfer_ownership(&read_state, &final_slices,
                                         &final_nslices);
          call_read_cb(tcp, final_slices, final_nslices, GRPC_ENDPOINT_CB_OK);
          slice_state_destroy(&read_state);
          grpc_tcp_unref(tcp);
        } else {
          /* Spurious read event, consume it here */
          slice_state_destroy(&read_state);
          grpc_fd_notify_on_read(tcp->em_fd, &tcp->read_closure);
        }
      } else {
        /* TODO(klempner): Log interesting errors */
        call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_ERROR);
        slice_state_destroy(&read_state);
        grpc_tcp_unref(tcp);
      }
      return;
    } else if (read_bytes == 0) {
      /* 0 read size ==> end of stream */
      if (slice_state_has_available(&read_state)) {
        /* there were bytes already read: pass them up to the application */
        slice_state_transfer_ownership(&read_state, &final_slices,
                                       &final_nslices);
        call_read_cb(tcp, final_slices, final_nslices, GRPC_ENDPOINT_CB_EOF);
      } else {
        call_read_cb(tcp, NULL, 0, GRPC_ENDPOINT_CB_EOF);
      }
      slice_state_destroy(&read_state);
      grpc_tcp_unref(tcp);
      return;
    } else if (iov_size < MAX_READ_IOVEC) {
      ++iov_size;
    }
  }
}

static void grpc_tcp_notify_on_read(grpc_endpoint *ep, grpc_endpoint_read_cb cb,
                                    void *user_data) {
  grpc_tcp *tcp = (grpc_tcp *)ep;
  GPR_ASSERT(tcp->read_cb == NULL);
  tcp->read_cb = cb;
  tcp->read_user_data = user_data;
  gpr_ref(&tcp->refcount);
  grpc_fd_notify_on_read(tcp->em_fd, &tcp->read_closure);
}

#define MAX_WRITE_IOVEC 16
static grpc_endpoint_write_status grpc_tcp_flush(grpc_tcp *tcp) {
  struct msghdr msg;
  struct iovec iov[MAX_WRITE_IOVEC];
  int iov_size;
  ssize_t sent_length;
  grpc_tcp_slice_state *state = &tcp->write_state;

  for (;;) {
    iov_size = slice_state_to_iovec(state, iov, MAX_WRITE_IOVEC);

    msg.msg_name = NULL;
    msg.msg_namelen = 0;
    msg.msg_iov = iov;
    msg.msg_iovlen = iov_size;
    msg.msg_control = NULL;
    msg.msg_controllen = 0;
    msg.msg_flags = 0;

    do {
      /* TODO(klempner): Cork if this is a partial write */
      sent_length = sendmsg(tcp->fd, &msg, 0);
    } while (sent_length < 0 && errno == EINTR);

    if (sent_length < 0) {
      if (errno == EAGAIN) {
        return GRPC_ENDPOINT_WRITE_PENDING;
      } else {
        /* TODO(klempner): Log some of these */
        slice_state_destroy(state);
        return GRPC_ENDPOINT_WRITE_ERROR;
      }
    }

    /* TODO(klempner): Probably better to batch this after we finish flushing */
    slice_state_remove_prefix(state, sent_length);

    if (!slice_state_has_available(state)) {
      return GRPC_ENDPOINT_WRITE_DONE;
    }
  };
}

static void grpc_tcp_handle_write(void *arg /* grpc_tcp */, int success) {
  grpc_tcp *tcp = (grpc_tcp *)arg;
  grpc_endpoint_write_status write_status;
  grpc_endpoint_cb_status cb_status;
  grpc_endpoint_write_cb cb;

  if (!success) {
    slice_state_destroy(&tcp->write_state);
    cb = tcp->write_cb;
    tcp->write_cb = NULL;
    cb(tcp->write_user_data, GRPC_ENDPOINT_CB_SHUTDOWN);
    grpc_tcp_unref(tcp);
    return;
  }

  write_status = grpc_tcp_flush(tcp);
  if (write_status == GRPC_ENDPOINT_WRITE_PENDING) {
    grpc_fd_notify_on_write(tcp->em_fd, &tcp->write_closure);
  } else {
    slice_state_destroy(&tcp->write_state);
    if (write_status == GRPC_ENDPOINT_WRITE_DONE) {
      cb_status = GRPC_ENDPOINT_CB_OK;
    } else {
      cb_status = GRPC_ENDPOINT_CB_ERROR;
    }
    cb = tcp->write_cb;
    tcp->write_cb = NULL;
    cb(tcp->write_user_data, cb_status);
    grpc_tcp_unref(tcp);
  }
}

static grpc_endpoint_write_status grpc_tcp_write(grpc_endpoint *ep,
                                                 gpr_slice *slices,
                                                 size_t nslices,
                                                 grpc_endpoint_write_cb cb,
                                                 void *user_data) {
  grpc_tcp *tcp = (grpc_tcp *)ep;
  grpc_endpoint_write_status status;

#ifdef GRPC_TRACE_TCP
  size_t i;

  for (i = 0; i < nslices; i++) {
    char *data =
        gpr_hexdump((char *)GPR_SLICE_START_PTR(slices[i]),
                    GPR_SLICE_LENGTH(slices[i]), GPR_HEXDUMP_PLAINTEXT);
    gpr_log(GPR_DEBUG, "WRITE %p: %s", tcp, data);
    gpr_free(data);
  }
#endif

  GPR_ASSERT(tcp->write_cb == NULL);
  slice_state_init(&tcp->write_state, slices, nslices, nslices);

  status = grpc_tcp_flush(tcp);
  if (status == GRPC_ENDPOINT_WRITE_PENDING) {
    /* TODO(klempner): Consider inlining rather than malloc for small nslices */
    slice_state_realloc(&tcp->write_state, nslices);
    gpr_ref(&tcp->refcount);
    tcp->write_cb = cb;
    tcp->write_user_data = user_data;
    grpc_fd_notify_on_write(tcp->em_fd, &tcp->write_closure);
  }

  return status;
}

static void grpc_tcp_add_to_pollset(grpc_endpoint *ep, grpc_pollset *pollset) {
  grpc_tcp *tcp = (grpc_tcp *)ep;
  grpc_pollset_add_fd(pollset, tcp->em_fd);
}

static const grpc_endpoint_vtable vtable = {
    grpc_tcp_notify_on_read, grpc_tcp_write, grpc_tcp_add_to_pollset,
    grpc_tcp_shutdown, grpc_tcp_destroy};

grpc_endpoint *grpc_tcp_create(grpc_fd *em_fd, size_t slice_size) {
  grpc_tcp *tcp = (grpc_tcp *)gpr_malloc(sizeof(grpc_tcp));
  tcp->base.vtable = &vtable;
  tcp->fd = em_fd->fd;
  tcp->read_cb = NULL;
  tcp->write_cb = NULL;
  tcp->read_user_data = NULL;
  tcp->write_user_data = NULL;
  tcp->slice_size = slice_size;
  slice_state_init(&tcp->write_state, NULL, 0, 0);
  /* paired with unref in grpc_tcp_destroy */
  gpr_ref_init(&tcp->refcount, 1);
  tcp->em_fd = em_fd;
  tcp->read_closure.cb = grpc_tcp_handle_read;
  tcp->read_closure.cb_arg = tcp;
  tcp->write_closure.cb = grpc_tcp_handle_write;
  tcp->write_closure.cb_arg = tcp;
  return &tcp->base;
}

#endif