diff --git a/tools/profiling/microbenchmarks/README.md b/tools/profiling/microbenchmarks/README.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..035888ee188fbc97e4254b85c7a95708f0fca4d8 100644
--- a/tools/profiling/microbenchmarks/README.md
+++ b/tools/profiling/microbenchmarks/README.md
@@ -0,0 +1,4 @@
+Microbenchmarks
+====
+
+This directory contains helper scripts for the microbenchmark suites.
diff --git a/tools/profiling/microbenchmarks/bm_diff/README.md b/tools/profiling/microbenchmarks/bm_diff/README.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e1c728ffef3aa0424d488f29237139fab989b90b 100644
--- a/tools/profiling/microbenchmarks/bm_diff/README.md
+++ b/tools/profiling/microbenchmarks/bm_diff/README.md
@@ -0,0 +1,101 @@
+The bm_diff Family
+====
+
+This family of Python scripts is useful for fast iteration over performance
+tweaks. The tools let you save performance data from a baseline commit, then
+quickly compare data from your working branch against that baseline to see
+whether your changes produced any performance wins.
+
+The tools operate in three concrete steps, which can be invoked separately or
+all together via the driver script, bm_main.py. This README describes the
+typical workflow for these scripts, then details each script for advanced
+usage.
+
+## Normal Workflow
+
+Let's say you are working on a performance optimization for grpc_error. You have
+made some significant changes and want to see some data. From your branch, run
+(ensure everything is committed first):
+
+`tools/profiling/microbenchmarks/bm_diff/bm_main.py -b bm_error -l 5 -d master`
+
+This will build the `bm_error` binaries on both your branch and master, run
+the benchmarks 5 times each, and then compute the statistically significant
+performance differences between the two branches. This should show any
+performance wins your changes have made.
+
+If you have already invoked bm_main.py with `-d master`, use `-o old` for
+subsequent runs. This lets the script skip re-building and re-running the
+unchanged master branch.
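+
+For example:
+
+`tools/profiling/microbenchmarks/bm_diff/bm_main.py -b bm_error -l 5 -o old`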
+
+## bm_build.py
+
+This script builds the benchmarks. It takes a name parameter and stores the
+resulting binaries under a directory derived from that name. Both the `opt`
+and `counters` configurations are built: the `opt` build is used to measure
+cpu_time and real_time, while the `counters` build tracks other metrics such
+as allocations and atomic adds.
+
+For example, if you were to invoke (we assume everything is run from the 
+root of the repo):
+
+`tools/profiling/microbenchmarks/bm_diff/bm_build.py -b bm_error -n baseline`
+
+then the microbenchmark binaries will show up under 
+`bm_diff_baseline/{opt,counters}/bm_error`
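+
+The build step can also be driven from Python. The sketch below is only an
+illustration based on how bm_main.py invokes it; it assumes the repo root is
+the working directory and that this directory has been added to the import
+path:
+
+```python
+import multiprocessing
+import sys
+
+# Hypothetical path setup; adjust to where the repo is checked out.
+sys.path.append('tools/profiling/microbenchmarks/bm_diff')
+
+import bm_build
+
+# Build bm_error under the 'baseline' handle using every available CPU.
+# Binaries land in bm_diff_baseline/{opt,counters}/bm_error.
+bm_build.build('baseline', ['bm_error'], multiprocessing.cpu_count())
+```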
+
+## bm_run.py
+
+This script runs the benchmarks. It takes a name parameter that must match the
+name that was passed to `bm_build.py`. The script then runs each benchmark
+multiple times (default is 20; change it with the `--loops` flag). The output
+is saved as `<benchmark name>.<config>.<name>.<loop idx>.json`.
+
+For example, if you were to run:
+
+`tools/profiling/microbenchmarks/bm_diff/bm_run.py -b bm_error -n baseline -l 5`
+
+Then an example output file would be `bm_error.opt.baseline.1.json`
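+
+The run step can also be driven from Python; a minimal sketch under the same
+assumptions as the build sketch above (repo root as working directory, this
+directory on the import path):
+
+```python
+import multiprocessing
+import sys
+
+# Hypothetical path setup; adjust to where the repo is checked out.
+sys.path.append('tools/profiling/microbenchmarks/bm_diff')
+
+import bm_run
+
+# Run the 'baseline' build of bm_error 5 times (1 repetition per run),
+# producing bm_error.<config>.baseline.<loop idx>.json files.
+bm_run.run('baseline', ['bm_error'], multiprocessing.cpu_count(), 5, 1)
+```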
+
+## bm_diff.py
+
+This script takes the output from two benchmark runs, computes the diff
+between them, and prints any significant improvements or regressions. It takes
+two name parameters, old and new, which must correspond to benchmarks that
+have previously been built and run.
+
+For example, assuming you had already built and run a 'baseline' microbenchmark
+from master, and then you also built and ran a 'current' microbenchmark from
+the branch you were working on, you could invoke:
+
+`tools/profiling/microbenchmarks/bm_diff/bm_diff.py -b bm_error -o baseline -n current -l 5`
+
+This would output the percent difference between your branch and master.
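+
+The comparison can also be invoked from Python; a minimal sketch under the
+same assumptions as above, tracking the default metric set from bm_constants:
+
+```python
+import sys
+
+# Hypothetical path setup; adjust to where the repo is checked out.
+sys.path.append('tools/profiling/microbenchmarks/bm_diff')
+
+import bm_constants
+import bm_diff
+
+# Diff the 'current' run against the 'baseline' run over the same 5 loops.
+# The result is a text summary, falsy if no significant differences were found.
+diff = bm_diff.diff(['bm_error'], 5, sorted(bm_constants._INTERESTING), 'baseline', 'current')
+print(diff)
+```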
+
+## bm_main.py
+
+This is the driver script. It uses the previous three modules and does
+everything for you. You pass in the benchmarks to run, the number of loops,
+the number of CPUs to use, and the commit or branch to compare to. The script
+will then:
+* Build the benchmarks at head, then check out the commit to compare to and
+  build the benchmarks there
+* Run both sets of microbenchmarks
+* Run bm_diff.py to compare the two and output the difference (the equivalent
+  manual invocations are sketched below)
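+
+These steps roughly correspond to running the three scripts by hand with the
+handles bm_main.py uses internally (`old` for the baseline, `new` for your
+branch):
+
+`tools/profiling/microbenchmarks/bm_diff/bm_build.py -b bm_error -n new`
+(and again with `-n old` after checking out the baseline commit)
+
+`tools/profiling/microbenchmarks/bm_diff/bm_run.py -b bm_error -n new -l 5`
+(and again with `-n old`)
+
+`tools/profiling/microbenchmarks/bm_diff/bm_diff.py -b bm_error -o old -n new -l 5`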
+
+For example, one might run:
+
+`tools/profiling/microbenchmarks/bm_diff/bm_main.py -b bm_error -l 5 -d master`
+
+This would compare the current branch's error benchmarks to master.
+
+This script is invoked by our infrastructure on every PR to protect against
+regressions and demonstrate performance wins.
+
+However, if you are iterating quickly over different performance tweaks, it is
+unnecessary to build and run the baseline commit every time. If you are sure
+the baseline benchmarks have already been built and run, pass their name via
+the `--old` flag instead; the script will then build and run only the current
+branch. For example:
+
+`tools/profiling/microbenchmarks/bm_diff/bm_main.py -b bm_error -l 5 -o old`
+
diff --git a/tools/profiling/microbenchmarks/bm_diff/bm_build.py b/tools/profiling/microbenchmarks/bm_diff/bm_build.py
index a5d1ec3447570f75d7815e94bf01abf12ac21d56..83c3c695e77fda62e8ba94eaa82251cc72e5bb2b 100755
--- a/tools/profiling/microbenchmarks/bm_diff/bm_build.py
+++ b/tools/profiling/microbenchmarks/bm_diff/bm_build.py
@@ -40,10 +40,12 @@ import shutil
 
 def _args():
   argp = argparse.ArgumentParser(description='Builds microbenchmarks')
-  argp.add_argument('-b', '--benchmarks', nargs='+', choices=bm_constants._AVAILABLE_BENCHMARK_TESTS, default=bm_constants._AVAILABLE_BENCHMARK_TESTS)
-  argp.add_argument('-j', '--jobs', type=int, default=multiprocessing.cpu_count())
-  argp.add_argument('-n', '--name', type=str, help='Unique name of this build')
-  return argp.parse_args()
+  argp.add_argument('-b', '--benchmarks', nargs='+', choices=bm_constants._AVAILABLE_BENCHMARK_TESTS, default=bm_constants._AVAILABLE_BENCHMARK_TESTS, help='Which benchmarks to build')
+  argp.add_argument('-j', '--jobs', type=int, default=multiprocessing.cpu_count(), help='How many CPUs to dedicate to this task')
+  argp.add_argument('-n', '--name', type=str, help='Unique name of this build. To be used as a handle to pass to the other bm* scripts')
+  args = argp.parse_args()
+  assert args.name
+  return args
 
 def _make_cmd(cfg, benchmarks, jobs):
   return ['make'] + benchmarks + [
diff --git a/tools/profiling/microbenchmarks/bm_diff/bm_diff.py b/tools/profiling/microbenchmarks/bm_diff/bm_diff.py
index 3c871c1743b8428bdded506694a49faa155370eb..7b1c7e28bf25369119af91d4249df7db75d1399d 100755
--- a/tools/profiling/microbenchmarks/bm_diff/bm_diff.py
+++ b/tools/profiling/microbenchmarks/bm_diff/bm_diff.py
@@ -61,8 +61,8 @@ def _args():
                     nargs='+',
                     default=sorted(bm_constants._INTERESTING),
                     help='Which metrics to track')
-  argp.add_argument('-b', '--benchmarks', nargs='+', choices=bm_constants._AVAILABLE_BENCHMARK_TESTS, default=bm_constants._AVAILABLE_BENCHMARK_TESTS)
-  argp.add_argument('-l', '--loops', type=int, default=20)
+  argp.add_argument('-b', '--benchmarks', nargs='+', choices=bm_constants._AVAILABLE_BENCHMARK_TESTS, default=bm_constants._AVAILABLE_BENCHMARK_TESTS, help='Which benchmarks to run')
+  argp.add_argument('-l', '--loops', type=int, default=20, help='Number of times to loop the benchmarks. Must match what was passed to bm_run.py')
   argp.add_argument('-n', '--new', type=str, help='New benchmark name')
   argp.add_argument('-o', '--old', type=str, help='Old benchmark name')
   argp.add_argument('-v', '--verbose', type=bool, help='print details of before/after')
diff --git a/tools/profiling/microbenchmarks/bm_diff/bm_main.py b/tools/profiling/microbenchmarks/bm_diff/bm_main.py
index 1a46b170155fc9f27d55d8ea653e76580cfcd98f..82b0a10e07c2a41f28294a9c9fdc7079594c15ba 100755
--- a/tools/profiling/microbenchmarks/bm_diff/bm_main.py
+++ b/tools/profiling/microbenchmarks/bm_diff/bm_main.py
@@ -51,13 +51,16 @@ def _args():
                     nargs='+',
                     default=sorted(bm_constants._INTERESTING),
                     help='Which metrics to track')
-  argp.add_argument('-b', '--benchmarks', nargs='+', choices=bm_constants._AVAILABLE_BENCHMARK_TESTS, default=bm_constants._AVAILABLE_BENCHMARK_TESTS)
-  argp.add_argument('-d', '--diff_base', type=str)
-  argp.add_argument('-r', '--repetitions', type=int, default=1)
-  argp.add_argument('-l', '--loops', type=int, default=20)
-  argp.add_argument('-j', '--jobs', type=int, default=multiprocessing.cpu_count())
+  argp.add_argument('-b', '--benchmarks', nargs='+', choices=bm_constants._AVAILABLE_BENCHMARK_TESTS, default=bm_constants._AVAILABLE_BENCHMARK_TESTS, help='Which benchmarks to run')
+  argp.add_argument('-d', '--diff_base', type=str, help='Commit or branch to compare the current one to')
+  argp.add_argument('-o', '--old', type=str, help='Name of baseline run to compare to. Usually just called "old"')
+  argp.add_argument('-r', '--repetitions', type=int, default=1, help='Number of repetitions to pass to the benchmarks')
+  argp.add_argument('-l', '--loops', type=int, default=20, help='Number of times to loop the benchmarks. More loops cut down on noise')
+  argp.add_argument('-j', '--jobs', type=int, default=multiprocessing.cpu_count(), help='Number of CPUs to use')
   args = argp.parse_args()
-  assert args.diff_base
+  assert args.diff_base or args.old, "One of diff_base or old must be set!"
+  if args.loops < 3:
+    print "WARNING: This run will likely be noisy. Increase loops."
   return args
 
 
@@ -76,18 +79,21 @@ def main(args):
 
   bm_build.build('new', args.benchmarks, args.jobs)
 
-  where_am_i = subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip()
-  subprocess.check_call(['git', 'checkout', args.diff_base])
-  try:
-    bm_build.build('old', args.benchmarks, args.jobs)
-  finally:
-    subprocess.check_call(['git', 'checkout', where_am_i])
-    subprocess.check_call(['git', 'submodule', 'update'])
+  old = args.old
+  if args.diff_base:
+    old = 'old'
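+    # Rebuild the baseline at the requested commit, then restore the working
+    # branch (and its submodules) afterwards.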
+    where_am_i = subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip()
+    subprocess.check_call(['git', 'checkout', args.diff_base])
+    try:
+      bm_build.build('old', args.benchmarks, args.jobs)
+    finally:
+      subprocess.check_call(['git', 'checkout', where_am_i])
+      subprocess.check_call(['git', 'submodule', 'update'])
 
   bm_run.run('new', args.benchmarks, args.jobs, args.loops, args.repetitions)
-  bm_run.run('old', args.benchmarks, args.jobs, args.loops, args.repetitions)
+  bm_run.run(old, args.benchmarks, args.jobs, args.loops, args.repetitions)
 
-  diff = bm_diff.diff(args.benchmarks, args.loops, args.track, 'old', 'new')
+  diff = bm_diff.diff(args.benchmarks, args.loops, args.track, old, 'new')
   if diff:
     text = 'Performance differences noted:\n' + diff
   else:
diff --git a/tools/profiling/microbenchmarks/bm_diff/bm_run.py b/tools/profiling/microbenchmarks/bm_diff/bm_run.py
index 14b3718ecb342df752e24567467cf7c99bf7d6ec..b36e660f29f428d762859910606d248c2134b733 100755
--- a/tools/profiling/microbenchmarks/bm_diff/bm_run.py
+++ b/tools/profiling/microbenchmarks/bm_diff/bm_run.py
@@ -44,12 +44,16 @@ import jobset
 
 def _args():
   argp = argparse.ArgumentParser(description='Runs microbenchmarks')
-  argp.add_argument('-b', '--benchmarks', nargs='+', choices=bm_constants._AVAILABLE_BENCHMARK_TESTS, default=bm_constants._AVAILABLE_BENCHMARK_TESTS)
-  argp.add_argument('-j', '--jobs', type=int, default=multiprocessing.cpu_count())
-  argp.add_argument('-n', '--name', type=str, help='Unique name of this build')
-  argp.add_argument('-r', '--repetitions', type=int, default=1)
-  argp.add_argument('-l', '--loops', type=int, default=20)
-  return argp.parse_args()
+  argp.add_argument('-b', '--benchmarks', nargs='+', choices=bm_constants._AVAILABLE_BENCHMARK_TESTS, default=bm_constants._AVAILABLE_BENCHMARK_TESTS, help='Benchmarks to run')
+  argp.add_argument('-j', '--jobs', type=int, default=multiprocessing.cpu_count(), help='Number of CPUs to use')
+  argp.add_argument('-n', '--name', type=str, help='Unique name of the build to run. Needs to match the handle passed to bm_build.py')
+  argp.add_argument('-r', '--repetitions', type=int, default=1, help='Number of repetitions to pass to the benchmarks')
+  argp.add_argument('-l', '--loops', type=int, default=20, help='Number of times to loop the benchmarks. More loops cut down on noise')
+  args = argp.parse_args()
+  assert args.name
+  if args.loops < 3:
+    print "WARNING: This run will likely be noisy. Increase loops."
+  return args
 
 def _collect_bm_data(bm, cfg, name, reps, idx, loops):
   cmd = ['bm_diff_%s/%s/%s' % (name, cfg, bm),
@@ -73,5 +77,4 @@ def run(name, benchmarks, jobs, loops, reps):
 
 if __name__ == '__main__':
   args = _args()
-  assert args.name
   run(args.name, args.benchmarks, args.jobs, args.loops, args.repetitions)
diff --git a/tools/profiling/microbenchmarks/bm_diff/bm_speedup.py b/tools/profiling/microbenchmarks/bm_diff/bm_speedup.py
index fb6622760b98561f2b6952091fa11b55897e874a..99f1a073f5da88b1b6b6a87202db2848833ad312 100755
--- a/tools/profiling/microbenchmarks/bm_diff/bm_speedup.py
+++ b/tools/profiling/microbenchmarks/bm_diff/bm_speedup.py
@@ -44,7 +44,6 @@ def cmp(a, b):
 def speedup(new, old):
   if (len(set(new))) == 1 and new == old: return 0
   s0, p0 = cmp(new, old)
-  print s0, p0
   if math.isnan(p0): return 0
   if s0 == 0: return 0
   if p0 > _THRESHOLD: return 0
@@ -52,7 +51,6 @@ def speedup(new, old):
     pct = 1
     while pct < 101:
       sp, pp = cmp(new, scale(old, 1 - pct/100.0))
-      print sp, pp
       if sp > 0: break
       if pp > _THRESHOLD: break
       pct += 1
@@ -61,7 +59,6 @@ def speedup(new, old):
     pct = 1
     while pct < 100000:
       sp, pp = cmp(new, scale(old, 1 + pct/100.0))
-      print sp, pp
       if sp < 0: break
       if pp > _THRESHOLD: break
       pct += 1