Merge pull request #4783 from ctiller/cpu-cost

Use CPU cost modelling to better utilize test resources

Merge pull request #4783 from ctiller/cpu-cost
4c3c397b · Jan Tattermusch · 6990ce22 · 0eef9eef · 4c3c397b · 4c3c397b
Commit 4c3c397b authored Jan 20, 2016 by Jan Tattermusch
--- a/build.yaml
+++ b/build.yaml
@@ -922,6 +922,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: dualstack_socket_test
+  cpu_cost: 0.1
  build: test
  language: c
  src:
@@ -996,6 +997,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: fling_stream_test
+  cpu_cost: 2
  build: test
  language: c
  src:
@@ -1010,6 +1012,7 @@ targets:
  - linux
  - posix
 - name: fling_test
+  cpu_cost: 2
  build: test
  language: c
  src:
@@ -1118,6 +1121,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: gpr_stack_lockfree_test
+  cpu_cost: 10
  build: test
  language: c
  src:
@@ -1134,6 +1138,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: gpr_sync_test
+  cpu_cost: 10
  build: test
  language: c
  src:
@@ -1142,6 +1147,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: gpr_thd_test
+  cpu_cost: 10
  build: test
  language: c
  src:
@@ -1368,6 +1374,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: httpcli_test
+  cpu_cost: 0.5
  build: test
  language: c
  src:
@@ -1382,6 +1389,7 @@ targets:
  - linux
  - posix
 - name: httpscli_test
+  cpu_cost: 0.5
  build: test
  language: c
  src:
@@ -1463,6 +1471,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: lb_policies_test
+  cpu_cost: 0.1
  build: test
  language: c
  src:
@@ -1515,6 +1524,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: no_server_test
+  cpu_cost: 0.1
  build: test
  language: c
  src:
@@ -1575,6 +1585,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: set_initial_connect_string_test
+  cpu_cost: 0.1
  build: test
  language: c
  src:
@@ -1620,6 +1631,7 @@ targets:
  - linux
  - posix
 - name: tcp_client_posix_test
+  cpu_cost: 0.5
  build: test
  language: c
  src:
@@ -1634,6 +1646,7 @@ targets:
  - linux
  - posix
 - name: tcp_posix_test
+  cpu_cost: 0.5
  build: test
  language: c
  src:
@@ -1863,6 +1876,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: client_crash_test
+  cpu_cost: 0.1
  build: test
  language: c++
  src:
@@ -1941,6 +1955,7 @@ targets:
  - gpr_test_util
  - gpr
 - name: end2end_test
+  cpu_cost: 0.5
  build: test
  language: c++
  src:
@@ -2084,6 +2099,7 @@ targets:
  - linux
  - posix
 - name: interop_test
+  cpu_cost: 0.1
  build: test
  language: c++
  src:
@@ -2175,6 +2191,7 @@ targets:
  - linux
  - posix
 - name: qps_test
+  cpu_cost: 10
  build: test
  language: c++
  src:
@@ -2277,6 +2294,7 @@ targets:
  - linux
  - posix
 - name: server_crash_test
+  cpu_cost: 0.1
  build: test
  language: c++
  src:
@@ -2405,6 +2423,7 @@ targets:
  - linux
  - posix
 - name: thread_stress_test
+  cpu_cost: 100
  build: test
  language: c++
  src:

--- a/src/boringssl/gen_build_yaml.py
+++ b/src/boringssl/gen_build_yaml.py
@@ -137,7 +137,8 @@ class Grpc(object):
            'platforms': ['linux', 'mac', 'posix', 'windows'],
            'flaky': False,
            'language': 'c++',
-            'boringssl': True
+            'boringssl': True,
+            'cpu_cost': 1.0
          }
          for test in files['tests']
      ]

--- a/templates/tools/run_tests/tests.json.template
+++ b/templates/tools/run_tests/tests.json.template
@@ -10,7 +10,8 @@
                 "ci_platforms": tgt.ci_platforms,
                 "exclude_configs": tgt.get("exclude_configs", []),
                 "args": [],
-                 "flaky": tgt.flaky}
+                 "flaky": tgt.flaky,
+                 "cpu_cost": tgt.get("cpu_cost", 1.0)}
                for tgt in targets
                if tgt.get('run', True) and tgt.build == 'test'] +
                tests,

--- a/test/core/bad_client/gen_build_yaml.py
+++ b/test/core/bad_client/gen_build_yaml.py
 #!/usr/bin/env python2.7
-# Copyright 2015, Google Inc.
+# Copyright 2015-2016, Google Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -35,15 +35,15 @@
 import collections
 import yaml

-TestOptions = collections.namedtuple('TestOptions', 'flaky')
-default_test_options = TestOptions(False)
+TestOptions = collections.namedtuple('TestOptions', 'flaky cpu_cost')
+default_test_options = TestOptions(False, 1.0)

 # maps test names to options
 BAD_CLIENT_TESTS = {
    'badreq': default_test_options,
-    'connection_prefix': default_test_options,
-    'headers': default_test_options,
-    'initial_settings_frame': default_test_options,
+    'connection_prefix': default_test_options._replace(cpu_cost=0.2),
+    'headers': default_test_options._replace(cpu_cost=0.2),
+    'initial_settings_frame': default_test_options._replace(cpu_cost=0.2),
    'server_registered_method': default_test_options,
    'simple_request': default_test_options,
    'window_overflow': default_test_options,
@@ -75,6 +75,7 @@ def main():
      'targets': [
          {
              'name': '%s_bad_client_test' % t,
+              'cpu_cost': BAD_CLIENT_TESTS[t].cpu_cost,
              'build': 'test',
              'language': 'c',
              'secure': 'no',

--- a/test/core/bad_ssl/gen_build_yaml.py
+++ b/test/core/bad_ssl/gen_build_yaml.py
 #!/usr/bin/env python2.7
-# Copyright 2015, Google Inc.
+# Copyright 2015-2016, Google Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -35,13 +35,13 @@
 import collections
 import yaml

-TestOptions = collections.namedtuple('TestOptions', 'flaky')
-default_test_options = TestOptions(False)
+TestOptions = collections.namedtuple('TestOptions', 'flaky cpu_cost')
+default_test_options = TestOptions(False, 1.0)

 # maps test names to options
 BAD_CLIENT_TESTS = {
-  'cert': default_test_options,
-  'alpn': default_test_options,
+    'cert': default_test_options._replace(cpu_cost=0.1),
+    'alpn': default_test_options._replace(cpu_cost=0.1),
 }

 def main():
@@ -84,6 +84,7 @@ def main():
      for t in sorted(BAD_CLIENT_TESTS.keys())] + [
          {
              'name': 'bad_ssl_%s_test' % t,
+              'cpu_cost': BAD_CLIENT_TESTS[t].cpu_cost,
              'build': 'test',
              'language': 'c',
              'src': ['test/core/bad_ssl/bad_ssl_test.c'],

--- a/test/core/end2end/gen_build_yaml.py
+++ b/test/core/end2end/gen_build_yaml.py
@@ -77,40 +77,42 @@ END2END_FIXTURES = {
 }

 TestOptions = collections.namedtuple(
-    'TestOptions', 'needs_fullstack needs_dns proxyable secure traceable')
-default_test_options = TestOptions(False, False, True, False, True)
+    'TestOptions', 'needs_fullstack needs_dns proxyable secure traceable cpu_cost')
+default_test_options = TestOptions(False, False, True, False, True, 1.0)
 connectivity_test_options = default_test_options._replace(needs_fullstack=True)

+LOWCPU = 0.1
+
 # maps test names to options
 END2END_TESTS = {
    'bad_hostname': default_test_options,
    'binary_metadata': default_test_options,
    'call_creds': default_test_options._replace(secure=True),
-    'cancel_after_accept': default_test_options,
-    'cancel_after_client_done': default_test_options,
-    'cancel_after_invoke': default_test_options,
-    'cancel_before_invoke': default_test_options,
-    'cancel_in_a_vacuum': default_test_options,
-    'cancel_with_status': default_test_options,
-    'channel_connectivity': connectivity_test_options._replace(proxyable=False),
+    'cancel_after_accept': default_test_options._replace(cpu_cost=LOWCPU),
+    'cancel_after_client_done': default_test_options._replace(cpu_cost=LOWCPU),
+    'cancel_after_invoke': default_test_options._replace(cpu_cost=LOWCPU),
+    'cancel_before_invoke': default_test_options._replace(cpu_cost=LOWCPU),
+    'cancel_in_a_vacuum': default_test_options._replace(cpu_cost=LOWCPU),
+    'cancel_with_status': default_test_options._replace(cpu_cost=LOWCPU),
+    'channel_connectivity': connectivity_test_options._replace(proxyable=False, cpu_cost=LOWCPU),
    'channel_ping': connectivity_test_options._replace(proxyable=False),
-    'compressed_payload': default_test_options._replace(proxyable=False),
+    'compressed_payload': default_test_options._replace(proxyable=False, cpu_cost=LOWCPU),
    'default_host': default_test_options._replace(needs_fullstack=True,
                                                  needs_dns=True),
    'disappearing_server': connectivity_test_options,
    'empty_batch': default_test_options,
-    'graceful_server_shutdown': default_test_options,
+    'graceful_server_shutdown': default_test_options._replace(cpu_cost=LOWCPU),
    'hpack_size': default_test_options._replace(proxyable=False,
                                                traceable=False),
    'high_initial_seqno': default_test_options,
    'invoke_large_request': default_test_options,
    'large_metadata': default_test_options,
    'max_concurrent_streams': default_test_options._replace(proxyable=False),
-    'max_message_length': default_test_options,
+    'max_message_length': default_test_options._replace(cpu_cost=LOWCPU),
    'metadata': default_test_options,
    'negative_deadline': default_test_options,
    'no_op': default_test_options,
-    'payload': default_test_options,
+    'payload': default_test_options._replace(cpu_cost=LOWCPU),
    'ping_pong_streaming': default_test_options,
    'registered_call': default_test_options,
    'request_with_flags': default_test_options._replace(proxyable=False),
@@ -118,7 +120,7 @@ END2END_TESTS = {
    'server_finishes_request': default_test_options,
    'shutdown_finishes_calls': default_test_options,
    'shutdown_finishes_tags': default_test_options,
-    'simple_delayed_request': connectivity_test_options,
+    'simple_delayed_request': connectivity_test_options._replace(cpu_cost=LOWCPU),
    'simple_request': default_test_options,
    'trailing_metadata': default_test_options,
 }
@@ -252,6 +254,7 @@ def main():
                                   END2END_FIXTURES[f].platforms, 'mac')),
              'flaky': False,
              'language': 'c',
+              'cpu_cost': END2END_TESTS[t].cpu_cost,
          }
          for f in sorted(END2END_FIXTURES.keys())
          for t in sorted(END2END_TESTS.keys()) if compatible(f, t)
@@ -266,6 +269,7 @@ def main():
                                   END2END_FIXTURES[f].platforms, 'mac')),
              'flaky': False,
              'language': 'c',
+              'cpu_cost': END2END_TESTS[t].cpu_cost,
          }
          for f in sorted(END2END_FIXTURES.keys())
          if not END2END_FIXTURES[f].secure

--- a/tools/buildgen/build-cleaner.py
+++ b/tools/buildgen/build-cleaner.py
 #!/usr/bin/env python2.7
-# Copyright 2015, Google Inc.
+# Copyright 2015-2016, Google Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -41,6 +41,7 @@ _TOP_LEVEL_KEYS = ['settings', 'proto_deps', 'filegroups', 'libs', 'targets', 'v
 _VERSION_KEYS = ['major', 'minor', 'micro', 'build']
 _ELEM_KEYS = [
    'name',
+    'cpu_cost',
    'flaky',
    'build',
    'run',

--- a/tools/run_tests/jobset.py
+++ b/tools/run_tests/jobset.py
@@ -33,6 +33,7 @@ import hashlib
 import multiprocessing
 import os
 import platform
+import re
 import signal
 import subprocess
 import sys
@@ -40,6 +41,10 @@ import tempfile
 import time


+# cpu cost measurement
+measure_cpu_costs = False
+
+
 _DEFAULT_MAX_JOBS = 16 * multiprocessing.cpu_count()
 _MAX_RESULT_SIZE = 8192

@@ -146,7 +151,7 @@ class JobSpec(object):

  def __init__(self, cmdline, shortname=None, environ=None, hash_targets=None,
               cwd=None, shell=False, timeout_seconds=5*60, flake_retries=0,
-               timeout_retries=0, kill_handler=None):
+               timeout_retries=0, kill_handler=None, cpu_cost=1.0):
    """
    Arguments:
      cmdline: a list of arguments to pass as the command line
@@ -154,6 +159,7 @@ class JobSpec(object):
      hash_targets: which files to include in the hash representing the jobs version
                    (or empty, indicating the job should not be hashed)
      kill_handler: a handler that will be called whenever job.kill() is invoked
+      cpu_cost: average number of cores this job keeps busy while it runs
    """
    if environ is None:
      environ = {}
@@ -169,6 +175,7 @@ class JobSpec(object):
    self.flake_retries = flake_retries
    self.timeout_retries = timeout_retries
    self.kill_handler = kill_handler
+    self.cpu_cost = cpu_cost

  def identity(self):
    return '%r %r %r' % (self.cmdline, self.environ, self.hash_targets)
@@ -218,7 +225,10 @@ class Job(object):
    env.update(self._spec.environ)
    env.update(self._add_env)
    self._start = time.time()
-    try_start = lambda: subprocess.Popen(args=self._spec.cmdline,
+    cmdline = self._spec.cmdline
+    if measure_cpu_costs:
+      cmdline = ['time', '--portability'] + cmdline
+    try_start = lambda: subprocess.Popen(args=cmdline,
                                         stderr=subprocess.STDOUT,
                                         stdout=self._tempfile,
                                         cwd=self._spec.cwd,
@@ -267,8 +277,17 @@ class Job(object):
          self.result.returncode = self._process.returncode
      else:
        self._state = _SUCCESS
-        message('PASSED', '%s [time=%.1fsec; retries=%d;%d]' % (
-                    self._spec.shortname, elapsed, self._retries, self._timeout_retries),
+        measurement = ''
+        if measure_cpu_costs:
+          m = re.search(r'real ([0-9.]+)\nuser ([0-9.]+)\nsys ([0-9.]+)', stdout())
+          real = float(m.group(1))
+          user = float(m.group(2))
+          sys = float(m.group(3))
+          if real > 0.5:
+            cores = (user + sys) / real
+            measurement = '; cpu_cost=%.01f; estimated=%.01f' % (cores, self._spec.cpu_cost)
+        message('PASSED', '%s [time=%.1fsec; retries=%d:%d%s]' % (
+                    self._spec.shortname, elapsed, self._retries, self._timeout_retries, measurement),
            do_newline=self._newline_on_success or self._travis)
        self.result.state = 'PASSED'
        if self._bin_hash:
@@ -329,10 +348,19 @@ class Jobset(object):
  def get_num_failures(self):
    return self._failures

+  def cpu_cost(self):
+    """Return the combined cpu_cost of every job that is currently running."""
+    running_specs = (job._spec for job in self._running)
+    total_cost = sum(spec.cpu_cost for spec in running_specs)
+    return total_cost
+
  def start(self, spec):
    """Start a job. Return True on success, False on failure."""
-    while len(self._running) >= self._maxjobs:
+    while True:
      if self.cancelled(): return False
+      current_cpu_cost = self.cpu_cost()
+      # Admit when idle (so an over-budget job can still run) or when the job fits; <= lets the _maxjobs budget fill completely.
+      if current_cpu_cost == 0 or current_cpu_cost + spec.cpu_cost <= self._maxjobs: break
      self.reap()
    if self.cancelled(): return False
    if spec.hash_targets:

--- a/tools/run_tests/run_tests.py
+++ b/tools/run_tests/run_tests.py
@@ -78,7 +78,7 @@ class SimpleConfig(object):
    self.timeout_multiplier = timeout_multiplier

  def job_spec(self, cmdline, hash_targets, timeout_seconds=5*60,
-               shortname=None, environ={}):
+               shortname=None, environ={}, cpu_cost=1.0):
    """Construct a jobset.JobSpec for a test under this config

       Args:
@@ -96,6 +96,7 @@ class SimpleConfig(object):
    return jobset.JobSpec(cmdline=cmdline,
                          shortname=shortname,
                          environ=actual_environ,
+                          cpu_cost=cpu_cost,
                          timeout_seconds=self.timeout_multiplier * timeout_seconds,
                          hash_targets=hash_targets
                              if self.allow_hashing else None,
@@ -114,11 +115,12 @@ class ValgrindConfig(object):
    self.args = args
    self.allow_hashing = False

-  def job_spec(self, cmdline, hash_targets):
+  def job_spec(self, cmdline, hash_targets, cpu_cost=1.0):
    return jobset.JobSpec(cmdline=['valgrind', '--tool=%s' % self.tool] +
                          self.args + cmdline,
                          shortname='valgrind %s' % cmdline[0],
                          hash_targets=None,
+                          cpu_cost=cpu_cost,
                          flake_retries=5 if args.allow_flakes else 0,
                          timeout_retries=3 if args.allow_flakes else 0)

@@ -157,6 +159,7 @@ class CLanguage(object):
        cmdline = [binary] + target['args']
        out.append(config.job_spec(cmdline, [binary],
                                   shortname=' '.join(cmdline),
+                                   cpu_cost=target['cpu_cost'],
                                   environ={'GRPC_DEFAULT_SSL_ROOTS_FILE_PATH':
                                            os.path.abspath(os.path.dirname(
                                                sys.argv[0]) + '/../../src/core/tsi/test_creds/ca.pem')}))
@@ -600,7 +603,7 @@ argp.add_argument('-n', '--runs_per_test', default=1, type=runs_per_test_type,
        help='A positive integer or "inf". If "inf", all tests will run in an '
             'infinite loop. Especially useful in combination with "-f"')
 argp.add_argument('-r', '--regex', default='.*', type=str)
-argp.add_argument('-j', '--jobs', default=2 * multiprocessing.cpu_count(), type=int)
+argp.add_argument('-j', '--jobs', default=multiprocessing.cpu_count(), type=int)
 argp.add_argument('-s', '--slowdown', default=1.0, type=float)
 argp.add_argument('-f', '--forever',
                  default=False,
@@ -647,6 +650,8 @@ argp.add_argument('--build_only',
                  action='store_const',
                  const=True,
                  help='Perform all the build steps but dont run any tests.')
+argp.add_argument('--measure_cpu_costs', default=False, action='store_const', const=True,
+                  help='Measure the cpu costs of tests')
 argp.add_argument('--update_submodules', default=[], nargs='*',
                  help='Update some submodules before building. If any are updated, also run generate_projects. ' +
                       'Submodules are specified as SUBMODULE_NAME:BRANCH; if BRANCH is omitted, master is assumed.')
@@ -655,6 +660,8 @@ argp.add_argument('-x', '--xml_report', default=None, type=str,
        help='Generates a JUnit-compatible XML report')
 args = argp.parse_args()

+jobset.measure_cpu_costs = args.measure_cpu_costs
+
 if args.use_docker:
  if not args.travis:
    print 'Seen --use_docker flag, will run tests under docker.'

--- a/tools/run_tests/tests.json
+++ b/tools/run_tests/tests.json