| # Copyright 2025 The Pigweed Authors |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| # use this file except in compliance with the License. You may obtain a copy of |
| # the License at |
| # |
| # https://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| # License for the specific language governing permissions and limitations under |
| # the License. |
| """Bisect build failures without storing state. |
| |
Look at builders that track a specific repository. Look at builds of those
builders by the commits that triggered them (so "most recent" refers not to
when the build ran but to when the commit that triggered the build was
submitted). If the most recent builds failed but less recent builds passed,
trigger a build at a commit between the oldest failure and the most recent
pass.
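
For example, with history listed newest-first, where "P" marks a commit whose
build passed, "F" a failed build, and "?" a commit with no build:

    F F ? ? ? P P

the bisector triggers a build at the middle "?" commit. Later runs halve the
range until the oldest failure and the newest pass are adjacent, at which
point the failure is attributed to the oldest failing commit.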
| |
| If there's already a build running (or scheduled) with that commit, do nothing. |
| Once that running build completes, the next bisector run will have different |
| parameters and will take the next step in the bisection. |
| |
Use buildbucket instead of luci-scheduler to trigger builds so bisection
builds won't get combined with gitiles poller-triggered builds.
(Luci-scheduler combines many triggers that arrive in a short window into
just one build, so a bisection build might otherwise be merged with a build
created for a newly submitted change.)
| |
This is related to the rerunner recipe but has a different purpose. The
rerunner makes builders green again when they failed because of a flake, or
confirms that the failure is not a flake. The bisector attributes a new
non-flake failure to a specific commit.
| """ |
| |
| from __future__ import annotations |
| |
| import collections |
| import dataclasses |
| import datetime |
import fnmatch
import itertools
| import json |
| from typing import Generator, Sequence |
| import urllib |
| |
| from PB.recipe_engine import result as result_pb |
| from PB.go.chromium.org.luci.buildbucket.proto import ( |
| build as build_pb, |
| builds_service as builds_service_pb, |
| common as common_pb, |
| project_config as bb_pb, |
| ) |
| from PB.recipe_modules.pigweed.bisector.options import ( |
| Options as BisectorOptions, |
| ) |
| from PB.recipe_modules.pigweed.checkout.options import ( |
| Options as CheckoutOptions, |
| ) |
| from recipe_engine import post_process, recipe_api, recipe_test_api |
| |
| |
| @dataclasses.dataclass(frozen=True) |
| class Remote: |
| url: str |
| branch: str |
| |
| |
| @dataclasses.dataclass |
| class Trigger: |
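    """A bisection build to schedule via buildbucket."""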
| bucket: str |
| builder: str |
| remote: str |
| ref: str |
| commit: str |
| |
| |
| @dataclasses.dataclass |
| class ProcessBuilderResult: |
| included: bool = False |
| triggers: list[Trigger] = dataclasses.field(default_factory=list) |
| |
| |
| @dataclasses.dataclass |
| class AgeCommit: |
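    """A commit and its index ("age") in a newest-first gitiles log."""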
| age: int |
| commit: str |
| |
| |
| class BisectorApi(recipe_api.RecipeApi): |
| |
    def include_bucket(self, props: BisectorOptions, bucket: str) -> bool:
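        """Return True if builders in this bucket should be processed.

        Exclusions take precedence over inclusions. If neither list is
        configured, default to including the 'ci' bucket and any '*.ci'
        buckets.
        """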
| if not props.excluded_buckets and not props.included_buckets: |
| props.included_buckets.append('*.ci') |
| props.included_buckets.append('ci') |
| |
| for excluded_bucket in props.excluded_buckets: |
| if fnmatch.fnmatch(bucket, excluded_bucket): |
| return False |
| |
| for included_bucket in props.included_buckets: |
| if fnmatch.fnmatch(bucket, included_bucket): |
| return True |
| |
| return False |
| |
| def process_builder( |
| self, |
| bucket_name: str, |
| builder: bb_pb.BuilderConfig, |
| repos: dict[Remote, tuple[str, ...]], |
| max_age_days: int, |
| num_builds: int, |
    ) -> ProcessBuilderResult:
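        """Evaluate one builder and decide whether to bisect it.

        Returns a ProcessBuilderResult recording whether the builder was
        included in bisection and, if a new build should be scheduled, a
        Trigger describing that build.
        """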
| result = ProcessBuilderResult() |
| |
| if not builder.properties: |
| self.m.step.empty('no properties') |
| return result |
| |
| build_props = json.loads(builder.properties) |
| |
| if build_props.get('do_not_bisect'): |
| self.m.step.empty('do not bisect') |
| return result |
| |
| # Don't DoS buildbucket. (And there's no need for this |
| # builder to run quickly.) |
| self.m.time.sleep(0.5) |
| |
| status: BuilderStatus = self.m.builder_status.retrieve( |
| bucket=bucket_name, |
| builder=builder.name, |
| n=num_builds, |
| max_age=datetime.timedelta(days=max_age_days), |
| ) |
| |
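        # Count how many recent builds each (repository, branch) pair
        # triggered, to determine which repository this builder tracks.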
        triggers: dict[Remote, int] = {}
| for build in status.builds: |
| if commit := build.input.gitiles_commit: |
| if not commit.ref.startswith('refs/heads/'): |
| continue |
| |
| remote = Remote( |
| f'https://{commit.host}/{commit.project}', |
| commit.ref.removeprefix('refs/heads/'), |
| ) |
| triggers.setdefault(remote, 0) |
| triggers[remote] += 1 |
| |
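        # Pick the remote that triggered the most builds.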
| maxcount, maxremote = 0, None |
| for remote, count in triggers.items(): |
| pres = self.m.step.empty(f'remote {remote}').presentation |
| pres.step_summary_text = f'count {count}' |
| |
| if remote and count > maxcount: |
| maxcount = count |
| maxremote = remote |
| |
| if not maxremote: |
| pres = self.m.step.empty('no remote').presentation |
| pres.step_summary_text = repr(triggers) |
| return result |
| |
        # If no single repository accounts for a large share of the
        # triggers, either the builder is triggered from multiple
        # repositories or it has mostly been rerun by the rerunner, and
        # bisection is unlikely to help. (Bisector-launched builds have
        # triggering commits but rerunner-launched builds do not.)
| if maxcount < len(status.builds) / 4: |
| self.m.step.empty('count too low') |
| return result |
| |
| remote = maxremote |
| result.included = True |
| |
| # If the builder hasn't recently passed, we don't have a |
| # known good commit and can't bisect. |
| if not self.m.builder_status.has_recently_passed(status): |
| self.m.step.empty('no recent passes') |
| return result |
| |
| # If five or more of the recent builds are infra failures, |
| # don't trigger more builds. |
| infra_failures = [ |
| x for x in status.builds if x.status == common_pb.INFRA_FAILURE |
| ] |
| if len(infra_failures) >= 5: |
| self.m.step.empty('too many infra failures') |
| return result |
| |
| if remote not in repos: |
| # Avoid hitting gitiles quota. (And there's no need for |
| # this builder to run quickly.) |
| self.m.time.sleep(10.0) |
| |
| def gitiles_log(): |
| with self.m.time.timeout(30): |
| return self.m.gitiles.log( |
| url=remote.url, |
| treeish=remote.branch, |
| limit=200, |
| test_data=[ |
| {'id': self.test_api.commit40(x)} |
| for x in range(200) |
| ], |
| ) |
| |
| # If the gitiles query fails it's likely a quota issue. Wait |
| # two minutes and try again. Only retry once since this could |
| # be a permissions issue. |
| try: |
| log = gitiles_log() |
| except self.m.step.StepFailure: # pragma: no cover |
| self.m.time.sleep(2 * 60.0) |
| log = gitiles_log() |
| |
            repos[remote] = tuple(x['id'] for x in log)
| |
| statuses: dict[str, list[common_pb.Status]] = {} |
| for build in status.builds: |
| if ( |
| build.status & common_pb.ENDED_MASK |
| and build.input.gitiles_commit |
| ): |
| commit = build.input.gitiles_commit.id |
| statuses.setdefault(commit, []) |
| statuses[commit].append(build.status) |
| |
| passes = { |
| commit |
| for commit, status in statuses.items() |
| if common_pb.SUCCESS in status |
| } |
| failures = { |
| commit |
| for commit, status in statuses.items() |
| if common_pb.FAILURE in status |
| } |
| |
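        # The log is newest-first, so lower indices are newer commits.
        # Walking from newest to oldest, the first passing commit found is
        # the newest pass, and the last failure recorded before it is the
        # oldest failure that is still newer than that pass.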
| oldest_failure: AgeCommit | None = None |
| newest_passing: AgeCommit | None = None |
| for i, commit in enumerate(repos[remote]): |
| if commit in passes: |
| newest_passing = AgeCommit(i, commit) |
| break |
| if commit in failures: |
| oldest_failure = AgeCommit(i, commit) |
| |
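        # Render dataclasses as dicts for step presentation; pass through
        # values (like None) that have no __dict__.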
| def _vars(obj): |
| if hasattr(obj, '__dict__'): |
| return vars(obj) |
| return obj |
| |
| if oldest_failure is None or newest_passing is None: |
| pres = self.m.step.empty('no need to bisect').presentation |
| pres.step_summary_text = repr( |
| dict( |
| oldest_failure=_vars(oldest_failure), |
| newest_passing=_vars(newest_passing), |
| ) |
| ) |
| return result |
| |
| output_props = { |
| 'newest_passing': _vars(newest_passing), |
| 'oldest_failure': _vars(oldest_failure), |
| 'attributed': False, |
| 'scheduling': False, |
| } |
| wrapped_props = {bucket_name: {builder.name: output_props}} |
| |
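        # Pick the commit halfway between the oldest failure and the
        # newest pass. If the midpoint lands on either endpoint the two
        # are adjacent, so the failure is already attributed to the oldest
        # failing commit.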
| i = oldest_failure.age + (newest_passing.age - oldest_failure.age) // 2 |
| if i in (newest_passing.age, oldest_failure.age): |
| self.m.step.empty('already attributed') |
| output_props['attributed'] = True |
| self.m.step.empty('props').presentation.properties = wrapped_props |
| return result |
| |
| with self.m.step.nest('picked') as pres: |
| commit = repos[remote][i] |
| self.m.step.empty(str(i)) |
| pres.logs[commit[0:7]] = f'{remote.url}/+/{commit}' |
| self.m.step.empty(commit) |
| output_props['picked'] = _vars(AgeCommit(i, commit)) |
| |
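        # If a build of the picked commit is already scheduled or running,
        # don't schedule another; the next bisector run will take the next
        # step once it completes.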
| output_props['already_scheduled'] = False |
| for build in status.builds: |
| if ( |
| build.status |
| in ( |
| common_pb.SCHEDULED, |
| common_pb.STARTED, |
| ) |
| and build.input.gitiles_commit |
| and build.input.gitiles_commit.id == commit |
| ): |
| self.m.step.empty('already scheduled') |
| output_props['already_scheduled'] = True |
| pres = self.m.step.empty('props').presentation |
| pres.properties = wrapped_props |
| return result |
| |
| self.m.step.empty('scheduling') |
| result.triggers.append( |
| Trigger( |
| bucket=bucket_name, |
| builder=builder.name, |
| remote=remote.url, |
| ref=remote.branch, |
| commit=commit, |
| ) |
| ) |
| output_props['scheduling'] = True |
| self.m.step.empty('props').presentation.properties = wrapped_props |
| |
| return result |
| |
    def __call__(self, opts: BisectorOptions) -> result_pb.RawResult:
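        """Run one bisection pass over all builders in the project."""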
| opts.dry_run = opts.dry_run or self.m.buildbucket_util.is_dev_or_try |
| opts.max_age_days = opts.max_age_days or 7 |
| opts.num_builds = opts.num_builds or 15 |
| |
        # The 999999999 value is not meant to be a limit that's actually
        # reached; it just removes the need to check whether
        # opts.max_triggered_builds is zero.
        opts.max_triggered_builds = opts.max_triggered_builds or 999999999
| |
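        # Prune completed builds from the persisted list of triggered
        # builds so they no longer count against max_triggered_builds.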
| state = self.m.builder_state.fetch_previous_state() |
| state.setdefault('triggered_builds', []) |
| builds = self.m.buildbucket.get_multi(state['triggered_builds']) |
| changed = False |
| for bbid, build in builds.items(): |
| if build.output.status & common_pb.ENDED_MASK: |
| state['triggered_builds'].remove(bbid) |
| changed = True |
| |
| if changed: |
| self.m.builder_state.save(state) |
| |
| if len(state['triggered_builds']) >= opts.max_triggered_builds: |
| summary = ( |
| f'Already running {len(state["triggered_builds"])} bisection ' |
| 'builds.' |
| ) |
| |
| pres = self.m.step.empty('max triggered builds').presentation |
| pres.step_summary_text = summary |
| |
| return result_pb.RawResult( |
| summary_markdown=summary, |
| status=common_pb.SUCCESS, |
| ) |
| |
| bb_cfg: bb_pb.BuildbucketCfg = self.m.luci_config.buildbucket() |
| |
| builds_to_launch: list[tuple[str, str]] = [] |
| |
| repos: dict[Remote, tuple[str, ...]] = {} |
| |
| with self.m.defer.context() as defer: |
| for bucket in bb_cfg.buckets: |
| if not self.include_bucket(opts, bucket.name): |
| if bucket.swarming.builders: |
| self.m.step( |
| f'excluding {len(bucket.swarming.builders)} ' |
| f'builders in bucket {bucket.name}', |
| None, |
| ) |
| continue |
| |
| with self.m.step.nest(bucket.name) as pres: |
| included = excluded = 0 |
| |
| for builder in bucket.swarming.builders: |
| with self.m.step.nest(builder.name): |
| deferred_result = defer( |
| self.process_builder, |
| bucket_name=bucket.name, |
| builder=builder, |
| repos=repos, |
| max_age_days=opts.max_age_days, |
| num_builds=opts.num_builds, |
| ) |
| if deferred_result.is_ok(): |
| result = deferred_result.result() |
| if result.included: |
| included += 1 |
| else: |
| excluded += 1 |
| builds_to_launch.extend(result.triggers) |
| |
| pres.step_summary_text = ( |
| f'included {included}, excluded {excluded}' |
| ) |
| |
| # These don't help users much but are useful for testing. |
| self.m.step.empty(f'included {included}') |
| self.m.step.empty(f'excluded {excluded}') |
| |
| if not builds_to_launch: |
| self.m.step('nothing to launch', None) |
| return result_pb.RawResult( |
| summary_markdown='nothing to launch', |
| status=common_pb.SUCCESS, |
| ) |
| |
| bb_requests: list[builds_service_pb.ScheduleBuildRequest] = [] |
| |
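            # trigger.remote is a Gitiles URL; derive the Gerrit host from
            # it, then convert a review host (e.g.
            # 'foo-review.googlesource.com') to the matching Gitiles host
            # by dropping the '-review' suffix.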
| for trigger in builds_to_launch: |
| host = self.m.gerrit.host_from_remote_url(trigger.remote) |
| host = host.replace('-review.', '.') |
| |
| bb_requests.append( |
| self.m.buildbucket.schedule_request( |
| bucket=trigger.bucket, |
| builder=trigger.builder, |
| gitiles_commit=common_pb.GitilesCommit( |
| host=host, |
| project=self.m.gerrit.project_from_remote_url( |
| trigger.remote, |
| ), |
| id=trigger.commit, |
| ref=f'refs/heads/{trigger.ref}', |
| ), |
| tags=[ |
| common_pb.StringPair( |
| key='user_agent', |
| value='bisector', |
| ), |
| ], |
| ), |
| ) |
| |
            # Shuffle the requests so that, if we're close to
            # max_triggered_builds, every request has a chance to be
            # scheduled, not just those from alphabetically earlier
            # bucket/builder names.
| self.m.random.shuffle(bb_requests) |
| |
| num_existing_builds = len(state['triggered_builds']) |
| if ( |
| num_existing_builds + len(bb_requests) |
| > opts.max_triggered_builds |
| ): |
| orig_len = len(bb_requests) |
| del bb_requests[ |
| opts.max_triggered_builds - num_existing_builds : |
| ] |
| pres = self.m.step.empty( |
                f'truncated {orig_len - len(bb_requests)} requests'
| ).presentation |
| pres.step_summary_text = ( |
| f'max_triggered_builds = {opts.max_triggered_builds}' |
| ) |
| |
| if opts.dry_run: |
| with self.m.step.nest('dry-run, not launching builds'): |
| self.m.step.empty(f'{len(bb_requests)} requests') |
| |
| links: list[tuple[str, str]] = [] |
| |
| for req in bb_requests: |
| bucket_builder: str = ( |
| f'{req.builder.bucket}/{req.builder.builder}' |
| ) |
| pres = self.m.step.empty(bucket_builder).presentation |
| builder_link = ( |
| 'https://ci.chromium.org/ui/p/' |
| f'{req.builder.project}/builders/{bucket_builder}' |
| ) |
| pres.links['builder'] = builder_link |
| links.append((bucket_builder, builder_link)) |
| |
| commit = req.gitiles_commit |
| pres.links['commit'] = ( |
| f'https://{commit.host}/{commit.project}/+/' |
| f'{commit.id}' |
| ) |
| |
| links_combined: str = ''.join( |
| f'<br/>[{name}]({link})' for name, link in links |
| ) |
| |
| return result_pb.RawResult( |
| summary_markdown=( |
| f'dry-run, would have launched: {links_combined}' |
| ), |
| status=common_pb.SUCCESS, |
| ) |
| |
| with self.m.step.nest('launch') as pres: |
| self.m.step.empty(f'{len(bb_requests)} requests') |
| |
| links: list[tuple[str, str]] = [] |
| |
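                # Schedule all requests in one batch and record the new
                # build IDs so future runs count them against
                # max_triggered_builds.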
| if bb_requests: |
| deferred_result = defer( |
| self.m.buildbucket.schedule, |
| bb_requests, |
| ) |
| if deferred_result.is_ok(): |
| builds: list[build_pb.Build] = deferred_result.result() |
| for build in builds: |
| bucket_builder: str = ( |
| f'{build.builder.bucket}/' |
| f'{build.builder.builder}' |
| ) |
| link: str = self.m.buildbucket.build_url( |
| build_id=build.id, |
| ) |
| pres.links[bucket_builder] = link |
| links.append((bucket_builder, link)) |
| state['triggered_builds'].append(build.id) |
| |
| self.m.builder_state.save(state) |
| |
| links_combined: str = ''.join( |
| f'<br/>[{name}]({link})' for name, link in links |
| ) |
| |
| return result_pb.RawResult( |
| summary_markdown=f'launched: {links_combined}', |
| status=common_pb.SUCCESS, |
| ) |