blob: e6ac235a4569100a0e84ad860378df9cf0de3883 [file] [log] [blame]
# Copyright 2025 The Pigweed Authors
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
"""Bisect build failures without storing state.
Look at builders that track a specific repository. Look at builds of those
builders by the commits that triggered them (so "most recent" refers not to when
the build ran but when the commit used to trigger the build was submitted). If
the most recent builds failed but less recent builds passed, trigger a build
with a commit in between the oldest failure and the most recent pass.
If there's already a build running (or scheduled) with that commit, do nothing.
Once that running build completes, the next bisector run will have different
parameters and will take the next step in the bisection.
Use buildbucket instead of luci-scheduler to trigger builds so bisection builds
won't get combined with gitiles poller-triggered builds. (Luci-scheduler will
take many triggers that happened in a short window and combine them to trigger
just one build, so bisector builders might be combined with a build created
because of a newly submitted change.)
This is related to the rerunner recipe, but has a different purpose. Rerunner
builders make builders green when they fail because of a flake, or confirm that
the failure is not a flake. Bisector builders attribute a new non-flake failure
to a specific commit.
"""
from __future__ import annotations
import collections
import dataclasses
import datetime
import itertools
import fnmatch
import json
from typing import Generator, Sequence
import urllib
from PB.recipe_engine import result as result_pb
from PB.go.chromium.org.luci.buildbucket.proto import (
build as build_pb,
builds_service as builds_service_pb,
common as common_pb,
project_config as bb_pb,
)
from PB.recipe_modules.pigweed.bisector.options import (
Options as BisectorOptions,
)
from PB.recipe_modules.pigweed.checkout.options import (
Options as CheckoutOptions,
)
from recipe_engine import post_process, recipe_api, recipe_test_api
@dataclasses.dataclass(frozen=True)
class Remote:
    """A git remote: repository URL plus branch name.

    Frozen so instances are hashable and can be used as dict keys (see the
    'triggers' and 'repos' mappings in BisectorApi).
    """

    # Repository URL, e.g. 'https://<host>/<project>'.
    url: str
    # Branch name with any 'refs/heads/' prefix removed.
    branch: str
@dataclasses.dataclass
class Trigger:
    """Parameters of one bisection build to schedule via buildbucket."""

    # Buildbucket bucket name.
    bucket: str
    # Builder name within the bucket.
    builder: str
    # Remote URL of the repository containing the commit.
    remote: str
    # Branch name without the 'refs/heads/' prefix (it is re-added when the
    # ScheduleBuildRequest is created).
    ref: str
    # Full hash of the commit to build at.
    commit: str
@dataclasses.dataclass
class ProcessBuilderResult:
    """Outcome of BisectorApi.process_builder for a single builder."""

    # True once the builder passed the "dominant triggering remote" check,
    # regardless of whether a bisection build ended up being requested.
    included: bool = False
    # Builds that should be scheduled to advance the bisection (at most one
    # per call to process_builder).
    triggers: list[Trigger] = dataclasses.field(default_factory=list)
@dataclasses.dataclass
class AgeCommit:
    """A commit paired with its index into a newest-first gitiles log."""

    # Index into the log: 0 is the most recent commit, larger is older.
    age: int
    # Full commit hash.
    commit: str
class BisectorApi(recipe_api.RecipeApi):
def include_bucket(self, props: InputProperties, bucket: str) -> bool:
if not props.excluded_buckets and not props.included_buckets:
props.included_buckets.append('*.ci')
props.included_buckets.append('ci')
for excluded_bucket in props.excluded_buckets:
if fnmatch.fnmatch(bucket, excluded_bucket):
return False
for included_bucket in props.included_buckets:
if fnmatch.fnmatch(bucket, included_bucket):
return True
return False
    def process_builder(
        self,
        bucket_name: str,
        builder: bb_pb.BuilderConfig,
        repos: dict[Remote, list[str]],
        max_age_days: int,
        num_builds: int,
    ) -> ProcessBuilderResult:
        """Decide whether one builder needs a bisection build.

        Examines the builder's recent build history, determines the dominant
        triggering repository, and — when recent builds failed but an older
        one passed — picks the commit halfway between the oldest failure and
        the newest pass as the next bisection point.

        Args:
            bucket_name: Name of the bucket containing this builder.
            builder: Builder config from the buildbucket project config.
            repos: Cache of gitiles logs keyed by Remote, shared across
                calls. Populated on demand by this method.
            max_age_days: Ignore builds older than this many days.
            num_builds: Number of recent builds to examine.

        Returns:
            A ProcessBuilderResult; its 'triggers' list contains at most one
            build to schedule.
        """
        result = ProcessBuilderResult()
        if not builder.properties:
            self.m.step.empty('no properties')
            return result
        build_props = json.loads(builder.properties)
        # Builders can opt out of bisection via their input properties.
        if build_props.get('do_not_bisect'):
            self.m.step.empty('do not bisect')
            return result
        # Don't DoS buildbucket. (And there's no need for this
        # builder to run quickly.)
        self.m.time.sleep(0.5)
        # NOTE(review): BuilderStatus is not imported in this module; the
        # annotation only works because annotations are lazy here
        # (from __future__ import annotations).
        status: BuilderStatus = self.m.builder_status.retrieve(
            bucket=bucket_name,
            builder=builder.name,
            n=num_builds,
            max_age=datetime.timedelta(days=max_age_days),
        )
        # Count how many recent builds each remote/branch triggered.
        triggers: dict[Remote, int] = {}
        for build in status.builds:
            if commit := build.input.gitiles_commit:
                # Only consider branch-triggered builds.
                if not commit.ref.startswith('refs/heads/'):
                    continue
                remote = Remote(
                    f'https://{commit.host}/{commit.project}',
                    commit.ref.removeprefix('refs/heads/'),
                )
                triggers.setdefault(remote, 0)
                triggers[remote] += 1
        # Pick the remote that triggered the most builds.
        maxcount, maxremote = 0, None
        for remote, count in triggers.items():
            pres = self.m.step.empty(f'remote {remote}').presentation
            pres.step_summary_text = f'count {count}'
            if remote and count > maxcount:
                maxcount = count
                maxremote = remote
        if not maxremote:
            pres = self.m.step.empty('no remote').presentation
            pres.step_summary_text = repr(triggers)
            return result
        # If there aren't many triggers from one repository then we have a
        # builder being triggered from multiple repositories or we have
        # something that has been rerun many times by the rerunner and the
        # bisector is unlikely to improve it further. (Bisector-launched builds
        # have triggering commits but rerunner-launched builds do not.)
        if maxcount < len(status.builds) / 4:
            self.m.step.empty('count too low')
            return result
        remote = maxremote
        result.included = True
        # If the builder hasn't recently passed, we don't have a
        # known good commit and can't bisect.
        if not self.m.builder_status.has_recently_passed(status):
            self.m.step.empty('no recent passes')
            return result
        # If five or more of the recent builds are infra failures,
        # don't trigger more builds.
        infra_failures = [
            x for x in status.builds if x.status == common_pb.INFRA_FAILURE
        ]
        if len(infra_failures) >= 5:
            self.m.step.empty('too many infra failures')
            return result
        if remote not in repos:
            # Avoid hitting gitiles quota. (And there's no need for
            # this builder to run quickly.)
            self.m.time.sleep(10.0)

            def gitiles_log():
                # Fetch recent history for the branch (newest first).
                with self.m.time.timeout(30):
                    return self.m.gitiles.log(
                        url=remote.url,
                        treeish=remote.branch,
                        limit=200,
                        test_data=[
                            {'id': self.test_api.commit40(x)}
                            for x in range(200)
                        ],
                    )

            # If the gitiles query fails it's likely a quota issue. Wait
            # two minutes and try again. Only retry once since this could
            # be a permissions issue.
            try:
                log = gitiles_log()
            except self.m.step.StepFailure:  # pragma: no cover
                self.m.time.sleep(2 * 60.0)
                log = gitiles_log()
            repos[remote] = [x['id'] for x in log]
        # Group each ended build's status by its triggering commit. A commit
        # can have multiple builds (and thus multiple statuses).
        statuses: dict[str, list[common_pb.Status]] = {}
        for build in status.builds:
            if (
                build.status & common_pb.ENDED_MASK
                and build.input.gitiles_commit
            ):
                commit = build.input.gitiles_commit.id
                statuses.setdefault(commit, [])
                statuses[commit].append(build.status)
        passes = {
            commit
            for commit, status in statuses.items()
            if common_pb.SUCCESS in status
        }
        failures = {
            commit
            for commit, status in statuses.items()
            if common_pb.FAILURE in status
        }
        # Walk the log newest-to-oldest, recording failures until the first
        # (i.e., newest) passing commit is found.
        oldest_failure: AgeCommit | None = None
        newest_passing: AgeCommit | None = None
        for i, commit in enumerate(repos[remote]):
            if commit in passes:
                newest_passing = AgeCommit(i, commit)
                break
            if commit in failures:
                oldest_failure = AgeCommit(i, commit)

        def _vars(obj):
            # vars() raises on None (and anything else without __dict__);
            # pass such values through unchanged for logging.
            if hasattr(obj, '__dict__'):
                return vars(obj)
            return obj

        if oldest_failure is None or newest_passing is None:
            pres = self.m.step.empty('no need to bisect').presentation
            pres.step_summary_text = repr(
                dict(
                    oldest_failure=_vars(oldest_failure),
                    newest_passing=_vars(newest_passing),
                )
            )
            return result
        # Output properties are exposed for debugging/testing of the
        # bisector itself, nested by bucket and builder name.
        output_props = {
            'newest_passing': _vars(newest_passing),
            'oldest_failure': _vars(oldest_failure),
            'attributed': False,
            'scheduling': False,
        }
        wrapped_props = {bucket_name: {builder.name: output_props}}
        # Midpoint between the oldest failure (smaller index, newer) and the
        # newest pass (larger index, older).
        i = oldest_failure.age + (newest_passing.age - oldest_failure.age) // 2
        if i in (newest_passing.age, oldest_failure.age):
            # No commits remain strictly between the newest pass and the
            # oldest failure: the failure is fully attributed.
            self.m.step.empty('already attributed')
            output_props['attributed'] = True
            self.m.step.empty('props').presentation.properties = wrapped_props
            return result
        with self.m.step.nest('picked') as pres:
            commit = repos[remote][i]
            self.m.step.empty(str(i))
            pres.logs[commit[0:7]] = f'{remote.url}/+/{commit}'
            self.m.step.empty(commit)
        output_props['picked'] = _vars(AgeCommit(i, commit))
        output_props['already_scheduled'] = False
        # If a build at this commit is already scheduled or running, wait for
        # it to finish instead of scheduling a duplicate.
        for build in status.builds:
            if (
                build.status
                in (
                    common_pb.SCHEDULED,
                    common_pb.STARTED,
                )
                and build.input.gitiles_commit
                and build.input.gitiles_commit.id == commit
            ):
                self.m.step.empty('already scheduled')
                output_props['already_scheduled'] = True
                pres = self.m.step.empty('props').presentation
                pres.properties = wrapped_props
                return result
        self.m.step.empty('scheduling')
        result.triggers.append(
            Trigger(
                bucket=bucket_name,
                builder=builder.name,
                remote=remote.url,
                ref=remote.branch,
                commit=commit,
            )
        )
        output_props['scheduling'] = True
        self.m.step.empty('props').presentation.properties = wrapped_props
        return result
    def __call__(self, opts: BisectorOptions) -> result_pb.RawResult:
        """Run one stateless bisection pass over all configured builders.

        Args:
            opts: Bisector options. Zero-valued fields are replaced with
                defaults below.

        Returns:
            A RawResult summarizing what was (or would have been) launched.
        """
        # Never actually schedule builds from dev or try invocations.
        opts.dry_run = opts.dry_run or self.m.buildbucket_util.is_dev_or_try
        opts.max_age_days = opts.max_age_days or 7
        opts.num_builds = opts.num_builds or 15
        # The 999999999 value is not meant to be a limit that's actually
        # reached, it's just there to remove the need to check if
        # opts.max_triggered_builds is zero.
        opts.max_triggered_builds = opts.max_triggered_builds or 999999999
        # Drop completed builds from the persisted list of triggered builds
        # so only still-pending ones count against max_triggered_builds.
        state = self.m.builder_state.fetch_previous_state()
        state.setdefault('triggered_builds', [])
        builds = self.m.buildbucket.get_multi(state['triggered_builds'])
        changed = False
        for bbid, build in builds.items():
            if build.output.status & common_pb.ENDED_MASK:
                state['triggered_builds'].remove(bbid)
                changed = True
        if changed:
            self.m.builder_state.save(state)
        if len(state['triggered_builds']) >= opts.max_triggered_builds:
            summary = (
                f'Already running {len(state["triggered_builds"])} bisection '
                'builds.'
            )
            pres = self.m.step.empty('max triggered builds').presentation
            pres.step_summary_text = summary
            return result_pb.RawResult(
                summary_markdown=summary,
                status=common_pb.SUCCESS,
            )
        bb_cfg: bb_pb.BuildbucketCfg = self.m.luci_config.buildbucket()
        builds_to_launch: list[Trigger] = []
        # Gitiles log cache shared by all process_builder calls.
        repos: dict[Remote, list[str]] = {}
        with self.m.defer.context() as defer:
            for bucket in bb_cfg.buckets:
                if not self.include_bucket(opts, bucket.name):
                    if bucket.swarming.builders:
                        self.m.step(
                            f'excluding {len(bucket.swarming.builders)} '
                            f'builders in bucket {bucket.name}',
                            None,
                        )
                    continue
                with self.m.step.nest(bucket.name) as pres:
                    included = excluded = 0
                    for builder in bucket.swarming.builders:
                        with self.m.step.nest(builder.name):
                            # Defer failures so one bad builder doesn't stop
                            # processing of the rest.
                            deferred_result = defer(
                                self.process_builder,
                                bucket_name=bucket.name,
                                builder=builder,
                                repos=repos,
                                max_age_days=opts.max_age_days,
                                num_builds=opts.num_builds,
                            )
                            if deferred_result.is_ok():
                                result = deferred_result.result()
                                if result.included:
                                    included += 1
                                else:
                                    excluded += 1
                                builds_to_launch.extend(result.triggers)
                    pres.step_summary_text = (
                        f'included {included}, excluded {excluded}'
                    )
                    # These don't help users much but are useful for testing.
                    self.m.step.empty(f'included {included}')
                    self.m.step.empty(f'excluded {excluded}')
            if not builds_to_launch:
                self.m.step('nothing to launch', None)
                return result_pb.RawResult(
                    summary_markdown='nothing to launch',
                    status=common_pb.SUCCESS,
                )
            bb_requests: list[builds_service_pb.ScheduleBuildRequest] = []
            for trigger in builds_to_launch:
                host = self.m.gerrit.host_from_remote_url(trigger.remote)
                # Convert the gerrit review host to the gitiles host.
                host = host.replace('-review.', '.')
                bb_requests.append(
                    self.m.buildbucket.schedule_request(
                        bucket=trigger.bucket,
                        builder=trigger.builder,
                        gitiles_commit=common_pb.GitilesCommit(
                            host=host,
                            project=self.m.gerrit.project_from_remote_url(
                                trigger.remote,
                            ),
                            id=trigger.commit,
                            ref=f'refs/heads/{trigger.ref}',
                        ),
                        tags=[
                            common_pb.StringPair(
                                key='user_agent',
                                value='bisector',
                            ),
                        ],
                    ),
                )
            # Shuffle requests so if we're close to max_triggered_builds all
            # requests have a chance to be scheduled, not just those that come
            # from alphabetically earlier bucket/builder names.
            self.m.random.shuffle(bb_requests)
            num_existing_builds = len(state['triggered_builds'])
            if (
                num_existing_builds + len(bb_requests)
                > opts.max_triggered_builds
            ):
                orig_len = len(bb_requests)
                # Keep only as many requests as the limit still allows.
                del bb_requests[
                    opts.max_triggered_builds - num_existing_builds :
                ]
                pres = self.m.step.empty(
                    f'truncated {orig_len-len(bb_requests)} requests'
                ).presentation
                pres.step_summary_text = (
                    f'max_triggered_builds = {opts.max_triggered_builds}'
                )
            if opts.dry_run:
                # Present what would have been launched, but schedule
                # nothing.
                with self.m.step.nest('dry-run, not launching builds'):
                    self.m.step.empty(f'{len(bb_requests)} requests')
                    links: list[tuple[str, str]] = []
                    for req in bb_requests:
                        bucket_builder: str = (
                            f'{req.builder.bucket}/{req.builder.builder}'
                        )
                        pres = self.m.step.empty(bucket_builder).presentation
                        builder_link = (
                            'https://ci.chromium.org/ui/p/'
                            f'{req.builder.project}/builders/{bucket_builder}'
                        )
                        pres.links['builder'] = builder_link
                        links.append((bucket_builder, builder_link))
                        commit = req.gitiles_commit
                        pres.links['commit'] = (
                            f'https://{commit.host}/{commit.project}/+/'
                            f'{commit.id}'
                        )
                links_combined: str = ''.join(
                    f'<br/>[{name}]({link})' for name, link in links
                )
                return result_pb.RawResult(
                    summary_markdown=(
                        f'dry-run, would have launched: {links_combined}'
                    ),
                    status=common_pb.SUCCESS,
                )
            with self.m.step.nest('launch') as pres:
                self.m.step.empty(f'{len(bb_requests)} requests')
                links: list[tuple[str, str]] = []
                if bb_requests:
                    deferred_result = defer(
                        self.m.buildbucket.schedule,
                        bb_requests,
                    )
                    if deferred_result.is_ok():
                        builds: list[build_pb.Build] = deferred_result.result()
                        for build in builds:
                            bucket_builder: str = (
                                f'{build.builder.bucket}/'
                                f'{build.builder.builder}'
                            )
                            link: str = self.m.buildbucket.build_url(
                                build_id=build.id,
                            )
                            pres.links[bucket_builder] = link
                            links.append((bucket_builder, link))
                            # Record launched builds so future runs count
                            # them against max_triggered_builds.
                            state['triggered_builds'].append(build.id)
                        self.m.builder_state.save(state)
            links_combined: str = ''.join(
                f'<br/>[{name}]({link})' for name, link in links
            )
            return result_pb.RawResult(
                summary_markdown=f'launched: {links_combined}',
                status=common_pb.SUCCESS,
            )