Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ classifiers = [
]
dependencies = [
"aiohttp>=3",
"aiomemoizettl",
"arrow>=1.0",
"cryptography>=2.6.1",
"dictdiffer",
Expand Down
91 changes: 57 additions & 34 deletions src/scriptworker/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import logging
import re

from aiomemoizettl import memoize_ttl
from github3 import GitHub
from github3.exceptions import GitHubException

Expand Down Expand Up @@ -35,6 +34,9 @@ def __init__(self, owner, repo_name, token=""):
dict: a representation of the repo definition

"""
self._owner = owner
self._repo_name = repo_name
self._token = token
github = retry_sync(GitHub, kwargs={"token": token}, sleeptime_kwargs=_GITHUB_LIBRARY_SLEEP_TIME_KWARGS)
self._github_repository = retry_sync(github.repository, args=(owner, repo_name), sleeptime_kwargs=_GITHUB_LIBRARY_SLEEP_TIME_KWARGS)

Expand Down Expand Up @@ -110,6 +112,11 @@ async def get_tag_hash(self, tag_name):
async def has_commit_landed_on_repository(self, context, revision):
"""Tell if a commit was landed on the repository or if it just comes from a pull request.

This method uses the official GitHub REST API to check if a commit has been merged
into the repository. It queries the /repos/{owner}/{repo}/commits/{sha}/pulls endpoint
to determine if the commit is associated with any merged pull requests, or if it was
directly committed to the repository.

Args:
context (scriptworker.context.Context): the scriptworker context.
revision (str): the commit hash or the tag name.
Expand All @@ -118,42 +125,58 @@ async def has_commit_landed_on_repository(self, context, revision):
bool: True if the commit is present in one of the branches of the main repository

"""
if any(vcs_rule.get("require_secret") for vcs_rule in context.config["trusted_vcs_rules"]):
# This check uses unofficial API on github, which we can't easily
# check for private repos, assume its true in the private case.
log.info("has_commit_landed_on_repository() not implemented for private" "repositories, assume True")
return True

# Revision may be a tag name. `branch_commits` doesn't work on tags
# Revision may be a tag name. Convert tags to commit hashes
if not _is_git_full_hash(revision):
revision = await self.get_tag_hash(tag_name=revision)

html_text = await _fetch_github_branch_commits_data(context, self._github_repository.html_url, revision)

# https://github.com/{repo_owner}/{repo_name}/branch_commits/{revision} just returns some \n
# when the commit hasn't landed on the origin repo. Otherwise, some HTML data is returned - it
# represents the branches on which the given revision is present.
return html_text != ""


# TODO Use memoize_ttl() as decorator once https://github.com/michalc/aiomemoizettl/issues/2 is done
async def _fetch_github_branch_commits_data_helper(context, repo_html_url, revision):
url = "/".join((repo_html_url.rstrip("/"), "branch_commits", revision))
log.info('Cache does not exist for URL "{}" (in this context), fetching it...'.format(url))
html_text = await retry_request(context, url)
return html_text.strip()


# XXX memoize_ttl() uses all function parameters to create a key that stores its cache.
# This means new contexts cannot use the memoized value, even though they're calling the same
# repo and revision. jlorenzo tried to take the context out of the memoize_ttl() call, but
# whenever the cache is invalidated, request() doesn't work anymore because the session carried
# by the context has been long closed.
# Therefore, the defined TTL has 2 purposes:
# a. it memoizes calls for the time of a single cot_verify() run
# b. it clears the cache automatically, so we don't have to manually invalidate it.
_BRANCH_COMMITS_CACHE_TTL_IN_SECONDS = 10 * 60 # 10 minutes
_fetch_github_branch_commits_data = memoize_ttl(_fetch_github_branch_commits_data_helper, get_ttl=lambda _: _BRANCH_COMMITS_CACHE_TTL_IN_SECONDS)
# Use the official GitHub REST API to check if commit has landed
# GET /repos/{owner}/{repo}/commits/{sha}/pulls
api_url = f"https://api.github.com/repos/{self._owner}/{self._repo_name}/commits/{revision}/pulls"

headers = {
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}

# Add authentication if token is available
if self._token:
headers["Authorization"] = f"Bearer {self._token}"

log.info(f"Checking if commit {revision} has landed on {self._owner}/{self._repo_name}")

try:
# Query the GitHub API for pull requests associated with this commit
pull_requests = await retry_request(
context, api_url, return_type="json", headers=headers, good=(200, 404) # 404 means commit doesn't exist in repo at all
)

# If we got a 404, the commit doesn't exist in this repository
# This is handled by checking the type of response
if isinstance(pull_requests, dict) and pull_requests.get("message") == "Not Found":
log.info(f"Commit {revision} not found in repository")
return False

# If no pull requests are associated with this commit, it was directly committed
# to the repository (not from a PR), so it has landed
if not pull_requests:
log.info(f"Commit {revision} was directly committed (no associated PRs)")
return True

# If there are pull requests, check if any of them are merged
# A merged PR means the commit has landed on the main repository
has_merged_pr = any(pr.get("merged_at") is not None for pr in pull_requests)

if has_merged_pr:
log.info(f"Commit {revision} has landed via merged PR")
else:
log.info(f"Commit {revision} only exists in unmerged PRs")

return has_merged_pr

except Exception as e:
log.error(f"Error checking if commit {revision} has landed: {e}")
# In case of errors, assume the commit hasn't landed to be safe
return False


def is_github_url(url):
Expand Down
122 changes: 25 additions & 97 deletions tests/test_github.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import asyncio
from copy import copy
from types import SimpleNamespace
from unittest.mock import MagicMock, patch

Expand All @@ -8,20 +6,7 @@
from scriptworker import github
from scriptworker.exceptions import ConfigError

_CACHE = {}


@pytest.fixture(scope="session", autouse=True)
async def mock_memoized_func():
# Memoize_ttl causes an issue with pytest-asyncio, so we copy the behavior to an in-memory cache
async def fetch(*args, **kwargs):
key = (args, tuple(kwargs.items()))
if key not in _CACHE:
_CACHE[key] = await github._fetch_github_branch_commits_data_helper(*args, **kwargs)
return _CACHE[key]

with patch("scriptworker.github._fetch_github_branch_commits_data", fetch):
yield
# No longer need caching/memoization mocks since we're using the official GitHub API


@pytest.fixture(scope="function")
Expand Down Expand Up @@ -147,66 +132,57 @@ async def test_get_tag_hash(github_repository, tags, raises, expected):


@pytest.mark.parametrize(
"commitish, expected_url, html_text, raises, expected",
"commitish, expected_url, api_response, raises, expected",
(
# Commit with only unmerged PRs - should return False
(
"0129abcdef012345643456789abcdef012345678",
"https://github.com/some-user/some-repo/branch_commits/0129abcdef012345643456789abcdef012345678",
"\r\n\r\n",
"https://api.github.com/repos/some-user/some-repo/commits/0129abcdef012345643456789abcdef012345678/pulls",
[{"id": 123, "merged_at": None}], # Unmerged PR
False,
False,
),
# Commit with no associated PRs - direct commit, should return True
(
"0f0123456789abcdef012123456789abcde34565",
"https://github.com/some-user/some-repo/branch_commits/0f0123456789abcdef012123456789abcde34565",
"\n",
False,
"https://api.github.com/repos/some-user/some-repo/commits/0f0123456789abcdef012123456789abcde34565/pulls",
[], # No PRs = direct commit
False,
True,
),
# Commit that doesn't exist (404) - should return False
(
"0123456789abcdef0123456789abcdef01234568",
"https://github.com/some-user/some-repo/branch_commits/0123456789abcdef0123456789abcdef01234568",
"",
"https://api.github.com/repos/some-user/some-repo/commits/0123456789abcdef0123456789abcdef01234568/pulls",
{"message": "Not Found"}, # 404 response
False,
False,
),
# Commit with merged PR - should return True
(
"06789abcdf0123ef01123456789abcde45234569",
"https://github.com/some-user/some-repo/branch_commits/06789abcdf0123ef01123456789abcde45234569",
"""


<svg class="octicon octicon-git-branch" viewBox="0 0 10 16" version="1.1" width="10" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M10 5c0-1.11-.89-2-2-2a1.993 1.993 0 0 0-1 3.72v.3c-.02.52-.23.98-.63 1.38-.4.4-.86.61-1.38.63-.83.02-1.48.16-2 .45V4.72a1.993 1.993 0 0 0-1-3.72C.88 1 0 1.89 0 3a2 2 0 0 0 1 1.72v6.56c-.59.35-1 .99-1 1.72 0 1.11.89 2 2 2 1.11 0 2-.89 2-2 0-.53-.2-1-.53-1.36.09-.06.48-.41.59-.47.25-.11.56-.17.94-.17 1.05-.05 1.95-.45 2.75-1.25S8.95 7.77 9 6.73h-.02C9.59 6.37 10 5.73 10 5zM2 1.8c.66 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2C1.35 4.2.8 3.65.8 3c0-.65.55-1.2 1.2-1.2zm0 12.41c-.66 0-1.2-.55-1.2-1.2 0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2zm6-8c-.66 0-1.2-.55-1.2-1.2 0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2z"/></svg>
<ul class="branches-list">
<li class="branch"><a href="/some-user/some-repo">master</a></li>
<li class="pull-request">(<a title="Merged Pull Request: [glean] Create a way for glean test API functions to await async IO operations" href="/some-user/some-repo/pull/1234">#1234</a>)</li>
</ul>
""",
"https://api.github.com/repos/some-user/some-repo/commits/06789abcdf0123ef01123456789abcde45234569/pulls",
[{"id": 1234, "merged_at": "2021-01-01T00:00:00Z"}], # Merged PR
False,
True,
),
# Tag that resolves to a commit with merged PR - should return True
(
"v1.0.0",
"https://github.com/some-user/some-repo/branch_commits/hashforv100",
"""


<svg class="octicon octicon-git-branch" viewBox="0 0 10 16" version="1.1" width="10" height="16" aria-hidden="true"><path fill-rule="evenodd" d="M10 5c0-1.11-.89-2-2-2a1.993 1.993 0 0 0-1 3.72v.3c-.02.52-.23.98-.63 1.38-.4.4-.86.61-1.38.63-.83.02-1.48.16-2 .45V4.72a1.993 1.993 0 0 0-1-3.72C.88 1 0 1.89 0 3a2 2 0 0 0 1 1.72v6.56c-.59.35-1 .99-1 1.72 0 1.11.89 2 2 2 1.11 0 2-.89 2-2 0-.53-.2-1-.53-1.36.09-.06.48-.41.59-.47.25-.11.56-.17.94-.17 1.05-.05 1.95-.45 2.75-1.25S8.95 7.77 9 6.73h-.02C9.59 6.37 10 5.73 10 5zM2 1.8c.66 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2C1.35 4.2.8 3.65.8 3c0-.65.55-1.2 1.2-1.2zm0 12.41c-.66 0-1.2-.55-1.2-1.2 0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2zm6-8c-.66 0-1.2-.55-1.2-1.2 0-.65.55-1.2 1.2-1.2.65 0 1.2.55 1.2 1.2 0 .65-.55 1.2-1.2 1.2z"/></svg>
<ul class="branches-list">
<li class="branch"><a href="/some-user/some-repo">master</a></li>
<li class="pull-request">(<a title="Merged Pull Request: [glean] Create a way for glean test API functions to await async IO operations" href="/some-user/some-repo/pull/1234">#1234</a>)</li>
</ul>
""",
"https://api.github.com/repos/some-user/some-repo/commits/hashforv100/pulls",
[{"id": 1234, "merged_at": "2021-01-01T00:00:00Z"}], # Merged PR
False,
True,
),
("non-existing-tag", None, "", True, None),
# Non-existing tag - should raise ValueError
("non-existing-tag", None, None, True, None),
),
)
async def test_has_commit_landed_on_repository(context, github_repository, commitish, expected_url, html_text, raises, expected):
async def retry_request(_, url):
assert url == expected_url
return html_text
async def test_has_commit_landed_on_repository(context, github_repository, commitish, expected_url, api_response, raises, expected):
async def retry_request(_, url, **kwargs):
if expected_url:
assert url == expected_url
return api_response

with patch("scriptworker.github.retry_request", retry_request):
if raises:
Expand All @@ -216,54 +192,6 @@ async def retry_request(_, url):
assert await github_repository.has_commit_landed_on_repository(context, commitish) == expected


@pytest.mark.asyncio
async def test_has_commit_landed_on_repository_private(vpn_context, github_repository):
"""For private repos we don't actually query against github.

The API used is not formally available and needs authorization on private repos, but isn't clearly useful
to try to use for this case.
"""
commitish = "06789abcdf0123ef01123456789abcde45234569"

async def retry_request(_, url):
assert False, "We should never have made a request"

assert await github_repository.has_commit_landed_on_repository(vpn_context, commitish) == True


@pytest.mark.asyncio
async def test_has_commit_landed_on_repository_cache(context, vpn_context, github_repository):
global retry_request_call_count
retry_request_call_count = 0

async def _counter(*args, **kwargs):
global retry_request_call_count
retry_request_call_count += 1
return ""

with patch("scriptworker.github.retry_request", _counter):
await asyncio.gather(
*[
github_repository.has_commit_landed_on_repository(context, "0129abcdef012345643456789abcdef012345678"),
github_repository.has_commit_landed_on_repository(context, "0129abcdef012345643456789abcdef012345678"),
github_repository.has_commit_landed_on_repository(context, "0129abcdef012345643456789abcdef012345678"),
github_repository.has_commit_landed_on_repository(context, "0129abcdef012345643456789abcdef012345678"),
]
)
# Even though all calls were fired at once, just a single call was made
assert retry_request_call_count == 1

await github_repository.has_commit_landed_on_repository(context, "456789abcdef0123456780129abcdef012345643")
# New commit means new request
assert retry_request_call_count == 2

different_context = copy(context)
different_context.task = {"taskGroupId": "someOtherTaskId"}
await github_repository.has_commit_landed_on_repository(different_context, "456789abcdef0123456780129abcdef012345643")
# New context means new request too
assert retry_request_call_count == 3


@pytest.mark.parametrize(
"url, expected",
(
Expand Down
16 changes: 0 additions & 16 deletions tests/test_production.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@
import os
import re
import tempfile
from unittest.mock import patch

import pytest
from asyncio_extras.contextmanager import async_contextmanager
from taskcluster.aio import Index, Queue, Secrets

import scriptworker.log as swlog
import scriptworker.utils as utils
from scriptworker import github
from scriptworker.config import apply_product_config, get_unfrozen_copy, read_worker_creds
from scriptworker.constants import DEFAULT_CONFIG
from scriptworker.context import Context
Expand All @@ -24,20 +22,6 @@

# constants helpers and fixtures {{{1
pytestmark = [pytest.mark.skipif(os.environ.get("NO_TESTS_OVER_WIRE"), reason="NO_TESTS_OVER_WIRE: skipping production CoT verification test")]
_CACHE = {}


@pytest.fixture(scope="session", autouse=True)
async def mock_memoized_func():
# Memoize_ttl causes an issue with pytest-asyncio, so we copy the behavior to an in-memory cache
async def fetch(*args, **kwargs):
key = (args, tuple(kwargs.items()))
if key not in _CACHE:
_CACHE[key] = await github._fetch_github_branch_commits_data_helper(*args, **kwargs)
return _CACHE[key]

with patch("scriptworker.github._fetch_github_branch_commits_data", fetch):
yield


def read_integration_creds():
Expand Down