Resolve LFS download URLs with a pinned git-lfs build (`git-lfs ls-urls --json`).

Review notes (free text before the first `diff --git` header; patch tools skip it):
* MODULE.bazel: declares the `git-lfs` repository through the new `git_lfs_binary`
  repository rule, pinning a release version and one sha256 per platform.
* misc/bazel/internal/git_lfs_probe.py: drops the hand-rolled endpoint discovery,
  authentication and batch-API request. With `--git-lfs <binary>` the script runs
  `<binary> ls-urls --json <paths>` from the repository top level and prints
  `<oid> <url>` for each LFS pointer and `local` for every other file; with
  `--hash-only` it prints just `<oid>` for pointers. Exactly one of the two
  options is required.
* misc/bazel/lfs.bzl: passes the downloaded binary to the probe script and adds the
  `git_lfs_binary` repository rule, which downloads one executable per host platform.

Changes made during review:
* get_locations keeps the early return when no source is an LFS pointer, so git-lfs
  is never invoked with an empty path list (second hunk of git_lfs_probe.py now has
  40 lines on the new side instead of 37).
* _lfs_binary_impl also matches `x86_64`/`amd64` when selecting the Intel macOS
  binary; comparing against `x86` alone sends Intel macs to the arm64 download
  (last hunk of lfs.bzl now has 50 lines on the new side instead of 48).
* Line breaks and indentation were reconstructed from a whitespace-collapsed copy of
  this patch: verify context and removed lines against the tree before applying.
  The post-image blob ids on the `index` lines predate the two changes above.

diff --git a/MODULE.bazel b/MODULE.bazel
index 2e8ed2e3d47a..75ba581bd85c 100644
--- a/MODULE.bazel
+++ b/MODULE.bazel
@@ -166,6 +166,21 @@ go_deps = use_extension("@gazelle//:extensions.bzl", "go_deps")
 go_deps.from_file(go_mod = "//go/extractor:go.mod")
 use_repo(go_deps, "org_golang_x_mod", "org_golang_x_tools")
 
+git_lfs_binary = use_repo_rule("//misc/bazel:lfs.bzl", "git_lfs_binary")
+
+# to update, check out dsp-testing/codeql-git-lfs, do changes there, and push a tag with
+# `git tag $(git describe)-ls-urls && git push --tags`
+# then wait for https://github.com/dsp-testing/codeql-git-lfs/actions/runs/11800398535 to end,
+# then copy here information from https://github.com/dsp-testing/codeql-git-lfs/releases/latest
+git_lfs_binary(
+    name = "git-lfs",
+    sha256_linux = "08b75033a98f77f7e60b0928e160a6f0a5c5cd9d91b8605537969eec6980219a",
+    sha256_macos_arm64 = "8a17c488c975dbd050610a0b2692567064dbfef33b6c58ee89ea02f649cc0114",
+    sha256_macos_x86 = "9fc7265c5345901ca5cb83707ed5374fc6dfbf7ed45d2c047d5929bfe0b5f64a",
+    sha256_windows = "ef2f5794667584b155786291d4f839c59bfe10fcc5f870902c64f3063ffd9923",
+    version = "v3.5.0-179-gfd031ea1",
+)
+
 lfs_files = use_repo_rule("//misc/bazel:lfs.bzl", "lfs_files")
 
 lfs_files(
diff --git a/misc/bazel/internal/git_lfs_probe.py b/misc/bazel/internal/git_lfs_probe.py
index 57f0fd5b8bb0..33a9896c8e73 100755
--- a/misc/bazel/internal/git_lfs_probe.py
+++ b/misc/bazel/internal/git_lfs_probe.py
@@ -24,39 +24,19 @@ import argparse
 
 
 def options():
+    def resolved_path(path):
+        return pathlib.Path(path).expanduser().resolve()
     p = argparse.ArgumentParser(description=__doc__)
-    p.add_argument("--hash-only", action="store_true")
-    p.add_argument("sources", type=pathlib.Path, nargs="+")
-    return p.parse_args()
-
-
-TIMEOUT = 20
-
-def warn(message: str) -> None:
-    print(f"WARNING: {message}", file=sys.stderr)
-
-
-@dataclass
-class Endpoint:
-    name: str
-    href: str
-    ssh: typing.Optional[str] = None
-    headers: typing.Dict[str, str] = dataclasses.field(default_factory=dict)
-
-    def update_headers(self, d: typing.Iterable[typing.Tuple[str, str]]):
-        self.headers.update((k.capitalize(), v) for k, v in d)
-
-
-class NoEndpointsFound(Exception):
-    pass
-
-
-opts = options()
-sources = [p.resolve() for p in opts.sources]
-source_dir = pathlib.Path(os.path.commonpath(src.parent for src in sources))
-source_dir = subprocess.check_output(
-    ["git", "rev-parse", "--show-toplevel"], cwd=source_dir, text=True
-).strip()
+    excl = p.add_mutually_exclusive_group(required=True)
+    excl.add_argument("--hash-only", action="store_true")
+    excl.add_argument("--git-lfs", type=resolved_path)
+    p.add_argument("sources", type=resolved_path, nargs="+")
+    opts = p.parse_args()
+    source_dir = pathlib.Path(os.path.commonpath(src.parent for src in opts.sources))
+    opts.source_dir = subprocess.check_output(
+        ["git", "rev-parse", "--show-toplevel"], cwd=source_dir, text=True
+    ).strip()
+    return opts
 
 
 def get_env(s: str, sep: str = "=") -> typing.Iterable[typing.Tuple[str, str]]:
@@ -64,161 +44,40 @@ def get_env(s: str, sep: str = "=") -> typing.Iterable[typing.Tuple[str, str]]:
         yield m.groups()
 
 
-def git(*args, **kwargs):
-    proc = subprocess.run(
-        ("git",) + args, stdout=subprocess.PIPE, text=True, cwd=source_dir, **kwargs
-    )
-    return proc.stdout.strip() if proc.returncode == 0 else None
-
-
-endpoint_re = re.compile(r"^Endpoint(?: \((.*)\))?$")
-
-
-def get_endpoint_addresses() -> typing.Iterable[Endpoint]:
-    """Get all lfs endpoints, including SSH if present"""
-    lfs_env_items = get_env(
-        subprocess.check_output(["git", "lfs", "env"], text=True, cwd=source_dir)
-    )
-    current_endpoint = None
-    for k, v in lfs_env_items:
-        m = endpoint_re.match(k)
-        if m:
-            if current_endpoint:
-                yield current_endpoint
-            href, _, _ = v.partition(" ")
-            current_endpoint = Endpoint(name=m[1] or "default", href=href)
-        elif k == " SSH" and current_endpoint:
-            current_endpoint.ssh = v
-    if current_endpoint:
-        yield current_endpoint
-
-
-def get_endpoints() -> typing.Iterable[Endpoint]:
-    for endpoint in get_endpoint_addresses():
-        endpoint.headers = {
-            "Content-Type": "application/vnd.git-lfs+json",
-            "Accept": "application/vnd.git-lfs+json",
-        }
-        if endpoint.ssh:
-            # see https://github.com/git-lfs/git-lfs/blob/main/docs/api/authentication.md
-            server, _, path = endpoint.ssh.partition(":")
-            ssh_command = shutil.which(
-                os.environ.get("GIT_SSH", os.environ.get("GIT_SSH_COMMAND", "ssh"))
-            )
-            assert ssh_command, "no ssh command found"
-            cmd = [
-                ssh_command,
-                "-oStrictHostKeyChecking=accept-new",
-                server,
-                "git-lfs-authenticate",
-                path,
-                "download",
-            ]
-            try:
-                res = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=TIMEOUT)
-            except subprocess.TimeoutExpired:
-                warn(f"ssh timed out when connecting to {server}, ignoring {endpoint.name} endpoint")
-                continue
-            if res.returncode != 0:
-                warn(f"ssh failed when connecting to {server}, ignoring {endpoint.name} endpoint")
-                continue
-            ssh_resp = json.loads(res.stdout)
-            endpoint.href = ssh_resp.get("href", endpoint)
-            endpoint.update_headers(ssh_resp.get("header", {}).items())
-        url = urlparse(endpoint.href)
-        # this is how actions/checkout persist credentials
-        # see https://github.com/actions/checkout/blob/44c2b7a8a4ea60a981eaca3cf939b5f4305c123b/src/git-auth-helper.ts#L56-L63
-        auth = git("config", f"http.{url.scheme}://{url.netloc}/.extraheader") or ""
-        endpoint.update_headers(get_env(auth, sep=": "))
-        if os.environ.get("GITHUB_TOKEN"):
-            endpoint.headers["Authorization"] = f"token {os.environ['GITHUB_TOKEN']}"
-        if "Authorization" not in endpoint.headers:
-            # last chance: use git credentials (possibly backed by a credential helper like the one installed by gh)
-            # see https://git-scm.com/docs/git-credential
-            credentials = git(
-                "credential",
-                "fill",
-                check=True,
-                # drop leading / from url.path
-                input=f"protocol={url.scheme}\nhost={url.netloc}\npath={url.path[1:]}\n",
-            )
-            if credentials is None:
-                warn(f"no authorization method found, ignoring {endpoint.name} endpoint")
-                continue
-            credentials = dict(get_env(credentials))
-            auth = base64.b64encode(
-                f'{credentials["username"]}:{credentials["password"]}'.encode()
-            ).decode("ascii")
-            endpoint.headers["Authorization"] = f"Basic {auth}"
-        yield endpoint
-
-
-# see https://github.com/git-lfs/git-lfs/blob/310d1b4a7d01e8d9d884447df4635c7a9c7642c2/docs/api/basic-transfers.md
-def get_locations(objects):
+def get_locations(objects, opts):
     ret = ["local" for _ in objects]
     indexes = [i for i, o in enumerate(objects) if o]
     if not indexes:
-        # all objects are local, do not send an empty request as that would be an error
+        # all objects are local: nothing to resolve, so skip invoking git-lfs altogether
         return ret
     if opts.hash_only:
         for i in indexes:
             ret[i] = objects[i]["oid"]
-        return ret
-    data = {
-        "operation": "download",
-        "transfers": ["basic"],
-        "objects": [objects[i] for i in indexes],
-        "hash_algo": "sha256",
-    }
-    for endpoint in get_endpoints():
-        req = urllib.request.Request(
-            f"{endpoint.href}/objects/batch",
-            headers=endpoint.headers,
-            data=json.dumps(data).encode("ascii"),
-        )
-        try:
-            with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
-                data = json.load(resp)
-            assert len(data["objects"]) == len(
-                indexes
-            ), f"received {len(data)} objects, expected {len(indexes)}"
-            for i, resp in zip(indexes, data["objects"]):
-                ret[i] = f'{resp["oid"]} {resp["actions"]["download"]["href"]}'
-            return ret
-        except urllib.error.URLError as e:
-            warn(f"encountered {type(e).__name__} {e}, ignoring endpoint {endpoint.name}")
-            continue
-        except KeyError:
-            warn(f"encountered malformed response, ignoring endpoint {endpoint.name}:\n{json.dumps(data, indent=2)}")
-            continue
-    raise NoEndpointsFound
-
+    else:
+        cmd = [opts.git_lfs, "ls-urls", "--json"]
+        cmd.extend(objects[i]["path"] for i in indexes)
+        data = json.loads(subprocess.check_output(cmd, cwd=opts.source_dir))
+        for i, f in zip(indexes, data["files"]):
+            ret[i] = f'{f["oid"]} {f["url"]}'
+    return ret
 
 def get_lfs_object(path):
     with open(path, "rb") as fileobj:
         lfs_header = "version https://git-lfs.github.com/spec".encode()
         actual_header = fileobj.read(len(lfs_header))
-        sha256 = size = None
         if lfs_header != actual_header:
             return None
         data = dict(get_env(fileobj.read().decode("ascii"), sep=" "))
         assert data["oid"].startswith("sha256:"), f"unknown oid type: {data['oid']}"
         _, _, sha256 = data["oid"].partition(":")
-        size = int(data["size"])
-    return {"oid": sha256, "size": size}
+    return {"path": path, "oid": sha256}
 
 
-try:
-    objects = [get_lfs_object(src) for src in sources]
-    for resp in get_locations(objects):
+def main():
+    opts = options()
+    objects = [get_lfs_object(src) for src in opts.sources]
+    for resp in get_locations(objects, opts):
         print(resp)
-except NoEndpointsFound as e:
-    print("""\
-ERROR: no valid endpoints found, your git authentication method might be currently unsupported by this script.
-You can bypass this error by running from semmle-code (this might take a while):
-  git config lfs.fetchexclude ""
-  git -C ql config lfs.fetchinclude \\*
-  git lfs fetch && git lfs checkout
-  cd ql
-  git lfs fetch && git lfs checkout""", file=sys.stderr)
-    sys.exit(1)
+
+if __name__ == "__main__":
+    main()
diff --git a/misc/bazel/lfs.bzl b/misc/bazel/lfs.bzl
index 7c37c0a55236..347823375b07 100644
--- a/misc/bazel/lfs.bzl
+++ b/misc/bazel/lfs.bzl
@@ -2,13 +2,16 @@ def lfs_smudge(repository_ctx, srcs, *, extract = False, stripPrefix = None, exe
     python = repository_ctx.which("python3") or repository_ctx.which("python")
     if not python:
         fail("Neither python3 nor python executables found")
-    script = Label("//misc/bazel/internal:git_lfs_probe.py")
+    script = repository_ctx.path(Label("//misc/bazel/internal:git_lfs_probe.py"))
+    git_lfs_binary = repository_ctx.path(Label("@git-lfs"))
 
     def probe(srcs, hash_only = False):
         repository_ctx.report_progress("querying LFS url(s) for: %s" % ", ".join([src.basename for src in srcs]))
         cmd = [python, script]
         if hash_only:
             cmd.append("--hash-only")
+        else:
+            cmd += ["--git-lfs", git_lfs_binary]
         cmd.extend(srcs)
         res = repository_ctx.execute(cmd, quiet = True)
         if res.return_code != 0:
@@ -102,3 +105,50 @@ lfs_files = repository_rule(
         "executable": attr.bool(doc = "Whether files should be marked as executable"),
     },
 )
+
+def _lfs_binary_impl(repository_ctx):
+    suffix = ""
+    if repository_ctx.os.name.startswith("windows"):
+        arch = "windows-amd64"
+        sha256 = repository_ctx.attr.sha256_windows
+        suffix = ".exe"
+    elif repository_ctx.os.name.startswith("mac"):
+        # repository_ctx.os.arch is the lower-cased Java `os.arch` property, reported as
+        # `x86_64` on Intel macs; `x86` is still accepted for backward compatibility
+        if repository_ctx.os.arch in ("x86_64", "amd64", "x86"):
+            arch = "darwin-amd64"
+            sha256 = repository_ctx.attr.sha256_macos_x86
+        else:
+            arch = "darwin-arm64"
+            sha256 = repository_ctx.attr.sha256_macos_arm64
+    else:
+        arch = "linux-amd64"
+        sha256 = repository_ctx.attr.sha256_linux
+    url = "https://github.com/dsp-testing/codeql-git-lfs/releases/download/%s/git-lfs-%s%s" % (
+        repository_ctx.attr.version,
+        arch,
+        suffix,
+    )
+    exe = "git-lfs" + suffix
+    repository_ctx.download(
+        url = url,
+        output = exe,
+        sha256 = sha256,
+        executable = True,
+    )
+    name = repository_ctx.name.split("+")[-1]
+    if suffix:
+        repository_ctx.file("BUILD.bazel", "filegroup(name = %r, srcs = [%r], visibility = ['//visibility:public'])" % (name, exe))
+    else:
+        repository_ctx.file("BUILD.bazel", "exports_files([%r])" % exe)
+
+git_lfs_binary = repository_rule(
+    implementation = _lfs_binary_impl,
+    attrs = {
+        "version": attr.string(mandatory = True),
+        "sha256_linux": attr.string(mandatory = True),
+        "sha256_macos_x86": attr.string(mandatory = True),
+        "sha256_macos_arm64": attr.string(mandatory = True),
+        "sha256_windows": attr.string(mandatory = True),
+    },
+)