diff --git a/src/bin/cmd-index.ts b/src/bin/cmd-index.ts index bb521f8..4a980d1 100644 --- a/src/bin/cmd-index.ts +++ b/src/bin/cmd-index.ts @@ -7,6 +7,7 @@ import { Indexer } from "../core/indexer.js"; import { Source } from "../sources/types.js"; import { FilesystemStore } from "../stores/filesystem.js"; import { getS3Config } from "../stores/s3-config.js"; +import { parseSourceUrl } from "../core/url-parser.js"; // Shared store options interface StoreOptions { @@ -203,9 +204,74 @@ websiteCommand.action(async (options) => { } }); +// URL-based indexing command: parseSourceUrl() decides whether the URL is a github, gitlab, bitbucket or website source. NOTE(review): the argument name "" and the flag "--ref " below carry no value placeholder; presumably "<url>" and "--ref <ref>" were intended - confirm. +const urlCommand = new Command("url") + .description("Index from a URL (auto-detects source type)") + .argument("", "URL of the repository or website to index") + .option("--ref ", "Branch, tag, or commit (overrides URL-detected ref)"); +addStoreOptions(urlCommand); +urlCommand.action(async (url: string, options) => { + try { + // Parse the URL to determine source type and config; the index key falls back to the parsed default name, and an explicit --ref overrides the ref taken from the URL (the website source is built without a ref) + const parsed = parseSourceUrl(url); + const indexKey = options.index || parsed.defaultIndexName; + + let source: Source; + + switch (parsed.type) { + case "github": { + const { GitHubSource } = await import("../sources/github.js"); + const config = parsed.config as import("../sources/github.js").GitHubSourceConfig; + source = new GitHubSource({ + ...config, + ref: options.ref || config.ref, + }); + break; + } + case "gitlab": { + const { GitLabSource } = await import("../sources/gitlab.js"); + const config = parsed.config as import("../sources/gitlab.js").GitLabSourceConfig; + source = new GitLabSource({ + ...config, + ref: options.ref || config.ref, + }); + break; + } + case "bitbucket": { + const { BitBucketSource } = await import("../sources/bitbucket.js"); + const config = parsed.config as import("../sources/bitbucket.js").BitBucketSourceConfig; + source = new BitBucketSource({ + ...config, + ref: options.ref || config.ref, + }); + break; + } + case "website": { + const { WebsiteSource } = await 
import("../sources/website.js"); + const config = parsed.config as import("../sources/website.js").WebsiteSourceConfig; + source = new WebsiteSource(config); + break; + } + default: + throw new Error(`Unknown source type: ${parsed.type}`); + } + + const store = await createStore(options); + await runIndex(source, store, indexKey, parsed.type); + } catch (error) { /* NOTE(review): the try block also covers createStore() and runIndex(), so any Error from them whose message contains "Invalid" is reported as a URL parsing error */ + if (error instanceof Error && error.message.includes("Invalid")) { + console.error(`Error parsing URL: ${error.message}`); + } else { + console.error("Indexing failed:", error); + } + process.exit(1); + } +}); + // Main index command export const indexCommand = new Command("index") .description("Index a data source") + .addCommand(urlCommand) .addCommand(githubCommand) .addCommand(gitlabCommand) .addCommand(bitbucketCommand) diff --git a/src/bin/index.ts b/src/bin/index.ts index b0c06f3..8636046 100644 --- a/src/bin/index.ts +++ b/src/bin/index.ts @@ -29,5 +29,18 @@ program.addCommand(searchCommand); program.addCommand(mcpCommand); program.addCommand(agentCommand); -program.parse(); +// Auto-detect URL mode: "ctxc index <url>" is rewritten to "ctxc index url <url>" by inserting "url" into process.argv before parsing +// This allows users to skip the 'url' subcommand when providing a URL directly. Only an argument starting with http:// or https:// immediately after "index" triggers the rewrite, so the subcommands.includes() check below can never be true for it. +const indexIdx = process.argv.indexOf("index"); +if (indexIdx !== -1 && indexIdx + 1 < process.argv.length) { + const nextArg = process.argv[indexIdx + 1]; + const subcommands = ["url", "github", "gitlab", "bitbucket", "website"]; + if ( + nextArg.match(/^https?:\/\//) && + !subcommands.includes(nextArg) + ) { + process.argv.splice(indexIdx + 1, 0, "url"); + } +} +program.parse(); diff --git a/src/core/index.ts b/src/core/index.ts index d139515..9c9d1a2 100644 --- a/src/core/index.ts +++ b/src/core/index.ts @@ -25,3 +25,6 @@ export { sanitizeKey, isoTimestamp } from "./utils.js"; export { Indexer } from "./indexer.js"; export type { IndexerConfig } from "./indexer.js"; +export { parseSourceUrl } from "./url-parser.js"; +export type { ParsedUrl } from "./url-parser.js"; + diff --git 
a/src/core/url-parser.test.ts b/src/core/url-parser.test.ts new file mode 100644 index 0000000..67e753a --- /dev/null +++ b/src/core/url-parser.test.ts @@ -0,0 +1,163 @@ +import { describe, it, expect } from "vitest"; +import { parseSourceUrl } from "./url-parser.js"; /* Tests below are grouped per provider (GitHub, GitLab, Bitbucket), followed by the website fallback and malformed input; each case checks type, config and, in most cases, defaultIndexName. */ + +describe("parseSourceUrl", () => { + describe("GitHub URLs", () => { + it("parses basic github.com URL", () => { + const result = parseSourceUrl("https://github.com/owner/repo"); + expect(result.type).toBe("github"); + expect(result.config).toEqual({ owner: "owner", repo: "repo", ref: "HEAD" }); + expect(result.defaultIndexName).toBe("repo"); + }); + + it("parses GitHub URL with tree/branch", () => { + const result = parseSourceUrl("https://github.com/owner/repo/tree/main"); + expect(result.type).toBe("github"); + expect(result.config).toEqual({ owner: "owner", repo: "repo", ref: "main" }); + expect(result.defaultIndexName).toBe("repo"); + }); + + it("parses GitHub URL with tree/feature/branch (slashes in branch name)", () => { + const result = parseSourceUrl("https://github.com/owner/repo/tree/feature/branch"); + expect(result.type).toBe("github"); + expect(result.config).toEqual({ owner: "owner", repo: "repo", ref: "feature/branch" }); + expect(result.defaultIndexName).toBe("repo"); + }); + + it("parses GitHub URL with commit SHA", () => { + const result = parseSourceUrl("https://github.com/owner/repo/commit/abc123def456"); + expect(result.type).toBe("github"); + expect(result.config).toEqual({ owner: "owner", repo: "repo", ref: "abc123def456" }); + expect(result.defaultIndexName).toBe("repo"); + }); + + it("throws on invalid GitHub URL without repo", () => { + expect(() => parseSourceUrl("https://github.com/owner")).toThrow("Invalid GitHub URL"); + }); + }); + + describe("GitLab URLs", () => { + it("parses basic gitlab.com URL", () => { + const result = parseSourceUrl("https://gitlab.com/group/project"); + expect(result.type).toBe("gitlab"); + expect(result.config).toEqual({ projectId: 
"group/project", ref: "HEAD", baseUrl: undefined }); + expect(result.defaultIndexName).toBe("project"); + }); + + it("parses GitLab URL with subgroups", () => { + const result = parseSourceUrl("https://gitlab.com/group/subgroup/project"); + expect(result.type).toBe("gitlab"); + expect(result.config).toEqual({ + projectId: "group/subgroup/project", + ref: "HEAD", + baseUrl: undefined, + }); + expect(result.defaultIndexName).toBe("project"); + }); + + it("parses GitLab URL with /-/tree/branch", () => { + const result = parseSourceUrl("https://gitlab.com/group/project/-/tree/main"); + expect(result.type).toBe("gitlab"); + expect(result.config).toEqual({ projectId: "group/project", ref: "main", baseUrl: undefined }); + expect(result.defaultIndexName).toBe("project"); + }); + + it("parses GitLab URL with /-/tree/feature/branch", () => { + const result = parseSourceUrl("https://gitlab.com/group/project/-/tree/feature/branch"); + expect(result.type).toBe("gitlab"); + expect(result.config).toEqual({ + projectId: "group/project", + ref: "feature/branch", + baseUrl: undefined, + }); + }); + + it("parses self-hosted GitLab URL", () => { /* host other than gitlab.com: baseUrl is expected to be the URL origin */ + const result = parseSourceUrl("https://gitlab.mycompany.com/team/project"); + expect(result.type).toBe("gitlab"); + expect(result.config).toEqual({ + projectId: "team/project", + ref: "HEAD", + baseUrl: "https://gitlab.mycompany.com", + }); + expect(result.defaultIndexName).toBe("project"); + }); + + it("throws on invalid GitLab URL", () => { + expect(() => parseSourceUrl("https://gitlab.com/group")).toThrow("Invalid GitLab URL"); + }); + }); + + describe("Bitbucket URLs", () => { + it("parses basic bitbucket.org URL", () => { + const result = parseSourceUrl("https://bitbucket.org/workspace/repo"); + expect(result.type).toBe("bitbucket"); + expect(result.config).toEqual({ + workspace: "workspace", + repo: "repo", + ref: "HEAD", + baseUrl: undefined, + }); + expect(result.defaultIndexName).toBe("repo"); + }); + + it("parses Bitbucket URL with 
/src/branch", () => { + const result = parseSourceUrl("https://bitbucket.org/workspace/repo/src/main"); + expect(result.type).toBe("bitbucket"); + expect(result.config).toEqual({ + workspace: "workspace", + repo: "repo", + ref: "main", + baseUrl: undefined, + }); + }); + + it("parses Bitbucket URL with /branch/feature", () => { + const result = parseSourceUrl("https://bitbucket.org/workspace/repo/branch/feature"); + expect(result.type).toBe("bitbucket"); + expect(result.config).toEqual({ + workspace: "workspace", + repo: "repo", + ref: "feature", + baseUrl: undefined, + }); + }); + + it("parses self-hosted Bitbucket URL", () => { + const result = parseSourceUrl("https://bitbucket.mycompany.com/workspace/repo"); + expect(result.type).toBe("bitbucket"); + expect(result.config).toEqual({ + workspace: "workspace", + repo: "repo", + ref: "HEAD", + baseUrl: "https://bitbucket.mycompany.com", + }); + }); + + it("throws on invalid Bitbucket URL", () => { + expect(() => parseSourceUrl("https://bitbucket.org/workspace")).toThrow("Invalid Bitbucket URL"); + }); + }); + + describe("Website URLs (fallback)", () => { + it("parses unknown URL as website", () => { + const result = parseSourceUrl("https://docs.example.com/api/v2"); + expect(result.type).toBe("website"); + expect(result.config).toEqual({ url: "https://docs.example.com/api/v2" }); + expect(result.defaultIndexName).toBe("docs.example.com"); + }); + + it("uses hostname as default index name for website", () => { + const result = parseSourceUrl("https://react.dev/learn/thinking-in-react"); + expect(result.type).toBe("website"); + expect(result.defaultIndexName).toBe("react.dev"); + }); + }); + + describe("Invalid URLs", () => { + it("throws on invalid URL format", () => { + expect(() => parseSourceUrl("not-a-url")).toThrow(); + }); + }); +}); + diff --git a/src/core/url-parser.ts b/src/core/url-parser.ts new file mode 100644 index 0000000..f2fe3b4 --- /dev/null +++ b/src/core/url-parser.ts @@ -0,0 +1,171 @@ +/** + * URL 
/**
 * URL Parser - Parses source URLs to determine type and extract configuration
 *
 * @module core/url-parser
 */

import type { GitHubSourceConfig } from "../sources/github.js";
import type { GitLabSourceConfig } from "../sources/gitlab.js";
import type { BitBucketSourceConfig } from "../sources/bitbucket.js";
import type { WebsiteSourceConfig } from "../sources/website.js";

/**
 * Result of parsing a source URL
 */
export interface ParsedUrl {
  type: "github" | "gitlab" | "bitbucket" | "website";
  config: GitHubSourceConfig | GitLabSourceConfig | BitBucketSourceConfig | WebsiteSourceConfig;
  defaultIndexName: string;
}

/** Ref used when the URL does not name a branch, tag, or commit. */
const DEFAULT_REF = "HEAD";

/** Path segments that are followed by a ref, per provider. */
const GITHUB_REF_MARKERS: readonly string[] = ["tree", "commit"];
const GITLAB_REF_MARKERS: readonly string[] = ["tree", "commits", "commit"];
const BITBUCKET_REF_MARKERS: readonly string[] = ["src", "branch"];

/** Split a URL path into its non-empty segments (ignores leading, trailing and doubled slashes). */
function splitPath(url: URL): string[] {
  return url.pathname.split("/").filter(Boolean);
}

/** Drop the ".git" suffix that clone URLs carry, so "repo.git" and "repo" name the same repository. */
function stripGitSuffix(name: string): string {
  // The length guard keeps a segment that is literally ".git" from collapsing to "".
  return name.length > 4 && name.endsWith(".git") ? name.slice(0, -4) : name;
}

/** True when `hostname` is the provider's public host, with or without a "www." prefix. */
function isCanonicalHost(hostname: string, canonical: string): boolean {
  return hostname === canonical || hostname === `www.${canonical}`;
}

/**
 * Pick the ref that follows a marker segment.
 * Falls back to DEFAULT_REF when the marker is not recognised or nothing follows it,
 * so the result is never an empty string.
 */
function resolveRef(
  marker: string | undefined,
  refParts: readonly string[],
  markers: readonly string[],
): string {
  if (marker === undefined || refParts.length === 0 || !markers.includes(marker)) {
    return DEFAULT_REF;
  }
  // Join all remaining parts so branch names containing slashes survive.
  return refParts.join("/");
}

/**
 * Parse a source URL to determine the source type and extract configuration.
 *
 * Host matching: `github.com` (optionally with `www.`) is GitHub; any hostname
 * containing "gitlab" is GitLab; any hostname containing "bitbucket" is
 * Bitbucket; everything else is treated as a website.
 *
 * @param urlString - The URL to parse
 * @returns Parsed URL with type, config, and default index name
 * @throws TypeError if `urlString` is not a valid absolute URL (from the `URL` constructor)
 * @throws Error with a message starting "Invalid GitHub URL", "Invalid GitLab URL"
 *   or "Invalid Bitbucket URL" when the path lacks the owner/repo (or project) parts
 *
 * @example
 * ```typescript
 * const result = parseSourceUrl("https://github.com/owner/repo/tree/main");
 * // result.type === "github"
 * // result.config === { owner: "owner", repo: "repo", ref: "main" }
 * // result.defaultIndexName === "repo"
 * ```
 */
export function parseSourceUrl(urlString: string): ParsedUrl {
  const url = new URL(urlString);
  const hostname = url.hostname.toLowerCase();

  // GitHub
  if (isCanonicalHost(hostname, "github.com")) {
    return parseGitHubUrl(url);
  }

  // GitLab (gitlab.com or any self-hosted hostname containing "gitlab")
  if (hostname.includes("gitlab")) {
    return parseGitLabUrl(url);
  }

  // Bitbucket (bitbucket.org or any self-hosted hostname containing "bitbucket")
  if (hostname.includes("bitbucket")) {
    return parseBitBucketUrl(url);
  }

  // Fallback to website
  return {
    type: "website",
    config: { url: urlString },
    defaultIndexName: hostname,
  };
}

/**
 * Parse a GitHub URL
 * Formats:
 * - https://github.com/owner/repo
 * - https://github.com/owner/repo.git
 * - https://github.com/owner/repo/tree/branch
 * - https://github.com/owner/repo/tree/feature/branch
 * - https://github.com/owner/repo/commit/sha
 */
function parseGitHubUrl(url: URL): ParsedUrl {
  const [owner, rawRepo, marker, ...refParts] = splitPath(url);

  if (owner === undefined || rawRepo === undefined) {
    throw new Error(`Invalid GitHub URL: ${url.href} - expected owner and repo in path`);
  }

  const repo = stripGitSuffix(rawRepo);
  const ref = resolveRef(marker, refParts, GITHUB_REF_MARKERS);

  return {
    type: "github",
    config: { owner, repo, ref },
    defaultIndexName: repo,
  };
}

/**
 * Parse a GitLab URL
 * Formats:
 * - https://gitlab.com/group/project
 * - https://gitlab.com/group/project.git
 * - https://gitlab.com/group/subgroup/project
 * - https://gitlab.com/group/project/-/tree/branch
 * - https://gitlab.com/group/project/-/commits/branch
 * - https://gitlab.com/group/project/-/commit/sha
 */
function parseGitLabUrl(url: URL): ParsedUrl {
  const pathParts = splitPath(url);

  // Everything before the "-" separator is the project path; the rest is a sub-page.
  const dashIndex = pathParts.indexOf("-");
  const projectParts = dashIndex === -1 ? pathParts : pathParts.slice(0, dashIndex);

  // Validate the project path itself, not the raw path, so "/group/-/tree/main" is rejected.
  const rawName = projectParts[projectParts.length - 1];
  if (projectParts.length < 2 || rawName === undefined) {
    throw new Error(`Invalid GitLab URL: ${url.href} - expected project path`);
  }

  const projectName = stripGitSuffix(rawName);
  const projectId = [...projectParts.slice(0, -1), projectName].join("/");

  const ref =
    dashIndex === -1
      ? DEFAULT_REF
      : resolveRef(pathParts[dashIndex + 1], pathParts.slice(dashIndex + 2), GITLAB_REF_MARKERS);

  // Only self-hosted instances need an explicit base URL.
  const baseUrl = isCanonicalHost(url.hostname.toLowerCase(), "gitlab.com") ? undefined : url.origin;

  return {
    type: "gitlab",
    config: { projectId, ref, baseUrl },
    defaultIndexName: projectName,
  };
}

/**
 * Parse a Bitbucket URL
 * Formats:
 * - https://bitbucket.org/workspace/repo
 * - https://bitbucket.org/workspace/repo.git
 * - https://bitbucket.org/workspace/repo/src/branch
 * - https://bitbucket.org/workspace/repo/branch/feature
 */
function parseBitBucketUrl(url: URL): ParsedUrl {
  const [workspace, rawRepo, marker, ...refParts] = splitPath(url);

  if (workspace === undefined || rawRepo === undefined) {
    throw new Error(`Invalid Bitbucket URL: ${url.href} - expected workspace and repo in path`);
  }

  const repo = stripGitSuffix(rawRepo);
  const ref = resolveRef(marker, refParts, BITBUCKET_REF_MARKERS);

  // Only self-hosted instances need an explicit base URL.
  const baseUrl = isCanonicalHost(url.hostname.toLowerCase(), "bitbucket.org") ? undefined : url.origin;

  return {
    type: "bitbucket",
    config: { workspace, repo, ref, baseUrl },
    defaultIndexName: repo,
  };
}