Skip to content

Commit 72242d9

Browse files
committed
Add a server-side Maven metadata harvester
Usage: python status.py groupId1:artifactId1 groupId2:artifactId2 ... The groupId:artifactId arguments are treated as regex. Or run "python status.py" with no arguments for the whole SciJava BOM.
1 parent 86cfd25 commit 72242d9

File tree

2 files changed

+292
-0
lines changed

2 files changed

+292
-0
lines changed

maven.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
#!/usr/bin/env python
2+
#
3+
# This is free and unencumbered software released into the public domain.
4+
# See the UNLICENSE file for details.
5+
#
6+
# ------------------------------------------------------------------------
7+
# maven.py
8+
# ------------------------------------------------------------------------
9+
# Supporting library for harvesting metadata about Maven components.
10+
# Requires direct access to the backing storage of the repositories.
11+
12+
import datetime, logging, os, pathlib, re
13+
import xml.etree.ElementTree as ET
14+
15+
# Directory holding the repositories' backing storage; each repository
# named below is looked up as a subdirectory of this path.
storage = "/opt/sonatype-work/nexus/storage"

# Repositories searched when looking for release metadata and POMs.
release_repos = [
    "releases",
    "thirdparty",
    "sonatype",
    "central",
    "ome-releases",
]

# Repositories searched when looking for SNAPSHOT metadata and POMs.
snapshot_repos = [
    "snapshots",
    "sonatype-snapshots",
    "ome-snapshots",
]

# Maximum difference, in seconds, tolerated between a requested SNAPSHOT
# timestamp and the timestamp embedded in a deployed POM's filename.
ts_allowance = 10
20+
21+
class XML:
    """
    Thin wrapper around an ElementTree parse of an XML document.

    Namespace prefixes are stripped from every tag and attribute name when
    the document is loaded, so elements can be addressed with plain paths
    such as "scm/url".
    """

    def __init__(self, source):
        """
        Parse the given XML document and strip its namespaces.

        :param source: Whatever xml.etree.ElementTree.parse accepts
                       (a filename or a file object). Kept as self.source.
        """
        self.source = source
        self.tree = ET.parse(source)
        XML._strip_ns(self.tree.getroot())

    def elements(self, path):
        """
        Return the list of all elements matching the given path.
        """
        return self.tree.findall(path)

    def value(self, path):
        """
        Return the text of the one element matching the given path.

        :param path: Element path to look up.
        :return: The element's text, or None if no element matches.
        :raises ValueError: If more than one element matches the path.
        """
        found = self.elements(path)
        # Raise explicitly rather than assert: assertions vanish under -O,
        # which would silently return the first of several matches.
        if len(found) > 1:
            raise ValueError(
                f"Expected at most one element at '{path}' in {self.source}, "
                f"found {len(found)}")
        return found[0].text if found else None

    @staticmethod
    def _strip_ns(el):
        """
        Remove namespace prefixes from elements and attributes.
        Credit: https://stackoverflow.com/a/32552776/1207769
        """
        # iter() yields el itself followed by all of its descendants.
        for node in el.iter():
            if node.tag.startswith("{"):
                node.tag = node.tag[node.tag.find("}") + 1:]
            # Snapshot the keys, because the mapping is edited in the loop.
            for key in list(node.attrib.keys()):
                if key.startswith("{"):
                    bare = key[key.find("}") + 1:]
                    node.attrib[bare] = node.attrib.pop(key)
51+
52+
class MavenPOM(XML):
    """
    Convenience accessors for the contents of a Maven POM.
    """

    @property
    def groupId(self):
        """The POM's groupId, or else the groupId of its parent section."""
        return self.value("groupId") or self.value("parent/groupId")

    @property
    def artifactId(self):
        """The POM's artifactId, or None if absent."""
        return self.value("artifactId")

    @property
    def version(self):
        """The POM's version, or else the version of its parent section."""
        return self.value("version") or self.value("parent/version")

    @property
    def scmURL(self):
        """Text of scm/url, or None if absent."""
        return self.value("scm/url")

    @property
    def issuesURL(self):
        """Text of issueManagement/url, or None if absent."""
        return self.value("issueManagement/url")

    @property
    def ciURL(self):
        """Text of ciManagement/url, or None if absent."""
        return self.value("ciManagement/url")

    @property
    def developers(self):
        """
        List of dicts, one per developers/developer element.

        Each dict is keyed by the tag of the developer's child elements:
        * a child with no children of its own maps to its text;
        * a "properties" child maps to a dict of grandchild tag -> text;
        * any other child with children maps to a list of grandchild texts
          (e.g. "roles" becomes a list of role names).
        """
        devs = []
        for el in self.elements("developers/developer"):
            dev = {}
            for child in el:
                if len(child) == 0:
                    dev[child.tag] = child.text
                elif child.tag == 'properties':
                    dev[child.tag] = {grand.tag: grand.text for grand in child}
                else:
                    dev[child.tag] = [grand.text for grand in child]
            devs.append(dev)
        return devs

    @property
    def reviewers(self):
        """Ids of the developers having the "reviewer" role."""
        return self._dev_role("reviewer")

    @property
    def support(self):
        """Ids of the developers having the "support" role."""
        return self._dev_role("support")

    @property
    def maintainers(self):
        """Ids of the developers having the "maintainer" role."""
        return self._dev_role("maintainer")

    def _dev_role(self, role):
        """
        Return the ids of all developers whose roles include the given role.

        Developers without roles (a missing or empty roles element) never
        match, and developers without an id are left out of the result.
        """
        ids = []
        for dev in self.developers:
            # An empty <roles/> is stored as its text (None), not a list.
            roles = dev.get("roles") or ()
            if role not in roles:
                continue
            dev_id = dev.get("id")
            if dev_id is not None:
                ids.append(dev_id)
        return ids
107+
108+
class MavenMetadata(XML):
    """
    Convenience accessors for the contents of a maven-metadata.xml file.
    """

    @property
    def groupId(self):
        """
        Text of groupId, or else of parent/groupId; None if both are absent.
        """
        # value() returns None for a missing element instead of raising, so
        # the fallback is written with "or". No exception is swallowed here:
        # a document with duplicate elements is reported, not hidden.
        return self.value("groupId") or self.value("parent/groupId")

    @property
    def artifactId(self):
        """Text of artifactId, or None if absent."""
        return self.value("artifactId")

    @property
    def lastUpdated(self):
        """Value of versioning/lastUpdated as an int, or None if absent."""
        result = self.value("versioning/lastUpdated")
        return None if result is None else int(result)

    @property
    def latest(self):
        """Text of versioning/latest, or None if absent."""
        # WARNING: The <latest> value is often wrong, for reasons I don't know.
        # However, the last <version> under <versions> has the correct value.
        # Consider using lastVersion instead of latest.
        return self.value("versioning/latest")

    @property
    def lastVersion(self):
        """
        Text of the final versioning/versions/version element, or None if
        there are no such elements.
        """
        vs = self.elements("versioning/versions/version")
        return vs[-1].text if vs else None

    @property
    def release(self):
        """Text of versioning/release, or None if absent."""
        return self.value("versioning/release")
142+
class MavenComponent:
    """
    A Maven component (groupId:artifactId) looked up in repository storage.

    Attributes set by the constructor:
    * groupId, artifactId - the coordinates that were passed in.
    * release  - MavenMetadata found in the release repositories, or None.
    * snapshot - MavenMetadata found in the snapshot repositories, or None.
    * pom      - MavenPOM of the newest snapshot if snapshot metadata is
                 usable, otherwise of the newest release; None if no POM
                 could be located.
    """

    def __init__(self, g, a):
        """
        Look up the metadata and POM for the given coordinates.

        :param g: The component's groupId.
        :param a: The component's artifactId.
        """
        self.groupId = g
        self.artifactId = a
        self.release = MavenComponent._metadata(release_repos, g, a)
        self.snapshot = MavenComponent._metadata(snapshot_repos, g, a)
        self.pom = None

        # Metadata may lack <versions> or <lastUpdated>; treat such metadata
        # as unusable for locating a POM rather than crashing on None.
        snapshot_version = self.snapshot.lastVersion if self.snapshot else None
        snapshot_updated = self.snapshot.lastUpdated if self.snapshot else None
        release_version = self.release.lastVersion if self.release else None

        if snapshot_version is not None and snapshot_updated is not None:
            # Get the newest POM possible, based on last updated SNAPSHOT.
            self.pom = MavenComponent._pom(snapshot_repos, g, a, v=snapshot_version,
                                           ts=str(snapshot_updated))
        elif release_version is not None:
            # Get the POM of the newest release.
            self.pom = MavenComponent._pom(release_repos, g, a, v=release_version)

    @staticmethod
    def _metadata(repos, g, a):
        """
        Return the most recently updated maven-metadata.xml among the repos.

        :param repos: Names of the repositories to search, in order.
        :param g: The component's groupId.
        :param a: The component's artifactId.
        :return: The MavenMetadata with the largest lastUpdated value, or
                 None if no repository has metadata for the component.
                 Metadata without a lastUpdated value is only kept when
                 nothing with a lastUpdated value has been found.
        """
        suffix = f"{g.replace('.', '/')}/{a}/maven-metadata.xml"
        best = None
        best_updated = None
        for repo in repos:
            path = f"{storage}/{repo}/{suffix}"
            if not os.path.exists(path):
                continue
            m = MavenMetadata(path)
            updated = m.lastUpdated
            # Guard both sides against None before comparing: an earlier
            # candidate without <lastUpdated> must not trigger int > None.
            newer = updated is not None and (best_updated is None or updated > best_updated)
            if best is None or newer:
                best = m
                best_updated = updated
        return best

    @staticmethod
    def _ts2dt(ts):
        """
        Converts Maven-style timestamp strings into Python datetime objects.

        Valid forms:
        * 20210702144918 (seen in <lastUpdated> in maven-metadata.xml)
        * 20210702.144917 (seen in deployed SNAPSHOT filenames)

        :raises ValueError: If the string does not start with a timestamp
                            in one of the above forms.
        """
        m = re.match(r"(\d{4})(\d\d)(\d\d)\.?(\d\d)(\d\d)(\d\d)", ts)
        if not m:
            raise ValueError(f"Invalid timestamp: {ts}")
        return datetime.datetime(*map(int, m.groups()))

    @staticmethod
    def _within_allowance(dt_a, dt_b, allowance):
        """
        Return True if two datetimes differ by at most allowance seconds.
        """
        # total_seconds() covers the whole interval. timedelta.seconds holds
        # only the sub-day remainder, so it would accept times that are
        # whole days apart.
        return abs(dt_a - dt_b).total_seconds() <= allowance

    @staticmethod
    def _pom(repos, g, a, v, ts=None):
        """
        Locate the POM for the given groupId:artifactId:version.

        :param repos: Names of the repositories to search, in order.
        :param g: The component's groupId.
        :param a: The component's artifactId.
        :param v: The version. Versions ending in -SNAPSHOT are matched
                  against timestamped POM filenames.
        :param ts: Timestamp string (see _ts2dt) of the desired SNAPSHOT
                   deployment. Required for -SNAPSHOT versions.
        :return: A MavenPOM for the first matching file, or None if no
                 repository contains one.
        :raises ValueError: If v is a SNAPSHOT version and ts is missing
                            or not a valid timestamp.
        """
        gav_path = f"{g.replace('.', '/')}/{a}/{v}"
        if v.endswith("-SNAPSHOT"):
            # Find snapshot POM with matching timestamp.
            if ts is None:
                raise ValueError(f"A timestamp is required to locate the POM of {g}:{a}:{v}")
            dt_requested = MavenComponent._ts2dt(ts)
            pom_prefix = f"{a}-{v[:-9]}" # artifactId-version minus -SNAPSHOT
            # Escape the prefix: the dots in a version must match literally.
            pom_pattern = re.compile(re.escape(pom_prefix) + r"-(\d{8}\.\d{6})-\d+\.pom")
            for repo in repos:
                d = pathlib.Path(f"{storage}/{repo}/{gav_path}")
                for f in d.glob(f"{pom_prefix}-*.pom"):
                    m = pom_pattern.match(f.name)
                    if not m:
                        continue # ignore weirdly named POM
                    dt_actual = MavenComponent._ts2dt(m.group(1))
                    if MavenComponent._within_allowance(dt_requested, dt_actual, ts_allowance):
                        # Timestamp is within tolerance! Found it!
                        return MavenPOM(str(f))
        else:
            # Find release POM.
            suffix = f"{gav_path}/{a}-{v}.pom"
            for repo in repos:
                path = f"{storage}/{repo}/{suffix}"
                if os.path.exists(path):
                    return MavenPOM(path)
        return None

status.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/usr/bin/env python
2+
#
3+
# This is free and unencumbered software released into the public domain.
4+
# See the UNLICENSE file for details.
5+
#
6+
# ------------------------------------------------------------------------
7+
# status.py
8+
# ------------------------------------------------------------------------
9+
# Generates a JSON document with information about the components
10+
# and repositories of the SciJava component collection.
11+
12+
import json, re, sys
13+
14+
import maven
15+
16+
def resource_path(source):
    """
    Drop the leading len(maven.storage) characters from a source path.

    Returns None when source is None.
    """
    if source is None:
        return None
    prefix_length = len(maven.storage)
    return source[prefix_length:]
18+
19+
def status(c):
    """
    Gathers information from Maven about the given groupId:artifactId.

    The returned dict always has "groupId" and "artifactId"; the "release",
    "snapshot" and "pom" sections are added only when the corresponding
    attribute of c is truthy.
    """
    def metadata_summary(md):
        # Release and snapshot metadata are reported with the same fields.
        return {
            "source": resource_path(md.source),
            "groupId": md.groupId,
            "artifactId": md.artifactId,
            "lastUpdated": md.lastUpdated,
            "latest": md.latest,
            "lastVersion": md.lastVersion,
            "release": md.release,
        }

    record = {"groupId": c.groupId, "artifactId": c.artifactId}

    if c.release:
        record["release"] = metadata_summary(c.release)

    if c.snapshot:
        record["snapshot"] = metadata_summary(c.snapshot)

    if c.pom:
        pom = c.pom
        record["pom"] = {
            "source": resource_path(pom.source),
            "groupId": pom.groupId,
            "artifactId": pom.artifactId,
            "version": pom.version,
            "scm": pom.scmURL,
            "issues": pom.issuesURL,
            "ci": pom.ciURL,
            "developers": pom.developers,
        }

    return record
59+
60+
def matches(g, a, patterns):
    """
    Tell whether groupId:artifactId is selected by the given patterns.

    An empty pattern collection selects everything. Otherwise the string
    "g:a" must match at least one pattern, using re.match.
    """
    if not patterns:
        return True
    coordinate = f"{g}:{a}"
    for pat in patterns:
        if re.match(pat, coordinate):
            return True
    return False
62+
63+
def process(patterns=None):
    """
    Print a JSON list of status records for the SciJava component collection.

    The org.scijava:pom-scijava component is looked up first; then every
    dependency listed in its POM's dependencyManagement section is looked
    up as well. Only components selected by matches() are reported.

    :param patterns: Regex strings matched against "groupId:artifactId".
                     None or an empty list reports every component.
    """
    # A None default avoids sharing one mutable list object across calls.
    patterns = patterns or []

    g = "org.scijava"
    a = "pom-scijava"
    psj = maven.MavenComponent(g, a)

    records = []

    if matches(g, a, patterns):
        records.append(status(psj))

    # Without a POM for the BOM there are no managed dependencies to walk.
    deps = psj.pom.elements("dependencyManagement/dependencies/dependency") if psj.pom else []

    for dep in deps:
        g = dep.findtext("groupId")
        a = dep.findtext("artifactId")

        # Skip entries lacking either coordinate instead of crashing.
        if not g or not a:
            continue

        if matches(g, a, patterns):
            c = maven.MavenComponent(g, a)
            records.append(status(c))

    print(json.dumps(records, sort_keys=True, indent=4))
82+
83+
if __name__ == '__main__':
    # Every command-line argument is a groupId:artifactId pattern.
    cli_patterns = sys.argv[1:]
    process(cli_patterns)

0 commit comments

Comments
 (0)