8000 Add first version of javadoc wrangler · scijava/javadoc-wrangler@96c8a0e · GitHub
[go: up one dir, main page]

Skip to content

Commit 96c8a0e

Browse files
committed
Add first version of javadoc wrangler
1 parent 68b736a commit 96c8a0e

File tree

4 files changed

+387
-0
lines changed

4 files changed

+387
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/__pycache__/
2+
/target/

UNLICENSE

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
This is free and unencumbered software released into the public domain.
2+
3+
Anyone is free to copy, modify, publish, use, compile, sell, or
4+
distribute this software, either in source code form or as a compiled
5+
binary, for any purpose, commercial or non-commercial, and by any
6+
means.
7+
8+
In jurisdictions that recognize copyright laws, the author or authors
9+
of this software dedicate any and all copyright interest in the
10+
software to the public domain. We make this dedication for the benefit
11+
of the public at large and to the detriment of our heirs and
12+
successors. We intend this dedication to be an overt act of
13+
relinquishment in perpetuity of all present and future rights to this
14+
software under copyright law.
15+
16+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19+
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20+
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22+
OTHER DEALINGS IN THE SOFTWARE.
23+
24+
For more information, please refer to <http://unlicense.org/>

settings.xml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<?xml version="1.0"?>
<!-- Maven settings handed to every mvn invocation made by wrangle.py (-s settings.xml). -->
<settings>
  <mirrors>
    <!-- mirrorOf "*": resolve artifacts from all repositories via this mirror. -->
    <mirror>
      <id>scijava-mirror</id>
      <name>SciJava mirror</name>
      <url>https://maven.scijava.org/content/groups/public</url>
      <mirrorOf>*</mirrorOf>
    </mirror>
  </mirrors>
</settings>

wrangle.py

Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
#!/usr/bin/env python
2+
3+
#
4+
# wrangle.py - Unpack javadoc JARs into a coherent multi-project structure.
5+
#
6+
7+
import logging, os, re, subprocess, sys
8+
from pathlib import Path
9+
from typing import Sequence
10+
from urllib import request
11+
from xml.etree import ElementTree as ET
12+
from zipfile import ZipFile
13+
14+
# -- Constants --
15+
16+
# Every working directory is anchored at the folder containing this script.
scriptDir = Path(__file__).parent
baseDir = scriptDir.joinpath("target")
siteBase = baseDir.joinpath("site")  # unpacked javadoc trees and BOM indices
workBase = baseDir.joinpath("work")  # per-BOM scratch files and markers
jarDir = baseDir.joinpath("jars")    # downloaded -javadoc JARs

# File names of javadoc's generated summary/index pages. HTML files with one
# of these names are skipped when redirect rules are generated.
toplevel_html_docs = {
    # General pages.
    "about.html",
    "constant-values.html",
    "deprecated-list.html",
    "help-doc.html",
    "index-all.html",
    "index.html",
    "serialized-form.html",
    # Class and package listings.
    "allclasses-frame.html",
    "allclasses-index.html",
    "allclasses-noframe.html",
    "allclasses.html",
    "allpackages-index.html",
    # Overview pages.
    "overview-frame.html",
    "overview-summary.html",
    "overview-tree.html",
    # Per-package pages.
    "package-frame.html",
    "package-summary.html",
    "package-tree.html",
    "package-use.html",
}
43+
44+
# -- Logging --
45+
46+
log = logging.getLogger(__name__)


def die(message, code=1):
    """
    Report a fatal error through the module logger and terminate.

    :param message: Text logged at ERROR level.
    :param code: Exit status carried by the raised SystemExit (default 1).
    :raises SystemExit: Always.
    """
    log.error(message)
    raise SystemExit(code)
51+
52+
# -- Classes --
53+
54+
class GAV:
    """
    A Maven coordinate: groupId (g), artifactId (a) and version (v).
    """

    def __init__(self, g, a, v):
        self.g, self.a, self.v = g, a, v

    def __str__(self):
        # Rendered as "g:a:v", the form embedded in mvn artifact arguments.
        return "{}:{}:{}".format(self.g, self.a, self.v)

    @property
    def valid(self):
        """
        True only when all three parts are truthy (neither None nor empty).
        """
        return all((self.g, self.a, self.v))
66+
67+
class XML:
    """
    Small wrapper around an ElementTree whose namespace prefixes have been
    stripped, so that plain tag names can be used in lookup paths.
    """

    def __init__(self, source):
        """
        Parse the given source and strip namespaces from the result.

        :param source: Either a string of XML markup (it must begin with
                       "<"), or anything ElementTree.parse accepts.
        """
        is_markup = isinstance(source, str) and source.startswith('<')
        if is_markup:
            # Parse XML from string.
            # https://stackoverflow.com/a/18281386/1207769
            root = ET.fromstring(source)
            self.tree = ET.ElementTree(root)
        else:
            # Parse XML from file.
            self.tree = ET.parse(source)
        XML._strip_ns(self.tree.getroot())

    def elements(self, path):
        """
        Return the list of all elements matching the given path.
        """
        return self.tree.findall(path)

    def value(self, path):
        """
        Return the text of the single element matching the given path,
        or None when nothing matches. At most one match is expected.
        """
        matches = self.elements(path)
        assert len(matches) <= 1
        if not matches:
            return None
        return matches[0].text

    @staticmethod
    def _strip_ns(el):
        """
        Remove namespace prefixes from elements and attributes.
        Credit: https://stackoverflow.com/a/32552776/1207769
        """
        # iter() yields el itself followed by every descendant.
        for node in el.iter():
            if node.tag.startswith("{"):
                node.tag = node.tag.split("}", 1)[-1]
            # Snapshot the qualified names first; the dict is edited below.
            qualified = [name for name in node.attrib if name.startswith("{")]
            for name in qualified:
                node.attrib[name.split("}", 1)[-1]] = node.attrib.pop(name)
102+
103+
# -- Functions --
104+
105+
def mkdirs(path):
    """
    Create the directory at the given Path along with any missing parents.
    An already existing directory is left untouched.
    """
    path.mkdir(exist_ok=True, parents=True)
107+
108+
def readfile(path):
    """
    Read a text file into a list of lines, line endings included.

    :param path: The file to read.
    :return: The list of lines, or None if the file could not be read.
             Failures are logged, never raised.
    """
    try:
        with open(path) as stream:
            contents = stream.readlines()
    except Exception as exc:
        log.warning(f"Failed to read file {path}")
        log.debug(exc)
        return None
    return contents
115+
116+
def writefile(path, lines=None, append=False):
    """
    Write lines to a text file.

    :param path: The file to write.
    :param lines: Iterable of strings written verbatim (no newlines are
                  added). When None, the file is only created (or
                  truncated, unless appending).
    :param append: Add to the end of the file instead of overwriting it.
    """
    mode = "a" if append else "w"
    with open(path, mode) as stream:
        if lines is None:
            return
        stream.writelines(lines)
120+
121+
def execute(cmd: Sequence[str], die_on_error=True):
    """
    Run an external command and capture its standard output.

    :param cmd: The program to run followed by its arguments.
    :param die_on_error: When the command exits non-zero: if True,
                         terminate via die(); if False, raise instead.
    :return: The command's decoded stdout as a list of lines, with
             line endings kept.
    :raises RuntimeError: On non-zero exit when die_on_error is False.
    """
    completed = subprocess.run(cmd, capture_output=True)
    status = completed.returncode
    if status != 0:
        error_message = f"Command {cmd[0]} failed with exit code {status}"
        if not die_on_error:
            raise RuntimeError(error_message)
        die(error_message)
    stdout_text = completed.stdout.decode()
    return stdout_text.splitlines(keepends=True)
130+
131+
def mvn(goal: str, pom=None, die_on_error=True, **kwargs):
    """
    Run Maven in batch mode using the settings.xml next to this script.

    :param goal: The single Maven goal to run, e.g. "dependency:copy".
    :param pom: Optional POM file to operate on (passed via -f).
    :param die_on_error: Forwarded to execute(): terminate on failure when
                         True, raise RuntimeError when False.
    :param kwargs: Each key=value pair becomes a -Dkey=value argument.
    :return: Maven's standard output as a list of lines.
    :raises RuntimeError: If Maven fails and die_on_error is False.
    """
    # Anchor settings.xml at the script's folder, like every other path in
    # this file, so the script also works when launched from elsewhere.
    settings = scriptDir / "settings.xml"
    cmd = ["mvn", "-B", "-s", str(settings)]
    if pom is not None:
        cmd.extend(["-f", str(pom)])
    cmd.append(goal)
    cmd.extend(f"-D{k}={v}" for k, v in kwargs.items())
    return execute(cmd, die_on_error=die_on_error)
139+
140+
def squash(path: Path):
    """
    Rewrite a text file in place with its lines sorted and de-duplicated.

    Terminates via die() when the file does not exist. Any other failure
    while reading or writing is logged rather than raised.
    """
    if not Path(path).exists():
        die(f"No such file: {path}")
    try:
        unique_lines = set(readfile(path))
        writefile(path, sorted(unique_lines))
    except Exception as exc:
        log.error(f"Exception squashing {path}")
        log.debug(exc)
148+
149+
def unpack_javadoc(c: GAV, jarFile: Path, javadocDir: Path):
    """
    Unpack a component's javadoc JAR and rewrite its cross-project links.

    Does nothing if javadocDir already exists. Otherwise the JAR is
    extracted there, the component's POM is copied alongside it, and
    javadoc.scijava.org / javadoc.imagej.net links inside the HTML files
    are replaced by site-relative links built from the parent POM's
    coordinates.

    :param c: Coordinates of the component being unpacked.
    :param jarFile: The component's javadoc JAR.
    :param javadocDir: Destination folder for the unpacked javadoc.
    """
    if javadocDir.exists():
        log.info(f"Skipping already unpacked {c}")
        return

    log.info(f"Unpacking javadoc JAR for {c}")
    mkdirs(javadocDir)
    with ZipFile(jarFile) as z:
        z.extractall(javadocDir)

    # Grab this component's associated POM.
    log.info(f"Copying POM for {c}")
    mvn("dependency:copy", artifact=f"{c}:pom", outputDirectory=javadocDir)

    # Replace old javadoc.scijava.org links with new ones:
    # javadoc.scijava.org/*/ -> javadoc.scijava.org/{parent.g}/{parent.a}/{parent.v}/
    pom = javadocDir / f"{c.a}-{c.v}.pom"
    xml = XML(pom)
    parent = GAV(xml.value("parent/groupId"),
                 xml.value("parent/artifactId"),
                 xml.value("parent/version"))

    if not parent.valid:
        log.warning(f"Could not glean parent POM for artifact {c}; skipping link replacement")
        return

    log.info(f"Replacing links for {c} javadoc")
    # Compiled once for the whole tree; the dots are escaped so that they
    # match literal dots in the host name only.
    oldLink = re.compile(r"https?://javadoc\.(scijava\.org|imagej\.net)/[^/]*/")
    newLink = f"/{parent.g}/{parent.a}/{parent.v}/"
    for f in javadocDir.rglob("*"):
        if f.suffix != '.html' or not f.is_file():
            continue
        try:
            lines = readfile(f)
            if lines is None:
                # readfile has already logged why the file is unreadable.
                continue
            # A callable replacement inserts newLink verbatim, with no
            # backslash or group-reference processing.
            replaced = [oldLink.sub(lambda m: newLink, line) for line in lines]
            # Leave files without any matching link untouched on disk.
            if replaced != lines:
                writefile(f, replaced)
        except Exception as e:
            log.error(f"Exception replacing links for {f}")
            log.debug(e)
186+
187+
188+
def process_component(c: GAV, bom: GAV, bomDir: Path):
    """
    Fetch and unpack one component's javadoc, then fold it into its BOM.

    The component's javadoc JAR is obtained via Maven (a ".missing" marker
    file remembers components that have none), unpacked into the site
    tree, its package-list/element-list entries are appended to the BOM's
    aggregated indices, and one redirect rule per class/package page is
    appended to the BOM folder's .htaccess.

    :param c: Coordinates of the component to process.
    :param bom: Coordinates of the BOM that manages the component.
    :param bomDir: The BOM's folder in the site tree.
    """
    # Obtain the javadoc classifier JAR.
    jarFile = jarDir / f"{c.a}-{c.v}-javadoc.jar"
    if not jarFile.exists():
        missingFile = jarFile.with_suffix(".missing")
        if missingFile.exists():
            log.warning(f"No javadoc archive for {c} (cached)")
            return

        log.info(f"Downloading/copying javadoc archive: {jarFile.name}")
        mkdirs(jarDir)
        try:
            mvn("dependency:copy", die_on_error=False,
                artifact=f"{c}:jar:javadoc",
                outputDirectory=jarDir)
        except RuntimeError as e:
            log.warning(f"No javadoc archive for {c}")
            log.debug(e)
            # Remember the failure so later runs skip the download attempt.
            writefile(missingFile)
            return

    # Unpack javadoc JAR into dedicated folder.
    javadocDir = siteBase / c.g / c.a / c.v
    unpack_javadoc(c, jarFile, javadocDir)

    # Append this artifact's indices to the BOM's aggregated indices.
    log.info(f"Appending {c} package lists to {bom}")
    for packageIndexName in ("package-list", "element-list"):
        componentPackageIndex = javadocDir / packageIndexName
        if componentPackageIndex.exists():
            bomPackageIndex = bomDir / packageIndexName
            try:
                writefile(bomPackageIndex, readfile(componentPackageIndex), append=True)
            except Exception as e:
                log.error(f"Exception appending {packageIndexName} for {c}")
                log.debug(e)

    # Append artifact's class links to BOM folder's .htaccess redirects.
    log.info(f"Appending {c} htaccess rules to {bom}")
    redirects = []
    for f in javadocDir.rglob("*"):
        # Process only Java class and package HTML documents, not toplevel ones.
        if f.suffix != '.html' or f.name in toplevel_html_docs or not f.is_file():
            continue
        # relative_to() yields a path without a leading separator, so the
        # URLs built below contain single slashes only; as_posix() keeps
        # forward slashes on every platform.
        relativePath = f.relative_to(javadocDir).as_posix()
        bomPath = f"/{bom.g}/{bom.a}/{bom.v}/{relativePath}"
        componentPath = f"/{c.g}/{c.a}/{c.v}/{relativePath}"
        redirects.append(f"RedirectMatch permanent \"^{bomPath}$\" {componentPath}\n")
    # Write all rules in one go rather than reopening the file per page.
    if redirects:
        writefile(bomDir / ".htaccess", redirects, append=True)
236+
237+
def process_bom(bom: GAV):
    """
    Wrangle the javadoc of every component managed by the given BOM.

    Downloads the BOM, interpolates it with help:effective-pom, extracts
    its dependencyManagement section, processes each managed dependency,
    and finally sorts and de-duplicates the aggregated index files. A
    "complete" marker in the BOM's work folder makes later runs skip it.

    :param bom: Coordinates of the BOM to process.
    """
    workDir = workBase / bom.g / bom.a / bom.v

    completeMarker = workDir / "complete"
    if completeMarker.exists():
        # Already processed this version of the BOM.
        log.info(f"Skipping already processed BOM {bom}")
        return

    mkdirs(workDir)

    log.info(f"Processing BOM {bom}")

    bomDir = siteBase / bom.g / bom.a / bom.v
    mkdirs(bomDir)

    # Download the BOM file.
    bomFile = workDir / f"{bom.a}-{bom.v}.pom"
    if not bomFile.exists():
        log.info(f"Downloading BOM {bom}")
        mvn("dependency:copy",
            artifact=f"{bom}:pom",
            outputDirectory=workDir)

    # Interpolate the BOM and extract the list of managed dependencies as XML.
    bomComponentsFile = workDir / "components.xml"
    if not bomComponentsFile.exists():
        output = mvn("help:effective-pom", bomFile)
        start = end = None
        for i, line in enumerate(output):
            # Compare the bare tag so that the scan does not depend on how
            # many spaces Maven indents the element with.
            tag = line.strip()
            if start is None:
                if tag == '<dependencyManagement>':
                    start = i
            elif tag == '</dependencyManagement>':
                end = i
                break
        if start is None or end is None:
            die(f"Could not interpolate the BOM -- mvn output follows:\n{''.join(output)}")
        writefile(bomComponentsFile, output[start:end+1])
    bomComponents = XML(bomComponentsFile)

    for dep in bomComponents.elements('dependencies/dependency'):
        # findtext() returns None for a missing child element instead of
        # raising, so incomplete entries reach the validity check below.
        c = GAV(dep.findtext('groupId'),
                dep.findtext('artifactId'),
                dep.findtext('version'))
        if c.valid:
            process_component(c, bom, bomDir)
        else:
            log.warning(f"Invalid component: {c}")

    # Sort package-list, element-list, and htaccess files, squashing duplicates.
    # An aggregated file exists only if some component contributed to it,
    # and squash() terminates the program on a missing file, so check first.
    for aggregatedName in ("package-list", "element-list", ".htaccess"):
        aggregatedFile = bomDir / aggregatedName
        if aggregatedFile.exists():
            squash(aggregatedFile)
        else:
            log.debug(f"No aggregated {aggregatedName} for {bom}; nothing to squash")

    log.info(f"Done processing BOM {bom}")
    writefile(completeMarker)
291+
292+
# TODO: Check that javadoc tool actually works pointed at a BOM prefix.
293+
# TODO: Close https://github.com/scijava/pom-scijava/issues/130 when done.
294+
295+
# QUESTIONS:
296+
# - What should the root of javadoc.scijava.org serve now?
297+
# An index of available components? E.g. net.imagej:imagej, sc.fiji:fiji
298+
# - Should we aggregate the JSON indices (*-search-index.zip for all components)?
299+
# What else would we need to do to make the search work for a BOM's javadoc index?
300+
# - Are there other documents we should aggregate like the various toplevel HTML files?
301+
302+
# 3. Loop over the <dependency> elements:
303+
# - Obtain the -javadoc JAR for that dependency (efficiently!).
304+
# - If already copied/linked, do nothing.
305+
# - Copy/link from local file system if available.
306+
# Right now, javadoc.scijava.org resides on devonrex, but if
307+
# moved to balinese, it could fetch existing cached artifacts
308+
# from the local file system, which would be even faster.
309+
# - `mvn dependency:get` if not available locally.
310+
# - Fail gracefully and continue if it doesn't exist.
311+
# - Extract the JAR to its special folder.
312+
# From here on out, the logic in wrangle.sh should be correct.
313+
# Just need to translate it into Python.
314+
315+
# Authoritative list of published pom-scijava versions:
316+
# https://repo1.maven.org/maven2/org/scijava/pom-scijava/maven-metadata.xml
317+
318+
# Ultimately, the goal is to wrangle every published version of
319+
# pom-scijava, so the javadoc is as complete as possible.
320+
321+
# -- Main --
322+
323+
def main(args=None):
    """
    Process each BOM named on the command line.

    :param args: BOM specifiers. Each is either a full
                 "groupId:artifactId:version" coordinate or a bare version
                 of org.scijava:pom-scijava. When empty or None, the latest
                 pom-scijava release listed on Maven Central is used.
    """
    logging.basicConfig(level=logging.DEBUG, format="[%(levelname)s] %(message)s")

    # Work on a copy so that the caller's list is never modified.
    args = list(args) if args else []
    if not args:
        # Use the latest release of pom-scijava if no args given.
        try:
            url = "https://repo1.maven.org/maven2/org/scijava/pom-scijava/maven-metadata.xml"
            metadata = XML(request.urlopen(url).read().decode())
            version = metadata.value('versioning/release')
        except Exception as e:
            log.debug(e)
            version = None
        if not version:
            die("Cannot glean latest version of org.scijava:pom-scijava.")
        args.append(version)

    for arg in args:
        if ":" in arg:
            # Split the argument on colons (not the colon on the argument).
            gav = arg.split(":")
            if len(gav) != 3:
                die(f"Invalid BOM coordinate (expected groupId:artifactId:version): {arg}")
            bom = GAV(*gav)
        else:
            bom = GAV("org.scijava", "pom-scijava", arg)
        process_bom(bom)


if __name__ == '__main__':
    main(sys.argv[1:])

0 commit comments

Comments
 (0)
0