#!/usr/bin/env python

#
# wrangle.py - Unpack javadoc JARs into a coherent multi-project structure.
#

import logging, re, subprocess, sys
from pathlib import Path
from typing import Sequence
from urllib import request
from xml.etree import ElementTree as ET
from zipfile import ZipFile

# -- Constants --

scriptDir = Path(__file__).parent
baseDir = scriptDir / "target"
siteBase = baseDir / "site"
workBase = baseDir / "work"
jarDir = baseDir / "jars"
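
# Layout produced under target/ by the functions below:
#   jars/ - downloaded *-javadoc.jar archives, plus *.missing marker files
#   work/ - per-BOM scratch space: the BOM itself, components.xml, "complete" marker
#   site/ - unpacked javadoc trees, laid out as <groupId>/<artifactId>/<version>/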

toplevel_html_docs = {
    "about.html",
    "allclasses-frame.html",
    "allclasses-index.html",
    "allclasses-noframe.html",
    "allclasses.html",
    "allpackages-index.html",
    "constant-values.html",
    "deprecated-list.html",
    "help-doc.html",
    "index-all.html",
    "index.html",
    "overview-frame.html",
    "overview-summary.html",
    "overview-tree.html",
    "package-frame.html",
    "package-summary.html",
    "package-tree.html",
    "package-use.html",
    "serialized-form.html"
}
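
# The filenames above are javadoc's top-level pages (the exact set has varied across
# javadoc versions); process_component() skips them when emitting per-class .htaccess
# redirects, since only class and package pages should be redirected.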

# -- Logging --

log = logging.getLogger(__name__)

def die(message, code=1):
    log.error(message)
    sys.exit(code)

# -- Classes --
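
# GAV models a Maven groupId:artifactId:version coordinate; str() renders it in Maven's
# G:A:V notation (e.g. "org.scijava:pom-scijava:31.1.0", version shown only for
# illustration), and .valid is False when any of the three parts is missing.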

class GAV:
    def __init__(self, g, a, v):
        self.g = g
        self.a = a
        self.v = v

    def __str__(self):
        return f"{self.g}:{self.a}:{self.v}"

    @property
    def valid(self):
        return bool(self.g and self.a and self.v)

class XML:

    def __init__(self, source):
        if isinstance(source, str) and source.startswith('<'):
            # Parse XML from string.
            # https://stackoverflow.com/a/18281386/1207769
            self.tree = ET.ElementTree(ET.fromstring(source))
        else:
            # Parse XML from file.
            self.tree = ET.parse(source)
        XML._strip_ns(self.tree.getroot())

    def elements(self, path):
        return self.tree.findall(path)

    def value(self, path):
        el = self.elements(path)
        assert len(el) <= 1
        return None if len(el) == 0 else el[0].text

    @staticmethod
    def _strip_ns(el):
        """
        Remove namespace prefixes from elements and attributes.
        Credit: https://stackoverflow.com/a/32552776/1207769
        """
        if el.tag.startswith("{"):
            el.tag = el.tag[el.tag.find("}")+1:]
        for k in list(el.attrib.keys()):
            if k.startswith("{"):
                k2 = k[k.find("}")+1:]
                el.attrib[k2] = el.attrib[k]
                del el.attrib[k]
        for child in el:
            XML._strip_ns(child)
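
# A minimal usage sketch: XML("<project><version>1.0</version></project>").value("version")
# returns "1.0". value() expects at most one match and returns None when the path is absent;
# elements() returns every match of an ElementTree path relative to the root element.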

# -- Functions --

def mkdirs(path):
    path.mkdir(parents=True, exist_ok=True)

def readfile(path):
    try:
        with open(path) as f:
            return f.readlines()
    except Exception as e:
        log.warning(f"Failed to read file {path}")
        log.debug(e)

def writefile(path, lines=None, append=False):
    with open(path, "a" if append else "w") as f:
        if lines is not None:
            f.writelines(lines)
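
# Note: readfile() returns None when the file cannot be read, and writefile() called with
# no lines simply creates or truncates the file, which is how the *.missing and "complete"
# marker files below are produced.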

def execute(cmd: Sequence[str], die_on_error=True):
    result = subprocess.run(cmd, capture_output=True)
    if result.returncode != 0:
        error_message = f"Command {cmd[0]} failed with exit code {result.returncode}"
        if die_on_error:
            die(error_message)
        else:
            raise RuntimeError(error_message)
    return result.stdout.decode().splitlines(keepends=True)

def mvn(goal: str, pom=None, die_on_error=True, **kwargs):
    cmd = ["mvn", "-B", "-s", "settings.xml"]
    if pom is not None:
        cmd.extend(["-f", str(pom)])
    cmd.append(goal)
    for k, v in kwargs.items():
        cmd.append(f"-D{k}={v}")
    return execute(cmd, die_on_error=die_on_error)
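
# For example, mvn("dependency:copy", artifact="g:a:v:pom", outputDirectory=workDir) runs:
#   mvn -B -s settings.xml dependency:copy -Dartifact=g:a:v:pom -DoutputDirectory=<workDir>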

def squash(path: Path):
    if not path.exists():
        die(f"No such file: {path}")
    try:
        writefile(path, sorted(set(readfile(path))))
    except Exception as e:
        log.error(f"Exception squashing {path}")
        log.debug(e)
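
# squash() rewrites a file in place with duplicate lines removed and the remainder sorted;
# process_bom() uses it to collapse the package-list, element-list, and .htaccess fragments
# that process_component() appends one component at a time.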

def unpack_javadoc(c: GAV, jarFile: Path, javadocDir: Path):
    if javadocDir.exists():
        log.info(f"Skipping already unpacked {c}")
        return

    log.info(f"Unpacking javadoc JAR for {c}")
    mkdirs(javadocDir)
    with ZipFile(jarFile) as z:
        z.extractall(javadocDir)

    # Grab this component's associated POM.
    log.info(f"Copying POM for {c}")
    mvn("dependency:copy", artifact=f"{c}:pom", outputDirectory=javadocDir)

    # Replace old javadoc.scijava.org links with new ones:
    # javadoc.scijava.org/*/ -> javadoc.scijava.org/{parent.g}/{parent.a}/{parent.v}/
    pom = javadocDir / f"{c.a}-{c.v}.pom"
    xml = XML(pom)
    parent = GAV(xml.value("parent/groupId"),
                 xml.value("parent/artifactId"),
                 xml.value("parent/version"))

    if not parent.valid:
        log.warning(f"Could not glean parent POM for artifact {c}; skipping link replacement")
        return

    log.info(f"Replacing links for {c} javadoc")
    oldLink = r"https?://javadoc\.(scijava\.org|imagej\.net)/[^/]*/"
    newLink = f"/{parent.g}/{parent.a}/{parent.v}/"
    for f in javadocDir.rglob("*"):
        if f.suffix != '.html' or not f.is_file():
            continue
        try:
            writefile(f, [re.sub(oldLink, newLink, line) for line in readfile(f)])
        except Exception as e:
            log.error(f"Exception replacing links for {f}")
            log.debug(e)
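
# The rewrite above turns absolute links to the old per-project sites into root-relative
# links under the parent's prefix; e.g. (illustrative URL) a reference to
# https://javadoc.scijava.org/ImageJ/net/imagej/ImageJ.html would become
# /org.scijava/pom-scijava/<version>/net/imagej/ImageJ.html for a component whose
# parent POM is org.scijava:pom-scijava.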


def process_component(c: GAV, bom: GAV, bomDir: Path):
    # Obtain the javadoc classifier JAR.
    jarFile = jarDir / f"{c.a}-{c.v}-javadoc.jar"
    if not jarFile.exists():
        missingFile = jarFile.with_suffix(".missing")
        if missingFile.exists():
            log.warning(f"No javadoc archive for {c} (cached)")
            return

        log.info(f"Downloading/copying javadoc archive: {jarFile.name}")
        mkdirs(jarDir)
        try:
            mvn("dependency:copy", die_on_error=False,
                artifact=f"{c}:jar:javadoc",
                outputDirectory=jarDir)
        except RuntimeError as e:
            log.warning(f"No javadoc archive for {c}")
            log.debug(e)
            writefile(missingFile)
            return

    # Unpack javadoc JAR into dedicated folder.
    javadocDir = siteBase / c.g / c.a / c.v
    unpack_javadoc(c, jarFile, javadocDir)

    # Append this artifact's indices to the BOM's aggregated indices.
    log.info(f"Appending {c} package lists to {bom}")
    for packageIndexName in ("package-list", "element-list"):
        componentPackageIndex = javadocDir / packageIndexName
        if componentPackageIndex.exists():
            bomPackageIndex = bomDir / packageIndexName
            try:
                writefile(bomPackageIndex, readfile(componentPackageIndex), append=True)
            except Exception as e:
                log.error(f"Exception appending {packageIndexName} for {c}")
                log.debug(e)

    # Append artifact's class links to BOM folder's .htaccess redirects.
    log.info(f"Appending {c} htaccess rules to {bom}")
    for f in javadocDir.rglob("*"):
        # Process only Java class and package HTML documents, not toplevel ones.
        if f.suffix != '.html' or f.name in toplevel_html_docs or not f.is_file():
            continue
        relativePath = f.relative_to(javadocDir)
        bomPath = f"/{bom.g}/{bom.a}/{bom.v}/{relativePath}"
        componentPath = f"/{c.g}/{c.a}/{c.v}/{relativePath}"
        redirect = f"RedirectMatch permanent \"^{bomPath}$\" {componentPath}\n"
        writefile(bomDir / ".htaccess", [redirect], append=True)
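
# Each emitted rule is schematic along the lines of:
#   RedirectMatch permanent "^/<bom.g>/<bom.a>/<bom.v>/org/example/Foo.html$" /<c.g>/<c.a>/<c.v>/org/example/Foo.html
# so that a class URL under the BOM's prefix redirects to the component that actually
# documents that class.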

def process_bom(bom: GAV):
    workDir = workBase / bom.g / bom.a / bom.v

    completeMarker = workDir / "complete"
    if completeMarker.exists():
        # Already processed this version of the BOM.
        log.info(f"Skipping already processed BOM {bom}")
        return

    mkdirs(workDir)

    log.info(f"Processing BOM {bom}")

    bomDir = siteBase / bom.g / bom.a / bom.v
    mkdirs(bomDir)

    # Download the BOM file.
    bomFile = workDir / f"{bom.a}-{bom.v}.pom"
    if not bomFile.exists():
        log.info(f"Downloading BOM {bom}")
        mvn("dependency:copy",
            artifact=f"{bom}:pom",
            outputDirectory=workDir)

    # Interpolate the BOM and extract the list of managed dependencies as XML.
    bomComponentsFile = workDir / "components.xml"
    if not bomComponentsFile.exists():
        output = mvn("help:effective-pom", bomFile)
        start = end = None
        for i, line in enumerate(output):
            if start is not None and end is not None: break
            # Match the tags regardless of how deeply the effective POM indents them.
            if line.lstrip().startswith('<dependencyManagement>'): start = i
            elif line.lstrip().startswith('</dependencyManagement>'): end = i
        if start is None or end is None:
            die(f"Could not interpolate the BOM -- mvn output follows:\n{''.join(output)}")
        writefile(bomComponentsFile, output[start:end+1])
    bomComponents = XML(bomComponentsFile)

    for dep in bomComponents.elements('dependencies/dependency'):
        c = GAV(dep.findtext('groupId'),
                dep.findtext('artifactId'),
                dep.findtext('version'))
        if c.valid:
            process_component(c, bom, bomDir)
        else:
            log.warning(f"Invalid component: {c}")

    # Sort package-list, element-list, and htaccess files, squashing duplicates.
    squash(bomDir / "package-list")
    squash(bomDir / "element-list")
    squash(bomDir / ".htaccess")

    log.info(f"Done processing BOM {bom}")
    writefile(completeMarker)
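
# The "complete" marker makes reruns cheap: a BOM is reprocessed only if its marker is
# missing, unpack_javadoc() skips components that are already unpacked, and *.missing
# files cache known-absent javadoc archives.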

# TODO: Check that javadoc tool actually works pointed at a BOM prefix.
# TODO: Close https://github.com/scijava/pom-scijava/issues/130 when done.

# QUESTIONS:
# - What should the root of javadoc.scijava.org serve now?
#   An index of available components? E.g. net.imagej:imagej, sc.fiji:fiji
# - Should we aggregate the JSON indices (*-search-index.zip for all components)?
#   What else would we need to do to make the search work for a BOM's javadoc index?
# - Are there other documents we should aggregate like the various toplevel HTML files?

# 3. Loop over the <dependency> elements:
#    - Obtain the -javadoc JAR for that dependency (efficiently!).
#    - If already copied/linked, do nothing.
#    - Copy/link from local file system if available.
#      Right now, javadoc.scijava.org resides on devonrex, but if
#      moved to balinese, it could fetch existing cached artifacts
#      from the local file system, which would be even faster.
#    - `mvn dependency:get` if not available locally.
#    - Fail gracefully and continue if it doesn't exist.
#    - Extract the JAR to its special folder.
#      From here on out, the logic in wrangle.sh should be correct.
#      Just need to translate it into Python.

# Authoritative list of published pom-scijava versions:
# https://repo1.maven.org/maven2/org/scijava/pom-scijava/maven-metadata.xml

# Ultimately, the goal is to wrangle every published version of
# pom-scijava, so the javadoc is as complete as possible.

# -- Main --
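
# Usage: wrangle.py [version | groupId:artifactId:version ...]
# With no arguments, the latest published release of org.scijava:pom-scijava is used.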

def main(args=None):
    logging.basicConfig(level=logging.DEBUG, format="[%(levelname)s] %(message)s")

    if args is None:
        args = []
    if len(args) == 0:
        # Use the latest release of pom-scijava if no args given.
        try:
            url = "https://repo1.maven.org/maven2/org/scijava/pom-scijava/maven-metadata.xml"
            metadata = XML(request.urlopen(url).read().decode())
            version = metadata.value('versioning/release')
        except Exception as e:
            log.debug(e)
            version = None
        if not version:
            die("Cannot glean latest version of org.scijava:pom-scijava.")
        args.append(version)

    for arg in args:
        if ":" in arg:
            gav = arg.split(":")
            bom = GAV(*gav)
        else:
            bom = GAV("org.scijava", "pom-scijava", arg)
        process_bom(bom)

if __name__ == '__main__':
    main(sys.argv[1:])