Add index repartition script and job (#73) · python/pymanager@4add401 · GitHub
[go: up one dir, main page]

Skip to content

Commit 4add401

Browse files
authored
Add index repartition script and job (#73)
Fixes #5. This script allows ingesting one or more indexes, sorting them, and writing out the entries into one or more new index files according to a set of rules.
1 parent 89afccf commit 4add401

File tree

6 files changed

+356
-7
lines changed

6 files changed

+356
-7
lines changed

ci/repartition-index.yml

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# Repartitioning runs on Azure Pipelines, because that's where we have SSH
2+
# access to the download server.
3+
4+
name: $(Date:yyyyMMdd).$(Rev:r)
5+
6+
# Do not run automatically
7+
trigger: none
8+
9+
10+
parameters:
11+
- name: Publish
12+
displayName: "Publish"
13+
type: boolean
14+
default: false
15+
- name: TestPublish
16+
displayName: "Run all steps without publishing"
17+
type: boolean
18+
default: false
19+
20+
stages:
21+
- stage: PyManagerIndexPartition
22+
displayName: 'Repartition PyManager Index'
23+
24+
jobs:
25+
- job: Repartition
26+
27+
pool:
28+
vmImage: 'windows-latest'
29+
30+
variables:
31+
- group: PythonOrgPublish
32+
33+
steps:
34+
- checkout: self
35+
36+
- task: NugetToolInstaller@0
37+
displayName: 'Install Nuget'
38+
39+
- powershell: |
40+
nuget install -o host_python -x -noninteractive -prerelease python
41+
Write-Host "##vso[task.prependpath]$(gi host_python\python\tools)"
42+
displayName: 'Install host Python'
43+
workingDirectory: $(Build.BinariesDirectory)
44+
45+
- powershell: |
46+
cd (mkdir -Force index)
47+
python "$(Build.SourcesDirectory)\scripts\repartition-index.py" --windows-default
48+
displayName: 'Repartition index'
49+
workingDirectory: $(Build.BinariesDirectory)
50+
51+
- publish: $(Build.BinariesDirectory)\index
52+
artifact: index
53+
displayName: Publish index artifact
54+
55+
- ${{ if or(eq(parameters.Publish, 'true'), eq(parameters.TestPublish, 'true')) }}:
56+
- ${{ if ne(parameters.TestPublish, 'true') }}:
57+
- task: DownloadSecureFile@1
58+
name: sshkey
59+
inputs:
60+
secureFile: pydotorg-ssh.ppk
61+
displayName: 'Download PuTTY key'
62+
63+
- powershell: |
64+
git clone https://github.com/python/cpython-bin-deps --branch putty --single-branch --depth 1 --progress -v "putty"
65+
"##vso[task.prependpath]$(gi putty)"
66+
workingDirectory: $(Pipeline.Workspace)
67+
displayName: 'Download PuTTY binaries'
68+
69+
- powershell: |
70+
python ci\upload.py
71+
displayName: 'Publish packages'
72+
env:
73+
UPLOAD_URL: $(PyDotOrgUrlPrefix)python/
74+
UPLOAD_DIR: $(Build.BinariesDirectory)\index
75+
UPLOAD_URL_PREFIX: $(PyDotOrgUrlPrefix)
76+
UPLOAD_PATH_PREFIX: $(PyDotOrgUploadPathPrefix)
77+
UPLOAD_HOST: $(PyDotOrgServer)
78+
UPLOAD_HOST_KEY: $(PyDotOrgHostKey)
79+
UPLOAD_USER: $(PyDotOrgUsername)
80+
UPLOADING_INDEX: true
81+
${{ if eq(parameters.TestPublish, 'true') }}:
82+
NO_UPLOAD: 1
83+
${{ else }}:
84+
UPLOAD_KEYFILE: $(sshkey.secureFilePath)

ci/upload.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,15 @@
99
UPLOAD_PATH_PREFIX = os.getenv("UPLOAD_PATH_PREFIX", "/srv/www.python.org/ftp/")
1010
UPLOAD_URL = os.getenv("UPLOAD_URL")
1111
UPLOAD_DIR = os.getenv("UPLOAD_DIR")
12-
# A version will be inserted before the extension later on
13-
MANIFEST_FILE = os.getenv("MANIFEST_FILE")
1412
UPLOAD_HOST = os.getenv("UPLOAD_HOST", "")
1513
UPLOAD_HOST_KEY = os.getenv("UPLOAD_HOST_KEY", "")
1614
UPLOAD_KEYFILE = os.getenv("UPLOAD_KEYFILE", "")
1715
UPLOAD_USER = os.getenv("UPLOAD_USER", "")
1816
NO_UPLOAD = os.getenv("NO_UPLOAD", "no")[:1].lower() in "yt1"
1917

18+
# Set to 'true' when updating index.json, rather than the app
19+
UPLOADING_INDEX = os.getenv("UPLOADING_INDEX", "no")[:1].lower() in "yt1"
20+
2021

2122
if not UPLOAD_URL:
2223
print("##[error]Cannot upload without UPLOAD_URL")
@@ -179,10 +180,15 @@ def purge(url):
179180

180181
UPLOADS = []
181182

182-
for pat in ("python-manager-*.msix", "python-manager-*.msi", "pymanager.appinstaller"):
183-
for f in UPLOAD_DIR.glob(pat):
183+
if UPLOADING_INDEX:
184+
for f in UPLOAD_DIR.glob("*.json"):
184185
u = UPLOAD_URL + f.name
185186
UPLOADS.append((f, u, url2path(u)))
187+
else:
188+
for pat in ("python-manager-*.msix", "python-manager-*.msi", "pymanager.appinstaller"):
189+
for f in UPLOAD_DIR.glob(pat):
190+
u = UPLOAD_URL + f.name
191+
UPLOADS.append((f, u, url2path(u)))
186192

187193
print("Planned uploads:")
188194
for f, u, p in UPLOADS:

scripts/repartition-index.py

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
import json
2+
import re
3+
import sys
4+
5+
from collections import OrderedDict
6+
from pathlib import Path
7+
from urllib.request import Request, urlopen
8+
9+
REPO = Path(__file__).absolute().parent.parent
10+
sys.path.append(str(REPO / "src"))
11+
12+
from manage.urlutils import IndexDownloader
13+
from manage.tagutils import CompanyTag, tag_or_range
14+
from manage.verutils import Version
15+
16+
17+
def usage():
    """Print the command-line help text to stdout and exit with status 1."""
    help_lines = (
        "Usage: repartition-index.py [-i options <FILENAME> ...] [options <OUTPUT> ...]",
        "",
        " --windows-default Implies default output files and configurations.",
        "",
        " -i <FILENAME> One or more files or URLs to read existing entries from.",
        " -i -n/--no-recurse Do not follow 'next' info",
        "If no files are provided, uses the current online index",
        "",
        " <OUTPUT> Filename to write entries into",
        " -d/--allow-dup Include entries written in previous outputs",
        " --only-dup Only include entries written in previous outputs",
        " --pre Include entries marked as prereleases",
        " -t/--tag TAG Include only the specified tags (comma-separated)",
        " -r/--range RANGE Include only the specified range (comma-separated)",
        " --latest-micro Include only the latest x.y.z version",
        "",
        "An output of 'nul' is permitted to drop entries.",
        "Providing the same inputs and outputs is permitted, as all inputs are read",
        "before any outputs are written.",
    )
    for line in help_lines:
        print(line)
    # Always a failure exit: this is only reached on bad or missing arguments.
    sys.exit(1)
38+
39+
40+
class ReadFile:
    """Plan step that loads existing index entries from one file or URL.

    Attributes:
        source: The file name or URL to read; None until a positional
            argument has been supplied.
        recurse: When True (the default), keep consuming every index yielded
            by the downloader; when False, stop after the first one.
    """

    def __init__(self):
        self.recurse = True
        self.source = None

    def add_arg(self, arg):
        """Consume one command-line argument belonging to this '-i' group.

        Returns True when *arg* was the source name (which completes the
        group) and False when it was an option and more arguments may follow.
        Raises ValueError for an unrecognised option.
        """
        if arg in ("-n", "--no-recurse"):
            self.recurse = False
            return False
        if arg.startswith("-"):
            raise ValueError("Unknown argument: " + arg)
        self.source = arg
        return True

    def execute(self, versions, context):
        """Append the 'versions' entries of each downloaded index to *versions*.

        *context* is accepted for interface parity with the other plan steps
        and is not used here.
        """
        # The callback simply hands back whatever arguments it is given, so
        # each iteration yields a pair whose second item is the index data.
        pages = IndexDownloader(self.source, lambda *a: a)
        for _, page in pages:
            versions.extend(page["versions"])
            if self.recurse:
                continue
            # Only the first index was requested.
            break
59+
60+
61+
class SortVersions:
    """Plan step that sorts the collected entries in place."""

    def __init__(self):
        pass

    def add_arg(self, arg):
        """Reject *arg*: this step currently accepts no options."""
        raise ValueError("Unknown argument: " + arg)

    def _number_sortkey(self, k):
        """Return a tuple key for *k* in which digit runs compare numerically.

        The string is split into alternating text and digit runs; each piece
        that parses as an integer is zero-padded to 20 characters so that
        plain string comparison orders it by value.
        """
        def _pad(piece):
            try:
                return f"{int(piece):020}"
            except ValueError:
                # Not a number (including the empty string): keep verbatim.
                return piece

        return tuple(_pad(piece) for piece in re.split(r"(\d+)", k))

    def _sort_key(self, v):
        """Build the composite sort key for one index entry *v*.

        Orders by the 'sort-version' field, then the 'company' field, then
        the number-aware 'id' key.
        """
        from manage.tagutils import _CompanyKey, _DescendingVersion

        version_part = _DescendingVersion(v["sort-version"])
        company_part = _CompanyKey(v["company"])
        id_part = self._number_sortkey(v["id"])
        return (version_part, company_part, id_part)

    def execute(self, versions, context):
        """Sort *versions* in place and report how many entries there are."""
        versions.sort(key=self._sort_key)
        print(f"Processing {len(versions)} entries")
88+
89+
90+
class SplitToFile:
    """Plan step that selects entries for one output file.

    Attributes:
        target: Output file name, or "nul" to discard the selected entries.
        allow_dup: Keep entries even if an earlier output already took them.
        only_dup: Restrict the selection to repeated entries (see execute()).
        pre: Keep entries whose version is a prerelease.
        tag_or_range: Optional filter object built by tag_or_range().
        latest_micro: Keep only the first entry per (id, x.y) pair.
    """

    def __init__(self):
        self.target = None
        self.allow_dup = False
        self.only_dup = False
        self.pre = False
        self.tag_or_range = None
        # True while the next positional argument is the value of -t/-r.
        self._expect_tag_or_range = False
        self.latest_micro = False

    def add_arg(self, arg):
        """Consume one command-line argument belonging to this output group.

        Returns True when *arg* was the output file name (which completes the
        group) and False otherwise. Raises ValueError for an unknown option.
        """
        if not arg.startswith("-"):
            if not self._expect_tag_or_range:
                self.target = arg
                return True
            # This positional is the value for a preceding -t/--tag/-r/--range.
            self.tag_or_range = tag_or_range(arg)
            self._expect_tag_or_range = False
            return False

        if arg in ("-d", "--allow-dup"):
            self.allow_dup = True
        elif arg == "--only-dup":
            self.allow_dup = True
            self.only_dup = True
        elif arg == "--pre":
            self.pre = True
        elif arg in ("-t", "--tag", "-r", "--range"):
            self._expect_tag_or_range = True
        elif arg == "--latest-micro":
            self.latest_micro = True
        else:
            raise ValueError("Unknown argument: " + arg)
        return False

    def execute(self, versions, context):
        """Append the entries of *versions* that pass this step's filters.

        Shared state kept in *context*:
            "written": set of (id, sort-version) keys taken by any output.
            "outputs": mapping of target name to its list of entries.
            "output_order": target names in order of first use.
        """
        written = context.setdefault("written", set())
        outputs = context.setdefault("outputs", {})

        if self.target == "nul":
            # Collect into a list that'll be forgotten
            sink = []
        elif self.target in outputs:
            sink = outputs[self.target]
        else:
            context.setdefault("output_order", []).append(self.target)
            sink = outputs.setdefault(self.target, [])

        seen_this_pass = set()
        seen_minor = set()

        for entry in versions:
            key = entry["id"].casefold(), entry["sort-version"].casefold()
            ver = Version(entry["sort-version"])

            # NOTE(review): this skips the first occurrence of each key seen
            # during this pass, so only keys repeated within `versions` get
            # through. The usage text describes --only-dup as "entries written
            # in previous outputs"; confirm which behaviour is intended.
            if self.only_dup and key not in seen_this_pass:
                seen_this_pass.add(key)
                continue
            if not self.allow_dup and key in written:
                continue
            if not self.pre and ver.is_prerelease:
                continue
            if self.tag_or_range:
                matches = (
                    self.tag_or_range.satisfied_by(CompanyTag(entry["company"], tag))
                    for tag in entry["install-for"]
                )
                if not any(matches):
                    continue
            if self.latest_micro:
                # Only the first entry encountered for each (id, x.y) is kept.
                minor_key = entry["id"].casefold(), ver.to_python_style(2, with_dev=False)
                if minor_key in seen_minor:
                    continue
                seen_minor.add(minor_key)

            written.add(key)
            sink.append(entry)
164+
165+
166+
class WriteFiles:
    """Plan step that serialises every collected output list to a JSON file.

    Attributes:
        indent: Value passed as ``indent`` to json.dump (None by default).
    """

    def __init__(self):
        self.indent = None

    def add_arg(self, arg):
        """Apply a '-w-' option; returns False, or raises ValueError if unknown."""
        if arg == "-w-indent":
            self.indent = 4
        elif arg == "-w-indent1":
            self.indent = 1
        else:
            raise ValueError("Unknown argument: " + arg)
        return False

    def execute(self, versions, context):
        """Write each target in context["output_order"] and print a summary.

        Every file except the last gets a "next" key naming the target that
        follows it in the order. *versions* is not used by this step.
        """
        outputs = context.get("outputs") or {}
        order = context.get("output_order", [])
        last = len(order) - 1
        for pos, target in enumerate(order):
            payload = {"versions": outputs[target]}
            following = order[pos + 1] if pos < last else None
            if following:
                payload["next"] = following
            with open(target, "w", encoding="utf-8") as f:
                json.dump(payload, f, indent=self.indent)
            # Measured after the file is closed so the size is final.
            size = Path(target).stat().st_size
            print(f"Wrote {target} ({len(payload['versions'])} entries, {size} bytes)")
193+
194+
195+
def parse_cli(args):
    """Translate command-line arguments into an ordered execution plan.

    Args:
        args: The argument strings, excluding the program name.

    Returns:
        A list of step objects to run in order: every ReadFile step, the
        SortVersions step, every SplitToFile step, then the WriteFiles step.

    Any argument error (unknown option, or a '-i'/output group that never
    received its filename) prints the message followed by the usage text,
    which exits the process.
    """
    plan_read = []
    plan_split = []
    sort = SortVersions()
    write = WriteFiles()
    # The read/split group still collecting options, or None once its
    # filename has been seen.
    action = None

    # One try around the whole scan so that errors from the sort and write
    # options are reported the same way as errors from read/split options.
    try:
        for a in args:
            if a == "--windows-default":
                print("Using equivalent of: --pre --latest-micro -r >=3.11.0 index-windows.json")
                print(" --pre -r >=3.11.0 index-windows-recent.json")
                print(" index-windows-legacy.json")
                plan_split = [SplitToFile(), SplitToFile(), SplitToFile()]
                plan_split[0].target = "index-windows.json"
                plan_split[1].target = "index-windows-recent.json"
                plan_split[2].target = "index-windows-legacy.json"
                plan_split[0].pre = plan_split[1].pre = plan_split[2].pre = True
                plan_split[0].latest_micro = True
                plan_split[0].tag_or_range = tag_or_range(">=3.11.0")
                plan_split[1].tag_or_range = tag_or_range(">=3.11.0")
            elif a == "-i":
                action = ReadFile()
                plan_read.append(action)
            elif a.startswith("-s-"):
                sort.add_arg(a)
            elif a.startswith("-w-"):
                write.add_arg(a)
            else:
                if action is None:
                    action = SplitToFile()
                    plan_split.append(action)
                # add_arg() returns True once the group has its filename.
                if action.add_arg(a):
                    action = None

        # A group that never received its filename would otherwise only fail
        # later, when the plan is executed; report it as a usage error now.
        if any(step.source is None for step in plan_read):
            raise ValueError("Missing filename after -i")
        if any(step.target is None for step in plan_split):
            raise ValueError("Missing output filename after options")
    except ValueError as ex:
        print(ex)
        usage()

    if not plan_read:
        # Default input is the current online index.
        action = ReadFile()
        action.source = "https://www.python.org/ftp/python/index-windows.json"
        plan_read.append(action)
    if not plan_split:
        print("No outputs specified")
        print(args)
        usage()
    return [*plan_read, sort, *plan_split, write]
241+
242+
243+
if __name__ == "__main__":
    # Every step shares one entry list and one context dict, and the steps
    # run strictly in the order parse_cli() returns them.
    VERSIONS, CONTEXT = [], {}
    for step in parse_cli(sys.argv[1:]):
        step.execute(VERSIONS, CONTEXT)
249+

src/manage/tagutils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ def startswith(self, other):
2626
return self._company.startswith(other._company)
2727
return self._company == other._company
2828

29+
def __hash__(self):
30+
return hash(self._company)
31+
2932
def __eq__(self, other):
3033
return self._company == other._company
3134

@@ -64,6 +67,9 @@ def startswith(self, other):
6467
return not self.s
6568
return self.s.startswith(other.s)
6669

70+
def __hash__(self):
71+
return hash(self.s)
72+
6773
def __eq__(self, other):
6874
if not isinstance(other, type(self)):
6975
return False

0 commit comments

Comments
 (0)
0