Use orjson instead of json, when available by hauntsaninja · Pull Request #17955 · python/mypy · GitHub

Use orjson instead of json, when available #17955


Merged
merged 5 commits on Oct 16, 2024
Changes from 1 commit
Use orjson instead of json, when available
For `mypy -c 'import torch'`, the cache load time goes from 0.44s to 0.25s, as measured by the manager's `data_json_load_time` stat.
If I time the dumps specifically, they go from 0.65s to 0.07s.
Overall a pretty reasonable perf win -- should we make orjson a required dependency?

I don't know if the sqlite cache path is used at all, but let me know if I need a cleverer migration than renaming the table.
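
For anyone who wants to reproduce the rough shape of the comparison outside mypy, here is a minimal standalone micro-benchmark sketch. The payload shape, key names, and sizes are invented for illustration; the numbers above come from mypy's own data_json_load_time stat, not from this script.

# Illustrative micro-benchmark only: the payload is made up, and the PR's
# numbers come from mypy's data_json_load_time stat, not this script.
import json
import time

try:
    import orjson  # optional; if missing, only the stdlib result is printed
except ImportError:
    orjson = None

# A made-up payload loosely shaped like a serialized cache entry.
payload = {f"node{i}": {"kind": "Instance", "args": list(range(10))} for i in range(50_000)}

def bench(name, dumps, loads):
    t0 = time.perf_counter()
    blob = dumps(payload)
    t1 = time.perf_counter()
    loads(blob)
    t2 = time.perf_counter()
    print(f"{name}: dump {t1 - t0:.3f}s, load {t2 - t1:.3f}s")

bench("json", lambda o: json.dumps(o, separators=(",", ":")).encode(), json.loads)
if orjson is not None:
    bench("orjson", orjson.dumps, orjson.loads)
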
hauntsaninja committed Oct 15, 2024
commit 4103892b71f8667d03e7372eafe448d9e57e4e46
45 changes: 18 additions & 27 deletions mypy/build.py
@@ -95,6 +95,7 @@
from mypy.stubinfo import legacy_bundled_packages, non_bundled_packages, stub_distribution_name
from mypy.types import Type
from mypy.typestate import reset_global_state, type_state
from mypy.util import json_dumps, json_loads
from mypy.version import __version__

# Switch to True to produce debug output related to fine-grained incremental
@@ -858,7 +859,7 @@ def load_fine_grained_deps(self, id: str) -> dict[str, set[str]]:
t0 = time.time()
if id in self.fg_deps_meta:
# TODO: Assert deps file wasn't changed.
deps = json.loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
deps = json_loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
else:
deps = {}
val = {k: set(v) for k, v in deps.items()}
@@ -911,8 +912,8 @@ def stats_summary(self) -> Mapping[str, object]:
return self.stats


def deps_to_json(x: dict[str, set[str]]) -> str:
return json.dumps({k: list(v) for k, v in x.items()}, separators=(",", ":"))
def deps_to_json(x: dict[str, set[str]]) -> bytes:
return json_dumps({k: list(v) for k, v in x.items()})


# File for storing metadata about all the fine-grained dependency caches
@@ -980,7 +981,7 @@ def write_deps_cache(

meta = {"snapshot": meta_snapshot, "deps_meta": fg_deps_meta}

if not metastore.write(DEPS_META_FILE, json.dumps(meta, separators=(",", ":"))):
if not metastore.write(DEPS_META_FILE, json_dumps(meta)):
manager.log(f"Error writing fine-grained deps meta JSON file {DEPS_META_FILE}")
error = True

@@ -1048,7 +1049,7 @@ def generate_deps_for_cache(manager: BuildManager, graph: Graph) -> dict[str, di

def write_plugins_snapshot(manager: BuildManager) -> None:
"""Write snapshot of versions and hashes of currently active plugins."""
snapshot = json.dumps(manager.plugins_snapshot, separators=(",", ":"))
snapshot = json_dumps(manager.plugins_snapshot)
if not manager.metastore.write(PLUGIN_SNAPSHOT_FILE, snapshot):
manager.errors.set_file(_cache_dir_prefix(manager.options), None, manager.options)
manager.errors.report(0, 0, "Error writing plugins snapshot", blocker=True)
@@ -1079,8 +1080,8 @@ def read_quickstart_file(
# just ignore it.
raw_quickstart: dict[str, Any] = {}
try:
with open(options.quickstart_file) as f:
raw_quickstart = json.load(f)
with open(options.quickstart_file, "rb") as f:
raw_quickstart = json_loads(f.read())

quickstart = {}
for file, (x, y, z) in raw_quickstart.items():
@@ -1148,10 +1149,10 @@ def _load_json_file(
manager.add_stats(metastore_read_time=time.time() - t0)
# Only bother to compute the log message if we are logging it, since it could be big
if manager.verbosity() >= 2:
manager.trace(log_success + data.rstrip())
manager.trace(log_success + data.rstrip().decode())
try:
t1 = time.time()
result = json.loads(data)
result = json_loads(data)
manager.add_stats(data_json_load_time=time.time() - t1)
except json.JSONDecodeError:
manager.errors.set_file(file, None, manager.options)
@@ -1343,8 +1344,8 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> CacheMeta | No
# So that plugins can return data with tuples in it without
# things silently always invalidating modules, we round-trip
# the config data. This isn't beautiful.
plugin_data = json.loads(
json.dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
plugin_data = json_loads(
json_dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
)
if m.plugin_data != plugin_data:
manager.log(f"Metadata abandoned for {id}: plugin configuration differs")
@@ -1478,18 +1479,15 @@ def validate_meta(
"ignore_all": meta.ignore_all,
"plugin_data": meta.plugin_data,
}
if manager.options.debug_cache:
meta_str = json.dumps(meta_dict, indent=2, sort_keys=True)
else:
meta_str = json.dumps(meta_dict, separators=(",", ":"))
meta_bytes = json_dumps(meta_dict, manager.options.debug_cache)
meta_json, _, _ = get_cache_names(id, path, manager.options)
manager.log(
"Updating mtime for {}: file {}, meta {}, mtime {}".format(
id, path, meta_json, meta.mtime
)
)
t1 = time.time()
manager.metastore.write(meta_json, meta_str) # Ignore errors, just an optimization.
manager.metastore.write(meta_json, meta_bytes) # Ignore errors, just an optimization.
manager.add_stats(validate_update_time=time.time() - t1, validate_munging_time=t1 - t0)
return meta

@@ -1507,13 +1505,6 @@ def compute_hash(text: str) -> str:
return hash_digest(text.encode("utf-8"))


def json_dumps(obj: Any, debug_cache: bool) -> str:
if debug_cache:
return json.dumps(obj, indent=2, sort_keys=True)
else:
return json.dumps(obj, sort_keys=True, separators=(",", ":"))


def write_cache(
id: str,
path: str,
@@ -1566,8 +1557,8 @@ def write_cache(

# Serialize data and analyze interface
data = tree.serialize()
data_str = json_dumps(data, manager.options.debug_cache)
interface_hash = compute_hash(data_str)
data_bytes = json_dumps(data, manager.options.debug_cache)
interface_hash = hash_digest(data_bytes)

plugin_data = manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=False))

@@ -1591,7 +1582,7 @@
manager.trace(f"Interface for {id} is unchanged")
else:
manager.trace(f"Interface for {id} has changed")
if not metastore.write(data_json, data_str):
if not metastore.write(data_json, data_bytes):
# Most likely the error is the replace() call
# (see https://github.com/python/mypy/issues/3215).
manager.log(f"Error writing data JSON file {data_json}")
@@ -3566,4 +3557,4 @@ def write_undocumented_ref_info(
assert not ref_info_file.startswith(".")

deps_json = get_undocumented_ref_info_json(state.tree, type_map)
metastore.write(ref_info_file, json.dumps(deps_json, separators=(",", ":")))
metastore.write(ref_info_file, json_dumps(deps_json))
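
One note on the interface_hash change above, based only on the definitions visible in this diff: compute_hash(text) is just hash_digest(text.encode("utf-8")), so calling hash_digest on the already-serialized bytes is the same operation minus the encode step, applied to whatever bytes the active serializer produced. A tiny sketch of that relationship, using mypy's own helpers:

# Relationship between the two helpers, as defined in mypy/build.py and mypy/util.py.
from mypy.build import compute_hash
from mypy.util import hash_digest

text = '{"a": 1}'
# compute_hash() encodes and then calls hash_digest(); with pre-encoded bytes,
# write_cache() can now call hash_digest() directly.
assert compute_hash(text) == hash_digest(text.encode("utf-8"))
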
37 changes: 15 additions & 22 deletions mypy/metastore.py
@@ -33,14 +33,14 @@ def getmtime(self, name: str) -> float:
"""

@abstractmethod
def read(self, name: str) -> str:
def read(self, name: str) -> bytes:
"""Read the contents of a metadata entry.

Raises FileNotFound if the entry does not exist.
"""

@abstractmethod
def write(self, name: str, data: str, mtime: float | None = None) -> bool:
def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
"""Write a metadata entry.

If mtime is specified, set it as the mtime of the entry. Otherwise,
@@ -86,16 +86,16 @@ def getmtime(self, name: str) -> float:

return int(os.path.getmtime(os.path.join(self.cache_dir_prefix, name)))

def read(self, name: str) -> str:
def read(self, name: str) -> bytes:
assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"

if not self.cache_dir_prefix:
raise FileNotFoundError()

with open(os.path.join(self.cache_dir_prefix, name)) as f:
with open(os.path.join(self.cache_dir_prefix, name), "rb") as f:
return f.read()

def write(self, name: str, data: str, mtime: float | None = None) -> bool:
def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"

if not self.cache_dir_prefix:
@@ -105,7 +105,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
tmp_filename = path + "." + random_string()
try:
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(tmp_filename, "w") as f:
with open(tmp_filename, "wb") as f:
f.write(data)
os.replace(tmp_filename, path)
if mtime is not None:
@@ -135,27 +135,20 @@ def list_all(self) -> Iterable[str]:


SCHEMA = """
CREATE TABLE IF NOT EXISTS files (
CREATE TABLE IF NOT EXISTS files2 (
path TEXT UNIQUE NOT NULL,
mtime REAL,
data TEXT
data BLOB
);
CREATE INDEX IF NOT EXISTS path_idx on files(path);
CREATE INDEX IF NOT EXISTS path_idx on files2(path);
"""
# No migrations yet
MIGRATIONS: list[str] = []


def connect_db(db_file: str) -> sqlite3.Connection:
import sqlite3.dbapi2

db = sqlite3.dbapi2.connect(db_file)
db.executescript(SCHEMA)
for migr in MIGRATIONS:
try:
db.executescript(migr)
except sqlite3.OperationalError:
pass
return db


@@ -188,12 +181,12 @@ def getmtime(self, name: str) -> float:
assert isinstance(mtime, float)
return mtime

def read(self, name: str) -> str:
def read(self, name: str) -> bytes:
data = self._query(name, "data")
assert isinstance(data, str)
assert isinstance(data, bytes)
return data

def write(self, name: str, data: str, mtime: float | None = None) -> bool:
def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
import sqlite3

if not self.db:
@@ -202,7 +195,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
if mtime is None:
mtime = time.time()
self.db.execute(
"INSERT OR REPLACE INTO files(path, mtime, data) VALUES(?, ?, ?)",
"INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
(name, mtime, data),
)
except sqlite3.OperationalError:
@@ -213,13 +206,13 @@ def remove(self, name: str) -> None:
if not self.db:
raise FileNotFoundError()

self.db.execute("DELETE FROM files WHERE path = ?", (name,))
self.db.execute("DELETE FROM files2 WHERE path = ?", (name,))

def commit(self) -> None:
if self.db:
self.db.commit()

def list_all(self) -> Iterable[str]:
if self.db:
for row in self.db.execute("SELECT path FROM files"):
for row in self.db.execute("SELECT path FROM files2"):
yield row[0]
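
On the "cleverer migration" question from the description: the change above simply starts writing to a fresh files2 table (with a BLOB data column) and leaves any old files table untouched. If an in-place migration were ever wanted, a hypothetical sketch, not part of this PR and assuming only the old and new schemas shown above, could copy rows forward and drop the old table:

# Hypothetical migration sketch, not part of this PR: copy rows from the old
# TEXT-based "files" table into the new BLOB-based "files2" table, then drop it.
import sqlite3

def migrate_files_table(db: sqlite3.Connection) -> None:
    tables = {
        row[0] for row in db.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
    }
    if "files" in tables and "files2" in tables:
        db.execute(
            "INSERT OR REPLACE INTO files2(path, mtime, data) "
            "SELECT path, mtime, CAST(data AS BLOB) FROM files"
        )
        db.execute("DROP TABLE files")
        db.commit()
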
28 changes: 27 additions & 1 deletion mypy/util.py
@@ -4,16 +4,23 @@

import hashlib
import io
import json
import os
import pathlib
import re
import shutil
import sys
import time
from importlib import resources as importlib_resources
from typing import IO, Callable, Container, Final, Iterable, Sequence, Sized, TypeVar
from typing import IO, Any, Callable, Container, Final, Iterable, Sequence, Sized, TypeVar
from typing_extensions import Literal

orjson: Any
try:
import orjson # type: ignore[import-not-found, no-redef, unused-ignore]
except ImportError:
orjson = None

try:
import curses

@@ -874,3 +881,22 @@ def quote_docstring(docstr: str) -> str:
return f"''{docstr_repr}''"
else:
return f'""{docstr_repr}""'


def json_dumps(obj: object, debug: bool = False) -> bytes:
if orjson is not None:
if debug:
return orjson.dumps(obj, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS) # type: ignore[no-any-return]
simon-liebehenschel commented on Nov 30, 2024:

@hauntsaninja @JukkaL I don't think mypy will use too much memory here, since the memory is released sooner or later (and the cache isn't large enough to eat a lot of memory anyway), but keep in mind that

python -c 'import orjson; [orjson.dumps(i) for i in range(30000000)]'
python -c 'import orjson; [orjson.dumps(i).decode() for i in range(30000000)]'

are very different in how memory is used and when it is released.

The first command keeps all the dumped objects in memory, plus a surprisingly large overhead: it needs over 20 GiB to run, while the second needs only about 2 GiB.

As I said, mypy shouldn't be affected because the memory gets freed (I hope, ha-ha), so this PR should be great. The problem really only arises when everything is dumped in a single function. Just a friendly heads-up to check memory usage in other applications if you ever need to dump lots of objects in a for loop in a single function :)

hauntsaninja (Collaborator, Author) replied:

Interesting, that's probably from ijl/orjson#483 (comment). Looks like that's not resolved; someone should open a PR against orjson fixing the thing godlygeek points out.

else:
return orjson.dumps(obj) # type: ignore[no-any-return]

if debug:
return json.dumps(obj, indent=2, sort_keys=True).encode("utf-8")
else:
return json.dumps(obj, separators=(",", ":")).encode("utf-8")


def json_loads(data: bytes) -> Any:
if orjson is not None:
return orjson.loads(data)
return json.loads(data)
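
For reference, a quick usage sketch of the new helpers (hypothetical values; the behavioral points are that both the orjson and stdlib fallback paths take and return bytes, and that debug=True switches to indented, key-sorted output for the debug_cache option):

from mypy.util import json_dumps, json_loads

# Both serializer paths produce bytes, which is why the metastore read/write
# signatures and the hash_digest() calls above switched from str to bytes.
blob = json_dumps({"b": 2, "a": 1})            # compact bytes, e.g. b'{"b":2,"a":1}'
pretty = json_dumps({"b": 2, "a": 1}, True)    # indented and key-sorted output
assert json_loads(blob) == {"a": 1, "b": 2}
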
8 changes: 4 additions & 4 deletions mypyc/codegen/emitmodule.py
@@ -24,7 +24,7 @@
from mypy.nodes import MypyFile
from mypy.options import Options
from mypy.plugin import Plugin, ReportConfigContext
from mypy.util import hash_digest
from mypy.util import hash_digest, json_dumps
from mypyc.codegen.cstring import c_string_initializer
from mypyc.codegen.emit import Emitter, EmitterContext, HeaderDeclaration, c_array_initializer
from mypyc.codegen.emitclass import generate_class, generate_class_type_decl
@@ -154,7 +154,7 @@ def report_config_data(self, ctx: ReportConfigContext) -> tuple[str | None, list
ir_data = json.loads(ir_json)

# Check that the IR cache matches the metadata cache
if compute_hash(meta_json) != ir_data["meta_hash"]:
if hash_digest(meta_json) != ir_data["meta_hash"]:
return None

# Check that all of the source files are present and as
@@ -369,11 +369,11 @@ def write_cache(
newpath = get_state_ir_cache_name(st)
ir_data = {
"ir": module.serialize(),
"meta_hash": compute_hash(meta_data),
"meta_hash": hash_digest(meta_data),
"src_hashes": hashes[group_map[id]],
}

result.manager.metastore.write(newpath, json.dumps(ir_data, separators=(",", ":")))
result.manager.metastore.write(newpath, json_dumps(ir_data))

result.manager.metastore.commit()
