Use orjson instead of json, when available by hauntsaninja · Pull Request #17955 · python/mypy · GitHub

Use orjson instead of json, when available #17955


Merged
merged 5 commits on Oct 16, 2024
Changes from 1 commit
Use orjson instead of json, when available
For `mypy -c 'import torch'`, the cache load time goes from 0.44s to 0.25s, as measured by the manager's `data_json_load_time` stat.
If I time the dumps specifically, they go from 0.65s to 0.07s.
Overall a pretty reasonable perf win -- should we make orjson a required dependency?

I don't know if the sqlite cache path is used at all, but let me know if I need a cleverer migration than renaming the table.
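
For anyone who wants to reproduce the rough shape of the comparison outside mypy, here is a minimal standalone micro-benchmark sketch. The payload shape, key names, and sizes are invented for illustration; the numbers above come from mypy's own data_json_load_time stat, not from this script.

# Illustrative micro-benchmark only: the payload is made up, and the PR's
# numbers come from mypy's data_json_load_time stat, not this script.
import json
import time

try:
    import orjson  # optional; if missing, only the stdlib result is printed
except ImportError:
    orjson = None

# A made-up payload loosely shaped like a serialized cache entry.
payload = {f"node{i}": {"kind": "Instance", "args": list(range(10))} for i in range(50_000)}

def bench(name, dumps, loads):
    t0 = time.perf_counter()
    blob = dumps(payload)
    t1 = time.perf_counter()
    loads(blob)
    t2 = time.perf_counter()
    print(f"{name}: dump {t1 - t0:.3f}s, load {t2 - t1:.3f}s")

bench("json", lambda o: json.dumps(o, separators=(",", ":")).encode(), json.loads)
if orjson is not None:
    bench("orjson", orjson.dumps, orjson.loads)
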
hauntsaninja committed Oct 15, 2024
commit 4103892b71f8667d03e7372eafe448d9e57e4e46
45 changes: 18 additions & 27 deletions mypy/build.py
@@ -95,6 +95,7 @@
from mypy.stubinfo import legacy_bundled_packages, non_bundled_packages, stub_distribution_name
from mypy.types import Type
from mypy.typestate import reset_global_state, type_state
from mypy.util import json_dumps, json_loads
from mypy.version import __version__

# Switch to True to produce debug output related to fine-grained incremental
@@ -858,7 +859,7 @@ def load_fine_grained_deps(self, id: str) -> dict[str, set[str]]:
t0 = time.time()
if id in self.fg_deps_meta:
# TODO: Assert deps file wasn't changed.
deps = json.loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
deps = json_loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
else:
deps = {}
val = {k: set(v) for k, v in deps.items()}
@@ -911,8 +912,8 @@ def stats_summary(self) -> Mapping[str, object]:
return self.stats


def deps_to_json(x: dict[str, set[str]]) -> str:
return json.dumps({k: list(v) for k, v in x.items()}, separators=(",", ":"))
def deps_to_json(x: dict[str, set[str]]) -> bytes:
return json_dumps({k: list(v) for k, v in x.items()})


# File for storing metadata about all the fine-grained dependency caches
@@ -980,7 +981,7 @@ def write_deps_cache(

meta = {"snapshot": meta_snapshot, "deps_meta": fg_deps_meta}

if not metastore.write(DEPS_META_FILE, json.dumps(meta, separators=(",", ":"))):
if not metastore.write(DEPS_META_FILE, json_dumps(meta)):
manager.log(f"Error writing fine-grained deps meta JSON file {DEPS_META_FILE}")
error = True

@@ -1048,7 +1049,7 @@ def generate_deps_for_cache(manager: BuildManager, graph: Graph) -> dict[str, di

def write_plugins_snapshot(manager: BuildManager) -> None:
"""Write snapshot of versions and hashes of currently active plugins."""
snapshot = json.dumps(manager.plugins_snapshot, separators=(",", ":"))
snapshot = json_dumps(manager.plugins_snapshot)
if not manager.metastore.write(PLUGIN_SNAPSHOT_FILE, snapshot):
manager.errors.set_file(_cache_dir_prefix(manager.options), None, manager.options)
manager.errors.report(0, 0, "Error writing plugins snapshot", blocker=True)
@@ -1079,8 +1080,8 @@ def read_quickstart_file(
# just ignore it.
raw_quickstart: dict[str, Any] = {}
try:
with open(options.quickstart_file) as f:
raw_quickstart = json.load(f)
with open(options.quickstart_file, "rb") as f:
raw_quickstart = json_loads(f.read())

quickstart = {}
for file, (x, y, z) in raw_quickstart.items():
@@ -1148,10 +1149,10 @@ def _load_json_file(
manager.add_stats(metastore_read_time=time.time() - t0)
# Only bother to compute the log message if we are logging it, since it could be big
if manager.verbosity() >= 2:
manager.trace(log_success + data.rstrip())
manager.trace(log_success + data.rstrip().decode())
try:
t1 = time.time()
result = json.loads(data)
result = json_loads(data)
manager.add_stats(data_json_load_time=time.time() - t1)
except json.JSONDecodeError:
manager.errors.set_file(file, None, manager.options)
@@ -1343,8 +1344,8 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> CacheMeta | No
# So that plugins can return data with tuples in it without
# things silently always invalidating modules, we round-trip
# the config data. This isn't beautiful.
plugin_data = json.loads(
json.dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
plugin_data = json_loads(
json_dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
)
if m.plugin_data != plugin_data:
manager.log(f"Metadata abandoned for {id}: plugin configuration differs")
@@ -1478,18 +1479,15 @@ def validate_meta(
"ignore_all": meta.ignore_all,
"plugin_data": meta.plugin_data,
}
if manager.options.debug_cache:
meta_str = json.dumps(meta_dict, indent=2, sort_keys=True)
else:
meta_str = json.dumps(meta_dict, separators=(",", ":"))
meta_bytes = json_dumps(meta_dict, manager.options.debug_cache)
meta_json, _, _ = get_cache_names(id, path, manager.options)
manager.log(
"Updating mtime for {}: file {}, meta {}, mtime {}".format(
id, path, meta_json, meta.mtime
)
)
t1 = time.time()
manager.metastore.write(meta_json, meta_str) # Ignore errors, just an optimization.
manager.metastore.write(meta_json, meta_bytes) # Ignore errors, just an optimization.
manager.add_stats(validate_update_time=time.time() - t1, validate_munging_time=t1 - t0)
return meta

@@ -1507,13 +1505,6 @@ def compute_hash(text: str) -> str:
return hash_digest(text.encode("utf-8"))


def json_dumps(obj: Any, debug_cache: bool) -> str:
if debug_cache:
return json.dumps(obj, indent=2, sort_keys=True)
else:
return json.dumps(obj, sort_keys=True, separators=(",", ":"))


def write_cache(
id: str,
path: str,
@@ -1566,8 +1557,8 @@ def write_cache(

# Serialize data and analyze interface
data = tree.serialize()
data_str = json_dumps(data, manager.options.debug_cache)
interface_hash = compute_hash(data_str)
data_bytes = json_dumps(data, manager.options.debug_cache)
interface_hash = hash_digest(data_bytes)

plugin_data = manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=False))

@@ -1591,7 +1582,7 @@
manager.trace(f"Interface for {id} is unchanged")
else:
manager.trace(f"Interface for {id} has changed")
if not metastore.write(data_json, data_str):
if not metastore.write(data_json, data_bytes):
# Most likely the error is the replace() call
# (see https://github.com/python/mypy/issues/3215).
manager.log(f"Error writing data JSON file {data_json}")
@@ -3566,4 +3557,4 @@ def write_undocumented_ref_info(
assert not ref_info_file.startswith(".")

deps_json = get_undocumented_ref_info_json(state.tree, type_map)
metastore.write(ref_info_file, json.dumps(deps_json, separators=(",", ":")))
metastore.write(ref_info_file, json_dumps(deps_json))
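
One note on the interface_hash change above, based only on the definitions visible in this diff: compute_hash(text) is just hash_digest(text.encode("utf-8")), so calling hash_digest on the already-serialized bytes is the same operation minus the encode step, applied to whatever bytes the active serializer produced. A tiny sketch of that relationship, using mypy's own helpers:

# Relationship between the two helpers, as defined in mypy/build.py and mypy/util.py.
from mypy.build import compute_hash
from mypy.util import hash_digest

text = '{"a": 1}'
# compute_hash() encodes and then calls hash_digest(); with pre-encoded bytes,
# write_cache() can now call hash_digest() directly.
assert compute_hash(text) == hash_digest(text.encode("utf-8"))
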
37 changes: 15 additions & 22 deletions mypy/metastore.py
@@ -33,14 +33,14 @@ def getmtime(self, name: str) -> float:
"""

@abstractmethod
def read(self, name: str) -> str:
def read(self, name: str) -> bytes:
"""Read the contents of a metadata entry.

Raises FileNotFound if the entry does not exist.
"""

@abstractmethod
def write(self, name: str, data: str, mtime: float | None = None) -> bool:
def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
"""Write a metadata entry.

If mtime is specified, set it as the mtime of the entry. Otherwise,
@@ -86,16 +86,16 @@ def getmtime(self, name: str) -> float:

return int(os.path.getmtime(os.path.join(self.cache_dir_prefix, name)))

def read(self, name: str) -> str:
def read(self, name: str) -> bytes:
assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"

if not self.cache_dir_prefix:
raise FileNotFoundError()

with open(os.path.join(self.cache_dir_prefix, name)) as f:
with open(os.path.join(self.cache_dir_prefix, name), "rb") as f:
return f.read()

def write(self, name: str, data: str, mtime: float | None = None) -> bool:
def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"

if not self.cache_dir_prefix:
@@ -105,7 +105,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
tmp_filename = path + "." + random_string()
try:
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(tmp_filename, "w") as f:
with open(tmp_filename, "wb") as f:
f.write(data)
os.replace(tmp_filename, path)
if mtime is not None:
@@ -135,27 +135,20 @@ def list_all(self) -> Iterable[str]:


SCHEMA = """
CREATE TABLE IF NOT EXISTS files (
CREATE TABLE IF NOT EXISTS files2 (
path TEXT UNIQUE NOT NULL,
mtime REAL,
data TEXT
data BLOB
);
CREATE INDEX IF NOT EXISTS path_idx on files(path);
CREATE INDEX IF NOT EXISTS path_idx on files2(path);
"""
# No migrations yet
MIGRATIONS: list[str] = []


def connect_db(db_file: str) -> sqlite3.Connection:
import sqlite3.dbapi2

db = sqlite3.dbapi2.connect(db_file)
db.executescript(SCHEMA)
for migr in MIGRATIONS:
try:
db.executescript(migr)
except sqlite3.OperationalError:
pass
return db


@@ -188,12 +181,12 @@ def getmtime(self, name: str) -> float:
assert isinstance(mtime, float)
return mtime

def read(self, name: str) -> str:
def read(self, name: str) -> bytes:
data = self._query(name, "data")
assert isinstance(data, str)
assert isinstance(data, bytes)
return data

def write(self, name: str, data: str, mtime: float | None = None) -> bool:
def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
import sqlite3

if not self.db:
@@ -202,7 +195,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
if mtime is None:
mtime = time.time()
self.db.execute(
"INSERT OR REPLACE INTO files(path, mtime, data) VALUES(?, ?, ?)",
"INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
(name, mtime, data),
)
except sqlite3.OperationalError:
@@ -213,13 +206,13 @@ def remove(self, name: str) -> None:
if not self.db:
raise FileNotFoundError()

self.db.execute("DELETE FROM files WHERE path = ?", (name,))
self.db.execute("DELETE FROM files2 WHERE path = ?", (name,))

def commit(self) -> None:
if self.db:
self.db.commit()

def list_all(self) -> Iterable[str]:
if self.db:
for row in self.db.execute("SELECT path FROM files"):
for row in self.db.execute("SELECT path FROM files2"):
yield row[0]
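
On the "cleverer migration" question from the description: the change above simply starts writing to a fresh files2 table (with a BLOB data column) and leaves any old files table untouched. If an in-place migration were ever wanted, a hypothetical sketch, not part of this PR and assuming only the old and new schemas shown above, could copy rows forward and drop the old table:

# Hypothetical migration sketch, not part of this PR: copy rows from the old
# TEXT-based "files" table into the new BLOB-based "files2" table, then drop it.
import sqlite3

def migrate_files_table(db: sqlite3.Connection) -> None:
    tables = {
        row[0] for row in db.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
    }
    if "files" in tables and "files2" in tables:
        db.execute(
            "INSERT OR REPLACE INTO files2(path, mtime, data) "
            "SELECT path, mtime, CAST(data AS BLOB) FROM files"
        )
        db.execute("DROP TABLE files")
        db.commit()
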
28 changes: 27 additions & 1 deletion mypy/util.py
@@ -4,16 +4,23 @@

import hashlib
import io
import json
import os
import pathlib
import re
import shutil
import sys
import time
from importlib import resources as importlib_resources
from typing import IO, Callable, Container, Final, Iterable, Sequence, Sized, TypeVar
from typing import IO, Any, Callable, Container, Final, Iterable, Sequence, Sized, TypeVar
from typing_extensions import Literal

orjson: Any
try:
import orjson # type: ignore[import-not-found, no-redef, unused-ignore]
except ImportError:
orjson = None

try:
import curses

@@ -874,3 +881,22 @@ def quote_docstring(docstr: str) -> str:
return f"''{docstr_repr}''"
else:
return f'""{docstr_repr}""'


def json_dumps(obj: object, debug: bool = False) -> bytes:
if orjson is not None:
if debug:
return orjson.dumps(obj, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS) # type: ignore[no-any-return]
simon-liebehenschel commented on Nov 30, 2024:

@hauntsaninja @JukkaL I don't think mypy will use too much memory here, since the memory is released sooner or later (and the cache isn't large enough to eat a lot of memory anyway), but keep in mind that

python -c 'import orjson; [orjson.dumps(i) for i in range(30000000)]'
python -c 'import orjson; [orjson.dumps(i).decode() for i in range(30000000)]'

are very different in how memory is used and when it is released.

The first command keeps all the dumped objects in memory, plus a surprisingly large overhead: it needs over 20 GiB to run, while the second needs only about 2 GiB.

As I said, mypy shouldn't be affected because the memory gets freed (I hope, ha-ha), so this PR should be great. The problem really only arises when everything is dumped in a single function. Just a friendly heads-up to check memory usage in other applications if you ever need to dump lots of objects in a for loop in a single function :)

hauntsaninja (Collaborator, Author) replied:

Interesting, that's probably from ijl/orjson#483 (comment). Looks like that's not resolved; someone should open a PR against orjson fixing the thing godlygeek points out.

else:
return orjson.dumps(obj) # type: ignore[no-any-return]

if debug:
return json.dumps(obj, indent=2, sort_keys=True).encode("utf-8")
else:
return json.dumps(obj, separators=(",", ":")).encode("utf-8")


def json_loads(data: bytes) -> Any:
if orjson is not None:
return orjson.loads(data)
return json.loads(data)
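
For reference, a quick usage sketch of the new helpers (hypothetical values; the behavioral points are that both the orjson and stdlib fallback paths take and return bytes, and that debug=True switches to indented, key-sorted output for the debug_cache option):

from mypy.util import json_dumps, json_loads

# Both serializer paths produce bytes, which is why the metastore read/write
# signatures and the hash_digest() calls above switched from str to bytes.
blob = json_dumps({"b": 2, "a": 1})            # compact bytes, e.g. b'{"b":2,"a":1}'
pretty = json_dumps({"b": 2, "a": 1}, True)    # indented and key-sorted output
assert json_loads(blob) == {"a": 1, "b": 2}
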
8 changes: 4 additions & 4 deletions mypyc/codegen/emitmodule.py
@@ -24,7 +24,7 @@
from mypy.nodes import MypyFile
from mypy.options import Options
from mypy.plugin import Plugin, ReportConfigContext
from mypy.util import hash_digest
from mypy.util import hash_digest, json_dumps
from mypyc.codegen.cstring import c_string_initializer
from mypyc.codegen.emit import Emitter, EmitterContext, HeaderDeclaration, c_array_initializer
from mypyc.codegen.emitclass import generate_class, generate_class_type_decl
@@ -154,7 +154,7 @@ def report_config_data(self, ctx: ReportConfigContext) -> tuple[str | None, list
ir_data = json.loads(ir_json)

# Check that the IR cache matches the metadata cache
if compute_hash(meta_json) != ir_data["meta_hash"]:
if hash_digest(meta_json) != ir_data["meta_hash"]:
return None

# Check that all of the source files are present and as
@@ -369,11 +369,11 @@ def write_cache(
newpath = get_state_ir_cache_name(st)
ir_data = {
"ir": module.serialize(),
"meta_hash": compute_hash(meta_data),
"meta_hash": hash_digest(meta_data),
"src_hashes": hashes[group_map[id]],
}

result.manager.metastore.write(newpath, json.dumps(ir_data, separators=(",", ":")))
result.manager.metastore.write(newpath, json_dumps(ir_data))

result.manager.metastore.commit()
