From bc3d942e0a5c8f8162154b6fb585a93230307672 Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Thu, 3 Nov 2022 14:18:23 +0530 Subject: [PATCH 01/13] parse dev mode argument (#158) * parse dev mode argument * update the short flag for dev mode Replace `-dev` to `-D` * Updated the short flag for dev mode * - remove dev mode short flag Co-authored-by: RushiT0122 --- singer/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/singer/utils.py b/singer/utils.py index 492e03b..48675eb 100644 --- a/singer/utils.py +++ b/singer/utils.py @@ -134,6 +134,7 @@ def parse_args(required_config_keys): -d,--discover Run in discover mode -p,--properties Properties file: DEPRECATED, please use --catalog instead --catalog Catalog file + --dev Runs the tap in dev mode Returns the parsed args object from argparse. For each argument that point to JSON files (config, state, properties), we will automatically @@ -163,6 +164,11 @@ def parse_args(required_config_keys): action='store_true', help='Do schema discovery') + parser.add_argument( + '--dev', + action='store_true', + help='Runs tap in dev mode') + args = parser.parse_args() if args.config: setattr(args, 'config_path', args.config) From 2c053f4c5f468235bba85a203a252438b1b3c704 Mon Sep 17 00:00:00 2001 From: Rushikesh Todkar <98420315+RushiT0122@users.noreply.github.com> Date: Mon, 7 Nov 2022 15:00:12 +0530 Subject: [PATCH 02/13] Bump version 5.13.0 (#160) * Bump version 5.13.0 * update changelog for dev mode argument parsing * change versioning to minor version bump Co-authored-by: RushiT0122 --- CHANGELOG.md | 3 +++ setup.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffcb8e6..d06a7a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 5.13.0 + * Add support for dev mode argument parsing [#158](https://github.com/singer-io/singer-python/pull/158) + ## 5.12.2 * Removes pinned `pytz` version 
[#152](https://github.com/singer-io/singer-python/pull/152) diff --git a/setup.py b/setup.py index cd5825c..3d95c5d 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='5.12.2', + version='5.13.0', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], From 0d0ff1e990f1a7cf5c48fe6a75ee99a8aa9122c5 Mon Sep 17 00:00:00 2001 From: Leslie VanDeMark <38043390+leslievandemark@users.noreply.github.com> Date: Tue, 16 Jan 2024 10:27:14 -0500 Subject: [PATCH 03/13] Bump backoff to be compatible with newer python versions (#165) * bump backoff for python 3.11 compatibility * update pip version * try new circleci yml * remove 'make' * make pylint happy * make pylint happy * make pylint happy again * backoff version is a breaking change for old python versions * Changelog update --- .circleci/config.yml | 35 ++++++++++++++++++++++------------- CHANGELOG.md | 3 +++ setup.py | 4 ++-- singer/catalog.py | 2 +- singer/exceptions.py | 2 +- singer/messages.py | 8 ++++---- singer/transform.py | 8 ++++---- singer/utils.py | 4 ++-- 8 files changed, 39 insertions(+), 27 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ae6734c..a64745e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,26 +1,35 @@ -version: 2 +version: 2.1 + +workflows: + build: + jobs: + - build: + context: + - circleci-user + jobs: build: docker: - - image: ubuntu:16.04 + - image: 218546966473.dkr.ecr.us-east-1.amazonaws.com/sources-python:1.1.0 steps: - checkout - - run: - name: 'Install python 3.5.2' - command: | - apt update - apt install --yes python3 python3-pip python3-venv - run: name: 'Setup virtualenv' command: | - mkdir -p ~/.virtualenvs + pyenv global 3.11.7 python3 -m venv ~/.virtualenvs/singer-python source ~/.virtualenvs/singer-python/bin/activate - pip install -U 'pip<19.2' 'setuptools<51.0.0' - make install + pip install -U 'pip==20.3.4' 
'setuptools<51.0.0' + pip install .[dev] + - run: + name: 'Pylint' + command: | + source ~/.virtualenvs/singer-python/bin/activate + pip install pylint + pylint singer --extension-pkg-whitelist=ciso8601 -d missing-docstring,broad-exception-raised,broad-exception-caught,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access,consider-using-f-string - run: - name: 'Run tests' + name: 'Run Tests' command: | - # Need to re-activate the virtualenv source ~/.virtualenvs/singer-python/bin/activate - make test + pip install nose2 + nose2 -v -s tests diff --git a/CHANGELOG.md b/CHANGELOG.md index d06a7a7..21dbe09 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 6.0.0 + * Bump backoff version to 2.2.1. This version drops support for python 3.5, but adds it for 3.1o [#165](https://github.com/singer-io/singer-python/pull/165) + ## 5.13.0 * Add support for dev mode argument parsing [#158](https://github.com/singer-io/singer-python/pull/158) diff --git a/setup.py b/setup.py index 3d95c5d..48305ab 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='5.13.0', + version='6.0.0', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], @@ -14,7 +14,7 @@ 'jsonschema==2.6.0', 'simplejson==3.11.1', 'python-dateutil>=2.6.0', - 'backoff==1.8.0', + 'backoff==2.2.1', 'ciso8601', ], extras_require={ diff --git a/singer/catalog.py b/singer/catalog.py index 1767ff1..373a606 100644 --- a/singer/catalog.py +++ b/singer/catalog.py @@ -92,7 +92,7 @@ def __eq__(self, other): @classmethod def load(cls, filename): - with open(filename) as fp: # pylint: disable=invalid-name + with open(filename, encoding="utf-8") as fp: return Catalog.from_dict(json.load(fp)) @classmethod diff --git a/singer/exceptions.py b/singer/exceptions.py index 9231328..b13016d 100644 --- 
a/singer/exceptions.py +++ b/singer/exceptions.py @@ -11,7 +11,7 @@ def __init__(self, message): The first line is the error's class name. The subsequent lines are the message that class was created with. """ - super().__init__('{}\n{}'.format(self.__class__.__name__, message)) + super().__init__(f"{self.__class__.__name__}\n{message}") class SingerConfigurationError(SingerError): diff --git a/singer/messages.py b/singer/messages.py index 3848801..4a87235 100644 --- a/singer/messages.py +++ b/singer/messages.py @@ -11,16 +11,16 @@ class Message(): '''Base class for messages.''' - def asdict(self): # pylint: disable=no-self-use + def asdict(self): raise Exception('Not implemented') def __eq__(self, other): return isinstance(other, Message) and self.asdict() == other.asdict() def __repr__(self): - pairs = ["{}={}".format(k, v) for k, v in self.asdict().items()] + pairs = [f"{k}={v}" for k, v in self.asdict().items()] attrstr = ", ".join(pairs) - return "{}({})".format(self.__class__.__name__, attrstr) + return f"{self.__class__.__name__}({attrstr})" def __str__(self): return str(self.asdict()) @@ -169,7 +169,7 @@ def asdict(self): def _required_key(msg, k): if k not in msg: - raise Exception("Message is missing required key '{}': {}".format(k, msg)) + raise Exception(f"Message is missing required key '{k}': {msg}") return msg[k] diff --git a/singer/transform.py b/singer/transform.py index 3fdefdf..69f812a 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -77,16 +77,16 @@ def tostr(self): path = ".".join(map(str, self.path)) if self.schema: if self.logging_level >= logging.INFO: - msg = "data does not match {}".format(self.schema) + msg = f"data does not match {self.schema}" else: - msg = "does not match {}".format(self.schema) + msg = f"does not match {self.schema}" else: msg = "not in schema" if self.logging_level >= logging.INFO: - output = "{}: {}".format(path, msg) + output = f"{path}: {msg}" else: - output = "{}: {} {}".format(path, self.data, msg) + 
output = f"{path}: {self.data} {msg}" return output diff --git a/singer/utils.py b/singer/utils.py index 48675eb..6620005 100644 --- a/singer/utils.py +++ b/singer/utils.py @@ -105,7 +105,7 @@ def chunk(array, num): def load_json(path): - with open(path) as fil: + with open(path, encoding="utf-8") as fil: return json.load(fil) @@ -193,7 +193,7 @@ def parse_args(required_config_keys): def check_config(config, required_keys): missing_keys = [key for key in required_keys if key not in config] if missing_keys: - raise Exception("Config is missing required keys: {}".format(missing_keys)) + raise Exception(f"Config is missing required keys: {missing_keys}") def backoff(exceptions, giveup): From d6f0d2026645d7cc45b01a6116701e3564b42628 Mon Sep 17 00:00:00 2001 From: Bryant Gray Date: Tue, 19 Mar 2024 13:45:29 -0400 Subject: [PATCH 04/13] Relax dependency version requirements (#167) * Relax dependency constraints * Bump version to `6.0.1` * pin backoff and simplejson to major version * Don't allow older versions * Update changelog * Pin minimum and major versions --- CHANGELOG.md | 5 ++++- setup.py | 12 ++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21dbe09..3633703 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,10 @@ # Changelog +## 6.0.1 + * Pin backoff and simplejson to any version greater than or equal to the previously allowed version, up to the next major version [#167](https://github.com/singer-io/singer-python/pull/167) + ## 6.0.0 - * Bump backoff version to 2.2.1. This version drops support for python 3.5, but adds it for 3.1o [#165](https://github.com/singer-io/singer-python/pull/165) + * Bump backoff version to 2.2.1. 
This version drops support for python 3.5, but adds it for 3.10 [#165](https://github.com/singer-io/singer-python/pull/165) ## 5.13.0 * Add support for dev mode argument parsing [#158](https://github.com/singer-io/singer-python/pull/158) diff --git a/setup.py b/setup.py index 48305ab..91f3ed0 100755 --- a/setup.py +++ b/setup.py @@ -4,18 +4,18 @@ import subprocess setup(name="singer-python", - version='6.0.0', + version='6.0.1', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], url="http://singer.io", install_requires=[ 'pytz>=2018.4', - 'jsonschema==2.6.0', - 'simplejson==3.11.1', - 'python-dateutil>=2.6.0', - 'backoff==2.2.1', - 'ciso8601', + 'jsonschema>=2.6.0,==2.*', + 'simplejson>=3.13.2,==3.*', + 'python-dateutil>=2.7.3,==2.*', + 'backoff>=2.2.1,==2.*', + 'ciso8601>=2.3.1,==2.*', ], extras_require={ 'dev': [ From 0cb22883deb76f1e8c116d5056a935410527948b Mon Sep 17 00:00:00 2001 From: Sourabh Gandhi <105213416+sgandhi1311@users.noreply.github.com> Date: Tue, 13 Aug 2024 18:47:08 +0530 Subject: [PATCH 05/13] Make `ensure_ascii` Dynamic with Default Set to `True` in JSON Serialization (#168) * add parameter - ensure_ascii to load non ascii characters when set to false * add unit test for ensuring ascii characters while loading * update setup and changelog --- CHANGELOG.md | 3 +++ setup.py | 2 +- singer/messages.py | 8 ++++---- tests/test_singer.py | 26 ++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3633703..4a06bd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 6.1.0 + * Make ensure_ascii Dynamic with Default Set to True in JSON Serialization. 
Required to handle the special characters [#168](https://github.com/singer-io/singer-python/pull/168) + ## 6.0.1 * Pin backoff and simplejson to any version greater than or equal to the previously allowed version, up to the next major version [#167](https://github.com/singer-io/singer-python/pull/167) diff --git a/setup.py b/setup.py index 91f3ed0..7ed9177 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='6.0.1', + version='6.1.0', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], diff --git a/singer/messages.py b/singer/messages.py index 4a87235..de6e076 100644 --- a/singer/messages.py +++ b/singer/messages.py @@ -218,12 +218,12 @@ def parse_message(msg): return None -def format_message(message): - return json.dumps(message.asdict(), use_decimal=True) +def format_message(message, ensure_ascii=True): + return json.dumps(message.asdict(), use_decimal=True, ensure_ascii=ensure_ascii) -def write_message(message): - sys.stdout.write(format_message(message) + '\n') +def write_message(message, ensure_ascii=True): + sys.stdout.write(format_message(message, ensure_ascii=ensure_ascii) + '\n') sys.stdout.flush() diff --git a/tests/test_singer.py b/tests/test_singer.py index 4fb74de..7f69bb5 100644 --- a/tests/test_singer.py +++ b/tests/test_singer.py @@ -1,5 +1,6 @@ import singer import unittest +from unittest.mock import patch import datetime import dateutil from decimal import Decimal @@ -179,6 +180,31 @@ def test_parse_bulk_decs(self): value = self.create_record(value_str) self.assertEqual(Decimal(value_str), value) + @patch('sys.stdout') + def test_ensure_ascii_false(self, mock_stdout): + """ + Setting ensure_ascii=False will preserve special characters like é + in their original form. 
+ """ + rec = {"name": "José"} + expected_output = '{"type": "RECORD", "stream": "test_stream", "record": {"name": "José"}}\n' + rec_message = singer.RecordMessage(stream="test_stream", record=rec) + result = singer.write_message(rec_message, ensure_ascii=False) + mock_stdout.write.assert_called_once_with(expected_output) + mock_stdout.flush.assert_called_once() + + @patch('sys.stdout') + def test_ensure_ascii_true(self, mock_stdout): + """ + ensure_ascii defaults to True, special characters like é are + escaped into their ASCII representation (e.g., \u00e9) + """ + rec = {"name": "José"} + expected_output = '{"type": "RECORD", "stream": "test_stream", "record": {"name": "Jos\\u00e9"}}\n' + rec_message = singer.RecordMessage(stream="test_stream", record=rec) + result = singer.write_message(rec_message) + mock_stdout.write.assert_called_once_with(expected_output) + mock_stdout.flush.assert_called_once() if __name__ == '__main__': unittest.main() From ae50276b7055248273d0458b85d891773f7d4597 Mon Sep 17 00:00:00 2001 From: Eivin Giske Skaaren Date: Tue, 3 Sep 2024 12:15:18 +0200 Subject: [PATCH 06/13] Enable copilot usage in PR template according to Qlik policy --- .github/pull_request_template.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 6e46b00..ef49bc0 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -9,3 +9,7 @@ # Rollback steps - revert this branch + +#### AI generated code +https://internal.qlik.dev/general/ways-of-working/code-reviews/#guidelines-for-ai-generated-code +- [ ] this PR has been written with the help of GitHub Copilot or another generative AI tool From f0e1e6ec69b5c57116c3aa29a66ff7485f162fc6 Mon Sep 17 00:00:00 2001 From: Bryant Gray Date: Mon, 24 Mar 2025 16:59:41 -0400 Subject: [PATCH 07/13] Use underscore instead of dash in setup.cfg (#171) * Use underscore instead of dash in setup.cfg 
https://github.com/pypa/setuptools/issues/4910 * disable some pylint warnings --- setup.cfg | 2 +- setup.py | 2 +- singer/catalog.py | 1 + singer/schema.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index b88034e..08aedd7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,2 @@ [metadata] -description-file = README.md +description_file = README.md diff --git a/setup.py b/setup.py index 7ed9177..c48fb80 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='6.1.0', + version='6.1.1', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], diff --git a/singer/catalog.py b/singer/catalog.py index 373a606..d8ef147 100644 --- a/singer/catalog.py +++ b/singer/catalog.py @@ -20,6 +20,7 @@ def write_catalog(catalog): # pylint: disable=too-many-instance-attributes class CatalogEntry(): + # pylint: disable=too-many-positional-arguments def __init__(self, tap_stream_id=None, stream=None, key_properties=None, schema=None, replication_key=None, is_view=None, database=None, table=None, row_count=None, diff --git a/singer/schema.py b/singer/schema.py index b4da4ac..2fcafd6 100644 --- a/singer/schema.py +++ b/singer/schema.py @@ -31,7 +31,7 @@ class Schema(): # pylint: disable=too-many-instance-attributes ''' - # pylint: disable=too-many-locals + # pylint: disable=too-many-locals,too-many-positional-arguments def __init__(self, type=None, format=None, properties=None, items=None, selected=None, inclusion=None, description=None, minimum=None, maximum=None, exclusiveMinimum=None, exclusiveMaximum=None, From 1e0bccb34b1d2e2e346648c013a0e534a1727911 Mon Sep 17 00:00:00 2001 From: Ben Allred Date: Fri, 19 Sep 2025 09:58:13 -0600 Subject: [PATCH 08/13] add json schema generation (#175) Co-authored-by: Dylan Sprayberry --- CHANGELOG.md | 3 ++ setup.py | 2 +- singer/schema_generation.py | 92 +++++++++++++++++++++++++++++++++ 
tests/test_catalog.py | 6 +-- tests/test_exceptions.py | 24 ++++----- tests/test_schema.py | 28 +++++----- tests/test_schema_generation.py | 76 +++++++++++++++++++++++++++ tests/test_transform.py | 48 ++++++++--------- 8 files changed, 225 insertions(+), 54 deletions(-) create mode 100644 singer/schema_generation.py create mode 100644 tests/test_schema_generation.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a06bd4..59b0bd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 6.2.0 + * Adds json schema generation [#175](https://github.com/singer-io/singer-python/pull/175) + ## 6.1.0 * Make ensure_ascii Dynamic with Default Set to True in JSON Serialization. Required to handle the special characters [#168](https://github.com/singer-io/singer-python/pull/168) diff --git a/setup.py b/setup.py index c48fb80..4435246 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='6.1.1', + version='6.2.0', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], diff --git a/singer/schema_generation.py b/singer/schema_generation.py new file mode 100644 index 0000000..3d11f74 --- /dev/null +++ b/singer/schema_generation.py @@ -0,0 +1,92 @@ +import dateutil.parser + + +def add_observation(acc, path): + + node = acc + for i in range(0, len(path) - 1): + k = path[i] + if k not in node: + node[k] = {} + node = node[k] + + node[path[-1]] = True + +# pylint: disable=too-many-branches +def add_observations(acc, path, data): + if isinstance(data, dict): + for key in data: + add_observations(acc, path + ["object", key], data[key]) + elif isinstance(data, list): + for item in data: + add_observations(acc, path + ["array"], item) + elif isinstance(data, str): + # If the string parses as a date, add an observation that its a date + try: + data = dateutil.parser.parse(data) + except (dateutil.parser.ParserError, OverflowError): + data = None + if 
data: + add_observation(acc, path + ["date"]) + else: + add_observation(acc, path + ["string"]) + + elif isinstance(data, bool): + add_observation(acc, path + ["boolean"]) + elif isinstance(data, int): + add_observation(acc, path + ["integer"]) + elif isinstance(data, float): + add_observation(acc, path + ["number"]) + elif data is None: + add_observation(acc, path + ["null"]) + else: + raise Exception("Unexpected value " + repr(data) + " at path " + repr(path)) + + return acc + +def to_json_schema(obs): + result = {'type': ['null']} + + for key in obs: + + if key == 'object': + result['type'] += ['object'] + if 'properties' not in result: + result['properties'] = {} + for obj_key in obs['object']: + result['properties'][obj_key] = to_json_schema(obs['object'][obj_key]) + + elif key == 'array': + result['type'] += ['array'] + result['items'] = to_json_schema(obs['array']) + + elif key == 'date': + result['type'] += ['string'] + result['format'] = 'date-time' + elif key == 'string': + result['type'] += ['string'] + + elif key == 'boolean': + result['type'] += ['boolean'] + + elif key == 'integer': + result['type'] += ['integer'] + + elif key == 'number': + # Use type=string, format=singer.decimal + result['type'] += ['string'] + result['format'] = 'singer.decimal' + + elif key == 'null': + pass + + else: + raise Exception("Unexpected data type " + key) + + return result + +def generate_schema(records): + obs = {} + for record in records: + obs = add_observations(obs, [], record) + return to_json_schema(obs) diff --git a/tests/test_catalog.py b/tests/test_catalog.py index cd6dc50..8a72e1a 100644 --- a/tests/test_catalog.py +++ b/tests/test_catalog.py @@ -25,7 +25,7 @@ def test_one_selected_stream(self): CatalogEntry(tap_stream_id='c',schema=Schema(),metadata=[])]) state = {} selected_streams = catalog.get_selected_streams(state) - self.assertEquals([e for e in selected_streams],[selected_entry]) + self.assertEqual([e for e in selected_streams],[selected_entry]) def 
test_resumes_currently_syncing_stream(self): selected_entry_a = CatalogEntry(tap_stream_id='a', @@ -44,7 +44,7 @@ def test_resumes_currently_syncing_stream(self): selected_entry_c]) state = {'currently_syncing': 'c'} selected_streams = catalog.get_selected_streams(state) - self.assertEquals([e for e in selected_streams][0],selected_entry_c) + self.assertEqual([e for e in selected_streams][0],selected_entry_c) class TestToDictAndFromDict(unittest.TestCase): @@ -141,4 +141,4 @@ def test(self): CatalogEntry(tap_stream_id='b'), CatalogEntry(tap_stream_id='c')]) entry = catalog.get_stream('b') - self.assertEquals('b', entry.tap_stream_id) + self.assertEqual('b', entry.tap_stream_id) diff --git a/tests/test_exceptions.py b/tests/test_exceptions.py index 491595f..50cf7a1 100644 --- a/tests/test_exceptions.py +++ b/tests/test_exceptions.py @@ -14,8 +14,8 @@ def test_SingerError_prints_correctly(self): raise SingerError(error_text) expected_text = "SingerError\n" + error_text - self.assertEquals(expected_text, - str(test_run.exception)) + self.assertEqual(expected_text, + str(test_run.exception)) def test_SingerConfigurationError_prints_correctly(self): error_text = "An error occured" @@ -24,8 +24,8 @@ def test_SingerConfigurationError_prints_correctly(self): raise SingerConfigurationError(error_text) expected_text = "SingerConfigurationError\n" + error_text - self.assertEquals(expected_text, - str(test_run.exception)) + self.assertEqual(expected_text, + str(test_run.exception)) def test_SingerDiscoveryError_prints_correctly(self): error_text = "An error occured" @@ -34,8 +34,8 @@ def test_SingerDiscoveryError_prints_correctly(self): raise SingerDiscoveryError(error_text) expected_text = "SingerDiscoveryError\n" + error_text - self.assertEquals(expected_text, - str(test_run.exception)) + self.assertEqual(expected_text, + str(test_run.exception)) def test_SingerSyncError_prints_correctly(self): error_text = "An error occured" @@ -44,8 +44,8 @@ def 
test_SingerSyncError_prints_correctly(self): raise SingerSyncError(error_text) expected_text = "SingerSyncError\n" + error_text - self.assertEquals(expected_text, - str(test_run.exception)) + self.assertEqual(expected_text, + str(test_run.exception)) def test_SingerRetryableRequestError_prints_correctly(self): error_text = "An error occured" @@ -54,8 +54,8 @@ def test_SingerRetryableRequestError_prints_correctly(self): raise SingerRetryableRequestError(error_text) expected_text = "SingerRetryableRequestError\n" + error_text - self.assertEquals(expected_text, - str(test_run.exception)) + self.assertEqual(expected_text, + str(test_run.exception)) def test_SingerError_prints_multiple_lines_correctly(self): error_text = "\n".join(["Line 1", "Line 2", "Line 3"]) @@ -64,5 +64,5 @@ def test_SingerError_prints_multiple_lines_correctly(self): raise SingerError(error_text) expected_text = "SingerError\n" + error_text - self.assertEquals(expected_text, - str(test_run.exception)) + self.assertEqual(expected_text, + str(test_run.exception)) diff --git a/tests/test_schema.py b/tests/test_schema.py index fa28bac..5682755 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -44,38 +44,38 @@ class TestSchema(unittest.TestCase): additionalProperties=True) def test_string_to_dict(self): - self.assertEquals(self.string_dict, self.string_obj.to_dict()) + self.assertEqual(self.string_dict, self.string_obj.to_dict()) def test_integer_to_dict(self): - self.assertEquals(self.integer_dict, self.integer_obj.to_dict()) + self.assertEqual(self.integer_dict, self.integer_obj.to_dict()) def test_array_to_dict(self): - self.assertEquals(self.array_dict, self.array_obj.to_dict()) + self.assertEqual(self.array_dict, self.array_obj.to_dict()) def test_object_to_dict(self): - self.assertEquals(self.object_dict, self.object_obj.to_dict()) + self.assertEqual(self.object_dict, self.object_obj.to_dict()) def test_string_from_dict(self): - self.assertEquals(self.string_obj, 
Schema.from_dict(self.string_dict)) + self.assertEqual(self.string_obj, Schema.from_dict(self.string_dict)) def test_integer_from_dict(self): - self.assertEquals(self.integer_obj, Schema.from_dict(self.integer_dict)) + self.assertEqual(self.integer_obj, Schema.from_dict(self.integer_dict)) def test_array_from_dict(self): - self.assertEquals(self.array_obj, Schema.from_dict(self.array_dict)) + self.assertEqual(self.array_obj, Schema.from_dict(self.array_dict)) def test_object_from_dict(self): - self.assertEquals(self.object_obj, Schema.from_dict(self.object_dict)) + self.assertEqual(self.object_obj, Schema.from_dict(self.object_dict)) def test_repr_atomic(self): - self.assertEquals(self.string_obj, eval(repr(self.string_obj))) + self.assertEqual(self.string_obj, eval(repr(self.string_obj))) def test_repr_recursive(self): - self.assertEquals(self.object_obj, eval(repr(self.object_obj))) + self.assertEqual(self.object_obj, eval(repr(self.object_obj))) def test_object_from_dict_with_defaults(self): schema = Schema.from_dict(self.object_dict, inclusion='automatic') - self.assertEquals('whatever', schema.inclusion, - msg='The schema value should override the default') - self.assertEquals('automatic', schema.properties['a_string'].inclusion) - self.assertEquals('automatic', schema.properties['an_array'].items.inclusion) + self.assertEqual('whatever', schema.inclusion, + msg='The schema value should override the default') + self.assertEqual('automatic', schema.properties['a_string'].inclusion) + self.assertEqual('automatic', schema.properties['an_array'].items.inclusion) diff --git a/tests/test_schema_generation.py b/tests/test_schema_generation.py new file mode 100644 index 0000000..5e00738 --- /dev/null +++ b/tests/test_schema_generation.py @@ -0,0 +1,76 @@ +import unittest +from singer.schema_generation import generate_schema + +class TestSchemaGeneration(unittest.TestCase): + def test_simple_schema(self): + records = [{'a': 1, 'b': 'two', 'c': True, 'dt': 
'2000-01-01T00:11:22Z'}] + expected_schema = { + 'type': ['null', 'object'], + 'properties': { + 'a': {'type': ['null', 'integer']}, + 'b': {'type': ['null', 'string']}, + 'c': {'type': ['null', 'boolean']}, + 'dt': {'type': ['null', 'string'], 'format': 'date-time'} + } + } + self.assertEqual(expected_schema, generate_schema(records)) + + def test_mix_n_match_records_schema(self): + records = [ + {'a': 1, 'b': 'b'}, + {'a': 'two', 'c': 7, 'd': [1, 'two']}, + {'a': True, 'c': 7.7, 'd': {'one': 1, 'two': 'two'}} + ] + expected_schema = { + 'type': ['null', 'object'], + 'properties': { + 'a': {'type': {'null', 'integer', 'string', 'boolean'}}, + 'b': {'type': ['null', 'string']}, + 'c': {'type': {'null', 'integer', 'string'}, 'format': 'singer.decimal'}, + 'd': { + 'type': {'null', 'array', 'object'}, + 'items': {'type': {'null', 'integer', 'string'}}, + 'properties': {'one': {'type': ['null', 'integer']}, + 'two': {'type': ['null', 'string']}} + + } + } + } + actual_schema = generate_schema(records) + actual_schema['properties']['a']['type'] = set(actual_schema['properties']['a']['type']) + actual_schema['properties']['c']['type'] = set(actual_schema['properties']['c']['type']) + actual_schema['properties']['d']['type'] = set(actual_schema['properties']['d']['type']) + actual_schema['properties']['d']['items']['type'] = set(actual_schema['properties']['d']['items']['type']) + self.assertEqual(expected_schema, actual_schema) + + def test_nested_structue_schema(self): + records = [{'a': {'b': {'c': [{'d': 7}]}, 'e': [[1, 2, 3]]}}] + expected_schema = { + 'type': ['null', 'object'], + 'properties': { + 'a': { + 'type': ['null', 'object'], + 'properties': { + 'b': { + 'type': ['null', 'object'], + 'properties': { + 'c': { + 'type': ['null', 'array'], + 'items': { + 'type': ['null', 'object'], + 'properties': {'d': {'type': ['null', 'integer']}} + } + } + } + }, + 'e': { + 'type': ['null', 'array'], + 'items': { + 'type': ['null', 'array'], + 'items': {'type': ['null', 
'integer']}} + } + } + } + } + } + self.assertEqual(expected_schema, generate_schema(records)) diff --git a/tests/test_transform.py b/tests/test_transform.py index 959c4b8..96e0c3b 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -264,11 +264,11 @@ def test_decimal_types_transform(self): nan = {'percentage': decimal.Decimal('NaN')} snan = {'percentage': decimal.Decimal('sNaN')} - self.assertEquals(inf, transform(inf, schema)) - self.assertEquals(negative_inf, transform(negative_inf, schema)) - self.assertEquals({'percentage': '1.4142135623730951'}, transform(root2, schema)) - self.assertEquals({'percentage': 'NaN'}, transform(nan, schema)) - self.assertEquals({'percentage': 'NaN'}, transform(snan, schema)) + self.assertEqual(inf, transform(inf, schema)) + self.assertEqual(negative_inf, transform(negative_inf, schema)) + self.assertEqual({'percentage': '1.4142135623730951'}, transform(root2, schema)) + self.assertEqual({'percentage': 'NaN'}, transform(nan, schema)) + self.assertEqual({'percentage': 'NaN'}, transform(snan, schema)) str1 = {'percentage':'0.1'} @@ -276,11 +276,11 @@ def test_decimal_types_transform(self): str3 = {'percentage': '1E+13'} str4 = {'percentage': '100'} str5 = {'percentage': '-100'} - self.assertEquals(str1, transform(str1, schema)) - self.assertEquals({'percentage': '1E-13'}, transform(str2, schema)) - self.assertEquals({'percentage': '1E+13'}, transform(str3, schema)) - self.assertEquals({'percentage': '100'}, transform(str4, schema)) - self.assertEquals({'percentage': '-100'}, transform(str5, schema)) + self.assertEqual(str1, transform(str1, schema)) + self.assertEqual({'percentage': '1E-13'}, transform(str2, schema)) + self.assertEqual({'percentage': '1E+13'}, transform(str3, schema)) + self.assertEqual({'percentage': '100'}, transform(str4, schema)) + self.assertEqual({'percentage': '-100'}, transform(str5, schema)) float1 = {'percentage': 12.0000000000000000000000000001234556} float2 = {'percentage': 0.0123} @@ 
-288,28 +288,28 @@ def test_decimal_types_transform(self): float4 = {'percentage': -100.0123} float5 = {'percentage': 0.000001} float6 = {'percentage': 0.0000001} - self.assertEquals({'percentage':'12.0'}, transform(float1, schema)) - self.assertEquals({'percentage':'0.0123'}, transform(float2, schema)) - self.assertEquals({'percentage':'100.0123'}, transform(float3, schema)) - self.assertEquals({'percentage':'-100.0123'}, transform(float4, schema)) - self.assertEquals({'percentage':'0.000001'}, transform(float5, schema)) - self.assertEquals({'percentage':'1E-7'}, transform(float6, schema)) + self.assertEqual({'percentage':'12.0'}, transform(float1, schema)) + self.assertEqual({'percentage':'0.0123'}, transform(float2, schema)) + self.assertEqual({'percentage':'100.0123'}, transform(float3, schema)) + self.assertEqual({'percentage':'-100.0123'}, transform(float4, schema)) + self.assertEqual({'percentage':'0.000001'}, transform(float5, schema)) + self.assertEqual({'percentage':'1E-7'}, transform(float6, schema)) int1 = {'percentage': 123} int2 = {'percentage': 0} int3 = {'percentage': -1000} - self.assertEquals({'percentage':'123'}, transform(int1, schema)) - self.assertEquals({'percentage':'0'}, transform(int2, schema)) - self.assertEquals({'percentage':'-1000'}, transform(int3, schema)) + self.assertEqual({'percentage':'123'}, transform(int1, schema)) + self.assertEqual({'percentage':'0'}, transform(int2, schema)) + self.assertEqual({'percentage':'-1000'}, transform(int3, schema)) dec1 = {'percentage': decimal.Decimal('1.1010101')} dec2 = {'percentage': decimal.Decimal('.111111111111111111111111')} dec3 = {'percentage': decimal.Decimal('-.111111111111111111111111')} dec4 = {'percentage': decimal.Decimal('100')} - self.assertEquals({'percentage':'1.1010101'}, transform(dec1, schema)) - self.assertEquals({'percentage':'0.111111111111111111111111'}, transform(dec2, schema)) - self.assertEquals({'percentage':'-0.111111111111111111111111'}, transform(dec3, schema)) - 
self.assertEquals({'percentage':'100'}, transform(dec4, schema)) + self.assertEqual({'percentage':'1.1010101'}, transform(dec1, schema)) + self.assertEqual({'percentage':'0.111111111111111111111111'}, transform(dec2, schema)) + self.assertEqual({'percentage':'-0.111111111111111111111111'}, transform(dec3, schema)) + self.assertEqual({'percentage':'100'}, transform(dec4, schema)) bad1 = {'percentage': 'fsdkjl'} with self.assertRaises(SchemaMismatch): @@ -317,7 +317,7 @@ def test_decimal_types_transform(self): badnull = {'percentage': None} with self.assertRaises(SchemaMismatch): - self.assertEquals({'percentage':None}, transform(badnull, schema)) + self.assertEqual({'percentage':None}, transform(badnull, schema)) class TestTransformsWithMetadata(unittest.TestCase): From 3cbe7ca68596b51e38564b2193873765afb4bd86 Mon Sep 17 00:00:00 2001 From: Ben Allred Date: Thu, 2 Oct 2025 12:37:23 -0600 Subject: [PATCH 09/13] Sac 28668 fix transform and schema (#177) * handle empty arrays and fields that could be either formatted or nested * remove ipdb * bump version and add changelog entry * handle string parsing similar to existin tap-s3 logic * fix syntax error * fix bad tests --- CHANGELOG.md | 6 ++++++ setup.py | 2 +- singer/schema_generation.py | 30 ++++++++++++++++++++++-------- singer/transform.py | 4 ++-- tests/test_transform.py | 4 ++-- 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59b0bd4..1080b69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## 6.2.1 + * Fixes json schema generation to not treat numbers as dates + * Fixes json schema generation to handle empty arrays + * Fixes record transformation to handle fields that could be either formatted string or nested data structure + * [#177](https://github.com/singer-io/singer-python/pull/177) + ## 6.2.0 * Adds json schema generation [#175](https://github.com/singer-io/singer-python/pull/175) diff --git a/setup.py b/setup.py index 
4435246..c34951f 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='6.2.0', + version='6.2.1', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], diff --git a/singer/schema_generation.py b/singer/schema_generation.py index 3d11f74..1b73388 100644 --- a/singer/schema_generation.py +++ b/singer/schema_generation.py @@ -18,19 +18,33 @@ def add_observations(acc, path, data): for key in data: add_observations(acc, path + ["object", key], data[key]) elif isinstance(data, list): + if len(data) == 0: + add_observations(acc, path + ["array"], None) for item in data: add_observations(acc, path + ["array"], item) elif isinstance(data, str): - # If the string parses as a date, add an observation that its a date try: - data = dateutil.parser.parse(data) - except (dateutil.parser.ParserError, OverflowError): - data = None - if data: + # If the string parses as a int, add an observation that it's a integer + int(data) + add_observation(acc, path + ["integer"]) + return acc + except (ValueError, TypeError): + pass + try: + # If the string parses as a float, add an observation that it's a number + float(data) + add_observation(acc, path + ["number"]) + return acc + except (ValueError, TypeError): + pass + try: + # If the string parses as a date, add an observation that it's a date + dateutil.parser.parse(data) add_observation(acc, path + ["date"]) - else: - add_observation(acc, path + ["string"]) - + return acc + except (dateutil.parser.ParserError, OverflowError): + pass + add_observation(acc, path + ["string"]) elif isinstance(data, bool): add_observation(acc, path + ["boolean"]) elif isinstance(data, int): diff --git a/singer/transform.py b/singer/transform.py index 69f812a..f125fd5 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -266,13 +266,13 @@ def _transform(self, data, typ, schema, path): else: return False, None - elif 
schema.get("format") == "date-time": + elif typ == "string" and schema.get("format") == "date-time": data = self._transform_datetime(data) if data is None: return False, None return True, data - elif schema.get("format") == "singer.decimal": + elif typ == "string" and schema.get("format") == "singer.decimal": if data is None: return False, None diff --git a/tests/test_transform.py b/tests/test_transform.py index 96e0c3b..308ac4e 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -25,7 +25,7 @@ def test_nested_transform(self): def test_multi_type_object_transform(self): schema = {"type": ["null", "object", "string"], - "properties": {"whatever": {"type": "date-time", + "properties": {"whatever": {"type": "string", "format": "date-time"}}} data = {"whatever": "2017-01-01"} expected = {"whatever": "2017-01-01T00:00:00.000000Z"} @@ -36,7 +36,7 @@ def test_multi_type_object_transform(self): def test_multi_type_array_transform(self): schema = {"type": ["null", "array", "integer"], - "items": {"type": "date-time", "format": "date-time"}} + "items": {"type": "string", "format": "date-time"}} data = ["2017-01-01"] expected = ["2017-01-01T00:00:00.000000Z"] self.assertEqual(expected, transform(data, schema)) From bb412f4795b85b8f932ad3fb958c4fbc17667581 Mon Sep 17 00:00:00 2001 From: Ben Allred Date: Wed, 8 Oct 2025 14:59:17 -0600 Subject: [PATCH 10/13] SAC-28668: update schema generation v6 (#180) * use `anyOf` when multiple types are found * fix test * Update schema generation and bump version for v6 deploy Co-authored-by: Bryant Gray Co-authored-by: Andres Pineda * Grab error list changes from v5 Co-authored-by: Bryant Gray Co-authored-by: Andres Pineda * Fix linting error Co-authored-by: Bryant Gray --------- Co-authored-by: Bryant Gray Co-authored-by: Andres Pineda --- CHANGELOG.md | 6 ++++++ setup.py | 2 +- singer/schema_generation.py | 31 +++++++++++++++---------------- singer/transform.py | 2 ++ tests/test_schema_generation.py | 30 
+++++++++++++----------------- 5 files changed, 37 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1080b69..18a4ea5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## 6.2.2 + * Updates json schema generation to not emit dates + * Handle multiple schemas with anyOf and emit them in a specific order + * Do not emit error messages when checking multiple schemas and a subsequent schema passes + * [#179](https://github.com/singer-io/singer-python/pull/179) + ## 6.2.1 * Fixes json schema generation to not treat numbers as dates * Fixes json schema generation to handle empty arrays diff --git a/setup.py b/setup.py index c34951f..c7ea106 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='6.2.1', + version='6.2.2', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], diff --git a/singer/schema_generation.py b/singer/schema_generation.py index 1b73388..003a16f 100644 --- a/singer/schema_generation.py +++ b/singer/schema_generation.py @@ -1,6 +1,3 @@ -import dateutil.parser - - def add_observation(acc, path): node = acc @@ -37,13 +34,6 @@ def add_observations(acc, path, data): return acc except (ValueError, TypeError): pass - try: - # If the string parses as a date, add an observation that it's a date - dateutil.parser.parse(data) - add_observation(acc, path + ["date"]) - return acc - except (dateutil.parser.ParserError, OverflowError): - pass add_observation(acc, path + ["string"]) elif isinstance(data, bool): add_observation(acc, path + ["boolean"]) @@ -59,9 +49,13 @@ def add_observations(acc, path, data): return acc def to_json_schema(obs): - result = {'type': ['null']} + types = [] + # add schema types in a specific order to anyOf list + for key in ['array', 'object', 'number', 'integer', 'boolean', 'string', 'null']: + if key not in obs: + continue - for key in obs: + result = {'type': 
['null']} if key == 'object': result['type'] += ['object'] @@ -74,9 +68,6 @@ def to_json_schema(obs): result['type'] += ['array'] result['items'] = to_json_schema(obs['array']) - elif key == 'date': - result['type'] += ['string'] - result['format'] = 'date-time' elif key == 'string': result['type'] += ['string'] @@ -97,7 +88,15 @@ def to_json_schema(obs): else: raise Exception("Unexpected data type " + key) - return result + types.append(result) + + if len(types) == 0: + return {'type': ['null', 'string']} + + if len(types) == 1: + return types[0] + + return {'anyOf': types} def generate_schema(records): obs = {} diff --git a/singer/transform.py b/singer/transform.py index f125fd5..3a9fc96 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -185,6 +185,8 @@ def _transform_anyof(self, data, schema, path): success, transformed_data = self.transform_recur(data, subschema, path) if success: return success, transformed_data + else: + self.errors.pop() else: # pylint: disable=useless-else-on-loop # exhaused all schemas and didn't return, so we failed :-( self.errors.append(Error(path, data, schema, logging_level=LOGGER.level)) diff --git a/tests/test_schema_generation.py b/tests/test_schema_generation.py index 5e00738..610ac20 100644 --- a/tests/test_schema_generation.py +++ b/tests/test_schema_generation.py @@ -10,7 +10,7 @@ def test_simple_schema(self): 'a': {'type': ['null', 'integer']}, 'b': {'type': ['null', 'string']}, 'c': {'type': ['null', 'boolean']}, - 'dt': {'type': ['null', 'string'], 'format': 'date-time'} + 'dt': {'type': ['null', 'string']} } } self.assertEqual(expected_schema, generate_schema(records)) @@ -23,24 +23,20 @@ def test_mix_n_match_records_schema(self): ] expected_schema = { 'type': ['null', 'object'], - 'properties': { - 'a': {'type': {'null', 'integer', 'string', 'boolean'}}, - 'b': {'type': ['null', 'string']}, - 'c': {'type': {'null', 'integer', 'string'}, 'format': 'singer.decimal'}, - 'd': { - 'type': {'null', 'array', 
'object'}, - 'items': {'type': {'null', 'integer', 'string'}}, - 'properties': {'one': {'type': ['null', 'integer']}, - 'two': {'type': ['null', 'string']}} - - } - } + 'properties': {'a': {'anyOf': [{'type': ['null', 'integer']}, + {'type': ['null', 'boolean']}, + {'type': ['null', 'string']}]}, + 'b': {'type': ['null', 'string']}, + 'c': {'anyOf': [{'type': ['null', 'string'], 'format': 'singer.decimal'}, + {'type': ['null', 'integer']}]}, + 'd': {'anyOf': [{'type': ['null', 'array'], + 'items': {'anyOf': [{'type': ['null', 'integer']}, + {'type': ['null', 'string']}]}}, + {'type': ['null', 'object'], + 'properties': {'one': {'type': ['null', 'integer']}, + 'two': {'type': ['null', 'string']}}}]}} } actual_schema = generate_schema(records) - actual_schema['properties']['a']['type'] = set(actual_schema['properties']['a']['type']) - actual_schema['properties']['c']['type'] = set(actual_schema['properties']['c']['type']) - actual_schema['properties']['d']['type'] = set(actual_schema['properties']['d']['type']) - actual_schema['properties']['d']['items']['type'] = set(actual_schema['properties']['d']['items']['type']) self.assertEqual(expected_schema, actual_schema) def test_nested_structue_schema(self): From 9145ecb84f65c279a13e9eea5b2b704cb967e2c1 Mon Sep 17 00:00:00 2001 From: Ben Allred Date: Wed, 15 Oct 2025 13:58:57 -0600 Subject: [PATCH 11/13] Default to string for schema generation (#182) Co-authored-by: Dylan Sprayberry Co-authored-by: Bryant Gray Co-authored-by: Andres Pineda --- CHANGELOG.md | 3 +++ setup.py | 2 +- singer/schema_generation.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18a4ea5..43ce58e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 6.2.3 + * Default type for non-standard data types is string [#182](https://github.com/singer-io/singer-python/pull/182) + ## 6.2.2 * Updates json schema generation to not emit dates * Handle multiple schemas with anyOf and 
emit them in a specific order diff --git a/setup.py b/setup.py index c7ea106..9940901 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='6.2.2', + version='6.2.3', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], diff --git a/singer/schema_generation.py b/singer/schema_generation.py index 003a16f..177d659 100644 --- a/singer/schema_generation.py +++ b/singer/schema_generation.py @@ -44,7 +44,7 @@ def add_observations(acc, path, data): elif data is None: add_observation(acc, path + ["null"]) else: - raise Exception("Unexpected value " + repr(data) + " at path " + repr(path)) + add_observation(acc, path + ["string"]) return acc From ccf2266b3dce333d9865d64878ebf4e0855833d8 Mon Sep 17 00:00:00 2001 From: Sourabh Gandhi <105213416+sgandhi1311@users.noreply.github.com> Date: Wed, 22 Oct 2025 19:08:01 +0530 Subject: [PATCH 12/13] Support allow_nan in message JSON output (#183) * allow nan values to replicate * update setup and changelog * make pylint happy * add test cases for allow nan --- CHANGELOG.md | 3 +++ setup.py | 2 +- singer/messages.py | 13 ++++++--- tests/test_transform.py | 60 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 72 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43ce58e..a878fa9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 6.3.0 + * Support allow_nan in message JSON output [#183](https://github.com/singer-io/singer-python/pull/183) + ## 6.2.3 * Default type for non-standard data types is string [#182](https://github.com/singer-io/singer-python/pull/182) diff --git a/setup.py b/setup.py index 9940901..a25f30b 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='6.2.3', + version='6.3.0', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 
:: Only'], diff --git a/singer/messages.py b/singer/messages.py index de6e076..941670c 100644 --- a/singer/messages.py +++ b/singer/messages.py @@ -218,12 +218,17 @@ def parse_message(msg): return None -def format_message(message, ensure_ascii=True): - return json.dumps(message.asdict(), use_decimal=True, ensure_ascii=ensure_ascii) +def format_message(message, ensure_ascii=True, allow_nan=False): + return json.dumps( + message.asdict(), + use_decimal=True, + ensure_ascii=ensure_ascii, + allow_nan=allow_nan + ) -def write_message(message, ensure_ascii=True): - sys.stdout.write(format_message(message, ensure_ascii=ensure_ascii) + '\n') +def write_message(message, ensure_ascii=True, allow_nan=False): + sys.stdout.write(format_message(message, ensure_ascii=ensure_ascii, allow_nan=allow_nan) + '\n') sys.stdout.flush() diff --git a/tests/test_transform.py b/tests/test_transform.py index 308ac4e..b398e93 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -1,9 +1,12 @@ +import io +import sys import unittest import decimal +import simplejson as json +import singer.messages as messages from singer import transform from singer.transform import * - class TestTransform(unittest.TestCase): def test_integer_transform(self): schema = {'type': 'integer'} @@ -486,3 +489,58 @@ def test_pattern_properties_match_multiple(self): dict_value = {"name": "chicken", "unit_cost": 1.45, "SKU": '123456'} expected = dict(dict_value) self.assertEqual(expected, transform(dict_value, schema)) + +class DummyMessage: + """A dummy message object with an asdict() method.""" + def __init__(self, value): + self.value = value + + def asdict(self): + return {"value": self.value} + + +class TestAllowNan(unittest.TestCase): + """Unit tests for allow_nan support in singer.messages.""" + + def test_format_message_allow_nan_true(self): + """Should serialize NaN successfully when allow_nan=True.""" + msg = DummyMessage(float("nan")) + result = messages.format_message(msg, allow_nan=True) + + # 
The output JSON should contain NaN literal (not quoted) + self.assertIn("NaN", result) + + # Replace NaN with null to make it valid JSON for parsing check + json.loads(result.replace("NaN", "null")) + + def test_format_message_allow_nan_false(self): + """Should raise ValueError when allow_nan=False and value is NaN.""" + msg = DummyMessage(float("nan")) + with self.assertRaises(ValueError): + messages.format_message(msg, allow_nan=False) + + def test_write_message_allow_nan_true(self): + """Should write to stdout successfully when allow_nan=True.""" + msg = DummyMessage(float("nan")) + fake_stdout = io.StringIO() + original_stdout = sys.stdout + sys.stdout = fake_stdout + try: + messages.write_message(msg, allow_nan=True) + output = fake_stdout.getvalue() + self.assertIn("NaN", output) + self.assertTrue(output.endswith("\n")) + finally: + sys.stdout = original_stdout + + def test_write_message_allow_nan_false(self): + """Should raise ValueError when allow_nan=False and message has NaN.""" + msg = DummyMessage(float("nan")) + fake_stdout = io.StringIO() + original_stdout = sys.stdout + sys.stdout = fake_stdout + try: + with self.assertRaises(ValueError): + messages.write_message(msg, allow_nan=False) + finally: + sys.stdout = original_stdout From 6ae25be46879066d95d881de857c9a15226b0b1a Mon Sep 17 00:00:00 2001 From: Ben Allred Date: Mon, 2 Feb 2026 11:39:23 -0700 Subject: [PATCH 13/13] SAC-29666: Update clear_offset to remove offset key from bookmark (#185) * Update clear_offset to remove offset key from bookmark ----------------------------- Co-authored-by: Ben Allred * bump to version 6.4.0 ----------------------------- Co-authored-by: Ben Allred --------- Co-authored-by: Leslie VanDeMark --- CHANGELOG.md | 3 +++ setup.py | 2 +- singer/bookmarks.py | 4 +--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a878fa9..5b5fbd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 6.4.0 + * Update 
clear_offset to remove offset key from bookmark [#185](https://github.com/singer-io/singer-python/pull/185) + ## 6.3.0 * Support allow_nan in message JSON output [#183](https://github.com/singer-io/singer-python/pull/183) diff --git a/setup.py b/setup.py index a25f30b..5b8d135 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import subprocess setup(name="singer-python", - version='6.3.0', + version='6.4.0', description="Singer.io utility library", author="Stitch", classifiers=['Programming Language :: Python :: 3 :: Only'], diff --git a/singer/bookmarks.py b/singer/bookmarks.py index fc6d7ca..40aa927 100644 --- a/singer/bookmarks.py +++ b/singer/bookmarks.py @@ -31,9 +31,7 @@ def set_offset(state, tap_stream_id, offset_key, offset_value): return state def clear_offset(state, tap_stream_id): - state = ensure_bookmark_path(state, ['bookmarks', tap_stream_id, "offset"]) - state['bookmarks'][tap_stream_id]["offset"] = {} - return state + return clear_bookmark(state, tap_stream_id, "offset") def get_offset(state, tap_stream_id, default=None): return state.get('bookmarks', {}).get(tap_stream_id, {}).get("offset", default)