From ffe8d8ade552b342a0fa160bf8ebdb2781efb8e6 Mon Sep 17 00:00:00 2001 From: Thomas Honeyman Date: Sun, 25 Jul 2021 12:36:06 -0400 Subject: [PATCH 01/12] Replace Slack links with Discord (#117) --- .github/ISSUE_TEMPLATE/config.yml | 8 ++++---- README.md | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index c47a263..d63c9d7 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -2,7 +2,7 @@ blank_issues_enabled: false contact_links: - name: PureScript Discourse url: https://discourse.purescript.org/ - about: Ask and answer questions here. - - name: Functional Programming Slack - url: https://functionalprogramming.slack.com - about: For casual chat and questions (use https://fpchat-invite.herokuapp.com to join). + about: Ask and answer questions on the PureScript discussion forum. + - name: PureScript Discord + url: https://discord.com/invite/sMqwYUbvz6/ + about: Ask and answer questions on the PureScript chat. diff --git a/README.md b/README.md index 947bb85..c21100d 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ The quick start hasn't been written yet (contributions are welcome!). The quick If you get stuck, there are several ways to get help: - [Open an issue](https://github.com/purescript-contrib/purescript-parsing/issues) if you have encountered a bug or problem. -- [Search or start a thread on the PureScript Discourse](https://discourse.purescript.org) if you have general questions. You can also ask questions in the `#purescript` and `#purescript-beginners` channels on the [Functional Programming Slack](https://functionalprogramming.slack.com) ([invite link](https://fpchat-invite.herokuapp.com/)). +- Ask general questions on the [PureScript Discourse](https://discourse.purescript.org) forum or the [PureScript Discord](https://discord.com/invite/sMqwYUbvz6) chat. ## Contributing From bc03b2984af7c55e417c0d3ef71f7a16830ad309 Mon Sep 17 00:00:00 2001 From: Thomas Honeyman Date: Wed, 28 Jul 2021 11:20:39 -0400 Subject: [PATCH 02/12] Update to the latest package set --- packages.dhall | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages.dhall b/packages.dhall index 2a4e0b9..7a6905b 100644 --- a/packages.dhall +++ b/packages.dhall @@ -1,4 +1,4 @@ let upstream = - https://github.com/purescript/package-sets/releases/download/psc-0.14.0/packages.dhall sha256:710b53c085a18aa1263474659daa0ae15b7a4f453158c4f60ab448a6b3ed494e + https://github.com/purescript/package-sets/releases/download/psc-0.14.3-20210722/packages.dhall sha256:1ceb43aa59436bf5601bac45f6f3781c4e1f0e4c2b8458105b018e5ed8c30f8c in upstream From 1d0aaa17313d50bf01bb6a9d7da6eead11460692 Mon Sep 17 00:00:00 2001 From: Thomas Honeyman Date: Wed, 4 Aug 2021 13:01:23 -0400 Subject: [PATCH 03/12] Replace Discord links with PS chat --- .github/ISSUE_TEMPLATE/config.yml | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index d63c9d7..8d7661e 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -4,5 +4,5 @@ contact_links: url: https://discourse.purescript.org/ about: Ask and answer questions on the PureScript discussion forum. - name: PureScript Discord - url: https://discord.com/invite/sMqwYUbvz6/ + url: https://purescript.org/chat about: Ask and answer questions on the PureScript chat. diff --git a/README.md b/README.md index c21100d..d65396b 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ The quick start hasn't been written yet (contributions are welcome!). The quick If you get stuck, there are several ways to get help: - [Open an issue](https://github.com/purescript-contrib/purescript-parsing/issues) if you have encountered a bug or problem. -- Ask general questions on the [PureScript Discourse](https://discourse.purescript.org) forum or the [PureScript Discord](https://discord.com/invite/sMqwYUbvz6) chat. +- Ask general questions on the [PureScript Discourse](https://discourse.purescript.org) forum or the [PureScript Discord](https://purescript.org/chat) chat. ## Contributing From d33aea3ca5ccbd85f0a3cd569dea26a398bf501a Mon Sep 17 00:00:00 2001 From: Thomas Honeyman Date: Tue, 28 Sep 2021 19:41:37 +0200 Subject: [PATCH 04/12] Assign jamesdbrock as maintainer --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index d65396b..458c977 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ [![CI](https://github.com/purescript-contrib/purescript-parsing/workflows/CI/badge.svg?branch=main)](https://github.com/purescript-contrib/purescript-parsing/actions?query=workflow%3ACI+branch%3Amain) [![Release](https://img.shields.io/github/release/purescript-contrib/purescript-parsing.svg)](https://github.com/purescript-contrib/purescript-parsing/releases) [![Pursuit](https://pursuit.purescript.org/packages/purescript-parsing/badge)](https://pursuit.purescript.org/packages/purescript-parsing) -[![Maintainer: thomashoneyman](https://img.shields.io/badge/maintainer-thomashoneyman-teal.svg)](https://github.com/thomashoneyman) -[![Maintainer: garyb](https://img.shields.io/badge/maintainer-garyb-teal.svg)](https://github.com/garyb) +[![Maintainer: jamesdbrock](https://img.shields.io/badge/maintainer-jamesdbrock-teal.svg)](https://github.com/jamesdbrock) A parser combinator library based on Haskell's [Parsec](https://hackage.haskell.org/package/parsec). From 50d3bc80494784153b2a5b830aabc44682dbc054 Mon Sep 17 00:00:00 2001 From: Robert Porter Date: Wed, 6 Oct 2021 10:40:52 +0900 Subject: [PATCH 05/12] Added maintainer badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 458c977..2dcf730 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ [![Release](https://img.shields.io/github/release/purescript-contrib/purescript-parsing.svg)](https://github.com/purescript-contrib/purescript-parsing/releases) [![Pursuit](https://pursuit.purescript.org/packages/purescript-parsing/badge)](https://pursuit.purescript.org/packages/purescript-parsing) [![Maintainer: jamesdbrock](https://img.shields.io/badge/maintainer-jamesdbrock-teal.svg)](https://github.com/jamesdbrock) +[![Maintainer: robertdp](https://img.shields.io/badge/maintainer-robertdp-teal.svg)](https://github.com/robertdp) A parser combinator library based on Haskell's [Parsec](https://hackage.haskell.org/package/parsec). From a1413b1607b8cc4aee57e99bc6818f955f533e85 Mon Sep 17 00:00:00 2001 From: James Brock Date: Fri, 24 Sep 2021 16:31:35 +0900 Subject: [PATCH 06/12] Unicode correctness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correctly handle UTF-16 surrogate pairs in `String`s. We keep all of the API, but we change the primitive parsers so that instead of succeeding and incorrectly returning half of a surrogate pair, they will fail. All prior tests pass with no modifications. Add a few new tests. Non-breaking changes ==================== Add primitive parsers `anyCodePoint` and `satisfyCodePoint` for parsing `CodePoint`s. Add the `match` combinator. Move `updatePosString` to the `Text.Parsing.Parser.String` module and don't export it. Split dev dependencies into spago-dev.dhall. Add benchmark suite. Add astral UTF-16 test. Breaking changes ================ Change the definition of `whiteSpace` and `skipSpaces` to `Data.CodePoint.Unicode.isSpace`. To make this library handle Unicode correctly, it is necessary to either alter the `StringLike` class or delete it. We decided to delete it. The `String` module will now operate only on inputs of the concrete `String` type. `StringLike` has no laws, and during the five years of its life, no-one on Github has ever written another instance of `StringLike`. https://github.com/search?l=&q=StringLike+language%3APureScript&type=code The last time someone tried to alter `StringLike`, this is what happened: https://github.com/purescript-contrib/purescript-parsing/pull/62 Breaking changes which won’t be caught by the compiler ====================================================== Fundamentally, we change the way we consume the next input character from `Data.String.CodeUnits.uncons` to `Data.String.CodePoints.uncons`. `anyChar` will no longer always succeed. It will only succeed on a Basic Multilingual Plane character. The new parser `anyCodePoint` will always succeed. We are not quite “making the default `CodePoint`”, as was discussed in https://github.com/purescript-contrib/purescript-parsing/pull/76#issuecomment-398403864 . Rather we are keeping most of the current API and making it work properly with astral Unicode. We keep the `Char` parsers for backward compatibility. We also keep the `Char` parsers for ergonomic reasons. For example the parser `char :: forall s m. Monad m => Char -> ParserT s m Char`. This parser is usually called with a literal like `char 'a'`. It would be annoying to call this parser with `char (codePointFromChar 'a')`. Benchmarks ========== For Unicode correctness, we're now consuming characters with `Data.String.CodePoints.uncons` instead of `Data.String.CodeUnits.uncons`. If that were going to effect performance, then the effect would show up in the `runParser parse23` benchmark, but it doesn’t. Before ------ ``` runParser parse23 mean = 43.36 ms stddev = 6.75 ms min = 41.12 ms max = 124.65 ms runParser parseSkidoo mean = 22.53 ms stddev = 3.86 ms min = 21.40 ms max = 61.76 ms ``` After ----- ``` runParser parse23 mean = 42.90 ms stddev = 6.01 ms min = 40.97 ms max = 115.74 ms runParser parseSkidoo mean = 22.03 ms stddev = 2.79 ms min = 20.78 ms max = 53.34 ms ``` --- .github/workflows/ci.yml | 6 +- CHANGELOG.md | 22 ++- CONTRIBUTING.md | 13 ++ bench/Main.purs | 134 +++++++++++++++++++ spago-dev.dhall | 22 +++ spago.dhall | 7 +- src/Text/Parsing/Parser/Language.purs | 2 +- src/Text/Parsing/Parser/Pos.purs | 14 +- src/Text/Parsing/Parser/String.purs | 184 ++++++++++++++++++-------- src/Text/Parsing/Parser/Token.purs | 16 +-- test/Main.purs | 28 +++- 11 files changed, 354 insertions(+), 94 deletions(-) create mode 100644 bench/Main.purs create mode 100644 spago-dev.dhall diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cd71fe8..b8d697e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,10 +25,10 @@ jobs: output - name: Install dependencies - run: spago install + run: spago -x spago-dev.dhall install - name: Build source - run: spago build --no-install --purs-args '--censor-lib --strict --censor-codes='UserDefinedWarning'' + run: spago -x spago-dev.dhall build --no-install --purs-args '--censor-lib --strict --censor-codes='UserDefinedWarning'' - name: Run tests - run: spago test --no-install + run: spago -x spago-dev.dhall test --no-install diff --git a/CHANGELOG.md b/CHANGELOG.md index 709b3ed..5ecc1ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,28 @@ Notable changes to this project are documented in this file. The format is based Breaking changes: +- `anyChar` will no longer always succeed. It will only succeed on a Basic + Multilingual Plane character. The new parser `anyCodePoint` will always + succeed. (#119 by @jamesdbrock) +- Delete the `StringLike` typeclass. Users must delete all `StringLike` + constraints. (#119 by @jamesdbrock) +- Move `updatePosString` to the `String` module and don’t + export it. (#119 by @jamesdbrock) +- Change the definition of `whiteSpace` and `skipSpaces` to + `Data.CodePoint.Unicode.isSpace`. (#119 by @jamesdbrock) + New features: +- Add primitive parsers `anyCodePoint` and `satisfyCodePoint` for parsing + `CodePoint`s. (#119 by @jamesdbrock) +- Add `match` combinator (#119 by @jamesdbrock) +- Add benchmark suite (#119 by @jamesdbrock) +- Split the dev dependencies out into `spago-dev.dhall`. + Bugfixes: +- Unicode correctness (#119 by @jamesdbrock) + Other improvements: ## [v6.0.2](https://github.com/purescript-contrib/purescript-parsing/releases/tag/v6.0.2) - 2021-05-09 @@ -26,12 +44,12 @@ Other improvements: ## [v6.0.0](https://github.com/purescript-contrib/purescript-parsing/releases/tag/v6.0.0) - 2021-02-26 Breaking changes: -- Improved performance of `string` and update `StringLike` to have `stripPrefix` as a class member instead of `indexOf` (#93) +- Improved performance of `string` and update `StringLike` to have `stripPrefix` as a class member instead of `indexOf` (#93) - Non-empty combinators now return `NonEmptyList` (#102) - Added support for PureScript 0.14 and dropped support for all previous versions (#101) New features: -- Derived `Generic` instance of Position (#87) +- Derived `Generic` instance of Position (#87) Bugfixes: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9c4c59..768c507 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,3 +3,16 @@ Thanks for your interest in contributing to `parsing`! We welcome new contributions regardless of your level of experience or familiarity with PureScript. Every library in the Contributors organization shares a simple handbook that helps new contributors get started. With that in mind, please [read the short contributing guide on purescript-contrib/governance](https://github.com/purescript-contrib/governance/blob/main/contributing.md) before contributing to this library. + +# Development + +This package includes a `spago-dev.dhall` which provides the dependencies +for development and testing. + +## Testing + +To run the test suite: + +``` +spago -x spago-dev.dhall test +``` \ No newline at end of file diff --git a/bench/Main.purs b/bench/Main.purs new file mode 100644 index 0000000..d5d5bf2 --- /dev/null +++ b/bench/Main.purs @@ -0,0 +1,134 @@ +-- | # Benchmarking +-- | +-- | spago -x spago-dev.dhall run --main Bench.Main +-- | +-- | This benchmark suite is intended to guide changes to this package so that +-- | we can compare the benchmarks of different commits. +-- | +-- | This benchmark suite also compares parsers to equivalent Regex. This +-- | provides an answer to the common question “How much slower is this package +-- | than Regex?” Answer: approximately 100×. The Regex benchmarks also give +-- | us a rough way to calibrate benchmarks run on different platforms. +-- | +-- | # Profiling +-- | +-- | https://nodejs.org/en/docs/guides/simple-profiling/ +-- | https://nodesource.com/blog/diagnostics-in-NodeJS-2 +-- | +-- | spago -x spago-dev.dhall build --source-maps +-- | purs bundle output/**/*.js --source-maps --output ./index.bundle.js +-- | +-- | +-- | spago -x spago-dev.dhall build --source-maps --purs-args '--codegen corefn,sourcemaps' +-- | zephyr Bench.Main.main --codegen sourcemaps,js +-- | purs bundle dce-output/**/*.js --source-maps --module Bench.Main --main Bench.Main --output ./index.dce.bundle.js +-- | node index.dce.bundle.js +-- | +-- | spago -x spago-dev.dhall build --source-maps --purs-args '--codegen corefn,sourcemaps' +-- | purs bundle output/**/*.js --source-maps --module Bench.Main --main Bench.Main --output ./index.bundle.js +-- | node index.bundle.js +-- | node --prof --enable-source-maps ./index.bundle.js +-- | node --prof-process --source-map ./index.bundle.js.map isolate--.log > prof.txt +-- | +-- | node --prof --enable-source-maps -e 'require("./output/Bench.Main/index.js").main()' +-- | node --prof-process isolate--.log +-- | +-- | spago -x spago-dev.dhall build +-- | node --prof -e 'require("./output/Bench.Main/index.js").main()' +-- | node --prof-process isolate--.log > prof.txt + +module Bench.Main where + +import Prelude + +import Data.Array (fold, replicate) +import Data.Either (either) +import Data.List (manyRec) +import Data.List.Types (List) +import Data.String.Regex (Regex, regex) +import Data.String.Regex as Regex +import Data.String.Regex.Flags (RegexFlags(..)) +import Effect (Effect) +import Effect.Console (log) +import Effect.Exception (throw) +import Effect.Unsafe (unsafePerformEffect) +import Performance.Minibench (benchWith) +import Text.Parsing.Parser (Parser, runParser) +import Text.Parsing.Parser.String (string) +import Text.Parsing.Parser.Token (digit) +import Text.Parsing.StringParser as StringParser +import Text.Parsing.StringParser.CodePoints as StringParser.CodePoints +import Text.Parsing.StringParser.CodeUnits as StringParser.CodeUnits + +string23 :: String +string23 = "23" +string23_2 :: String +string23_2 = fold $ replicate 2 string23 +string23_10000 :: String +string23_10000 = fold $ replicate 10000 string23 + +stringSkidoo :: String +stringSkidoo = "skidoo" +stringSkidoo_2 :: String +stringSkidoo_2 = fold $ replicate 2 stringSkidoo +stringSkidoo_10000 :: String +stringSkidoo_10000 = fold $ replicate 10000 stringSkidoo + +parse23 :: Parser String (List Char) +parse23 = manyRec digit + +parse23Points :: StringParser.Parser (List Char) +parse23Points = manyRec StringParser.CodePoints.anyDigit + +parse23Units :: StringParser.Parser (List Char) +parse23Units = manyRec StringParser.CodeUnits.anyDigit + +pattern23 :: Regex +pattern23 = either (unsafePerformEffect <<< throw) identity $ + regex "\\d" $ RegexFlags + { dotAll: true + , global: true + , ignoreCase: false + , multiline: true + , sticky: false + , unicode: true + } + +parseSkidoo :: Parser String (List String) +parseSkidoo = manyRec $ string "skidoo" + +patternSkidoo :: Regex +patternSkidoo = either (unsafePerformEffect <<< throw) identity $ + regex "skidoo" $ RegexFlags + { dotAll: true + , global: true + , ignoreCase: false + , multiline: true + , sticky: false + , unicode: true + } + +main :: Effect Unit +main = do + -- log $ show $ runParser string23_2 parse23 + -- log $ show $ Regex.match pattern23 string23_2 + -- log $ show $ runParser stringSkidoo_2 parseSkidoo + -- log $ show $ Regex.match patternSkidoo stringSkidoo_2 + log "runParser parse23" + benchWith 200 + $ \_ -> runParser string23_10000 parse23 + log "StringParser.runParser parse23Points" + benchWith 20 + $ \_ -> StringParser.runParser parse23Points string23_10000 + log "StringParser.runParser parse23Units" + benchWith 200 + $ \_ -> StringParser.runParser parse23Units string23_10000 + log "Regex.match pattern23" + benchWith 200 + $ \_ -> Regex.match pattern23 string23_10000 + log "runParser parseSkidoo" + benchWith 200 + $ \_ -> runParser stringSkidoo_10000 parseSkidoo + log "Regex.match patternSkidoo" + benchWith 200 + $ \_ -> Regex.match patternSkidoo stringSkidoo_10000 diff --git a/spago-dev.dhall b/spago-dev.dhall new file mode 100644 index 0000000..9bbec6e --- /dev/null +++ b/spago-dev.dhall @@ -0,0 +1,22 @@ +-- Spago configuration for testing, benchmarking, development. +-- +-- See: +-- * ./CONTRIBUTING.md +-- * https://github.com/purescript/spago#devdependencies-testdependencies-or-in-general-a-situation-with-many-configurations +-- + +let conf = ./spago.dhall + +in conf // +{ sources = [ "src/**/*.purs", "test/**/*.purs", "bench/**/*.purs" ] +, dependencies = conf.dependencies # + [ "assert" + , "console" + , "effect" + , "psci-support" + , "minibench" + , "exceptions" + , "string-parsers" + ] +, packages = ./packages.dhall +} diff --git a/spago.dhall b/spago.dhall index 139bc69..fe22dd2 100644 --- a/spago.dhall +++ b/spago.dhall @@ -1,10 +1,7 @@ { name = "parsing" , dependencies = [ "arrays" - , "assert" - , "console" , "control" - , "effect" , "either" , "foldable-traversable" , "identity" @@ -14,13 +11,13 @@ , "maybe" , "newtype" , "prelude" - , "psci-support" , "strings" , "tailrec" , "transformers" , "tuples" , "unicode" + , "unsafe-coerce" ] , packages = ./packages.dhall -, sources = [ "src/**/*.purs", "test/**/*.purs" ] +, sources = [ "src/**/*.purs" ] } diff --git a/src/Text/Parsing/Parser/Language.purs b/src/Text/Parsing/Parser/Language.purs index c6d71e2..cc59758 100644 --- a/src/Text/Parsing/Parser/Language.purs +++ b/src/Text/Parsing/Parser/Language.purs @@ -13,7 +13,7 @@ import Prelude import Control.Alt ((<|>)) import Text.Parsing.Parser (ParserT) import Text.Parsing.Parser.String (char, oneOf) -import Text.Parsing.Parser.Token (LanguageDef, TokenParser, GenLanguageDef(..), unGenLanguageDef, makeTokenParser, alphaNum, letter) +import Text.Parsing.Parser.Token (GenLanguageDef(..), LanguageDef, TokenParser, alphaNum, letter, makeTokenParser, unGenLanguageDef) ----------------------------------------------------------- -- Styles: haskellStyle, javaStyle diff --git a/src/Text/Parsing/Parser/Pos.purs b/src/Text/Parsing/Parser/Pos.purs index 6a20dff..0421773 100644 --- a/src/Text/Parsing/Parser/Pos.purs +++ b/src/Text/Parsing/Parser/Pos.purs @@ -1,10 +1,8 @@ module Text.Parsing.Parser.Pos where import Prelude + import Data.Generic.Rep (class Generic) -import Data.Foldable (foldl) -import Data.Newtype (wrap) -import Data.String (split) -- | `Position` represents the position of the parser in the input. -- | @@ -27,13 +25,3 @@ derive instance ordPosition :: Ord Position -- | The `Position` before any input has been parsed. initialPos :: Position initialPos = Position { line: 1, column: 1 } - --- | Updates a `Position` by adding the columns and lines in `String`. -updatePosString :: Position -> String -> Position -updatePosString pos' str = foldl updatePosChar pos' (split (wrap "") str) - where - updatePosChar (Position pos) c = case c of - "\n" -> Position { line: pos.line + 1, column: 1 } - "\r" -> Position { line: pos.line + 1, column: 1 } - "\t" -> Position { line: pos.line, column: pos.column + 8 - ((pos.column - 1) `mod` 8) } - _ -> Position { line: pos.line, column: pos.column + 1 } diff --git a/src/Text/Parsing/Parser/String.purs b/src/Text/Parsing/Parser/String.purs index a5deb01..dd303f2 100644 --- a/src/Text/Parsing/Parser/String.purs +++ b/src/Text/Parsing/Parser/String.purs @@ -1,92 +1,160 @@ -- | Primitive parsers for working with an input stream of type `String`. - -module Text.Parsing.Parser.String where +-- | +-- | The behavior of these primitive parsers is based on the behavior of the +-- | `Data.String` module in the __strings__ package. +-- | In most JavaScript runtime environments, the `String` +-- | is little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16). +-- | +-- | The primitive parsers which return `Char` will only succeed when the character +-- | being parsed is a code point in the +-- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane) +-- | (the “BMP”). These parsers can be convenient because of the good support +-- | that PureScript has for writing `Char` literals like `'あ'`, `'β'`, `'C'`. +-- | +-- | The other primitive parsers, which return `CodePoint` and `String` types, +-- | can parse the full Unicode character set. All of the primitive parsers +-- | in this module can be used together. +module Text.Parsing.Parser.String +( string +, eof +, anyChar +, anyCodePoint +, satisfy +, satisfyCodePoint +, char +, whiteSpace +, skipSpaces +, oneOf +, noneOf +, match +) +where import Prelude hiding (between) -import Control.Monad.State (gets, modify_) -import Data.Array (many) -import Data.Foldable (elem, notElem) +import Control.Monad.State (get, put) +import Data.Array (notElem) +import Data.Char (fromCharCode) +import Data.CodePoint.Unicode (isSpace) +import Data.Foldable (elem) import Data.Maybe (Maybe(..)) -import Data.Newtype (wrap) -import Data.String (Pattern) -import Data.String as S +import Data.String (CodePoint, Pattern(..), null, stripPrefix, uncons) import Data.String.CodeUnits as SCU +import Data.Tuple (Tuple(..), fst) import Text.Parsing.Parser (ParseState(..), ParserT, fail) -import Text.Parsing.Parser.Combinators (tryRethrow, ()) -import Text.Parsing.Parser.Pos (updatePosString) - --- | This class exists to abstract over streams which support the string-like --- | operations which this modules needs. -class StringLike s where - drop :: Int -> s -> s - stripPrefix :: Pattern -> s -> Maybe s - null :: s -> Boolean - uncons :: s -> Maybe { head :: Char, tail :: s } - -instance stringLikeString :: StringLike String where - uncons = SCU.uncons - drop = S.drop - stripPrefix = S.stripPrefix - null = S.null +import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, ()) +import Text.Parsing.Parser.Pos (Position(..)) +import Unsafe.Coerce (unsafeCoerce) -- | Match end-of-file. -eof :: forall s m. StringLike s => Monad m => ParserT s m Unit +eof :: forall m. Monad m => ParserT String m Unit eof = do - input <- gets \(ParseState input _ _) -> input + ParseState input _ _ <- get unless (null input) (fail "Expected EOF") -- | Match the specified string. -string :: forall s m. StringLike s => Monad m => String -> ParserT s m String +string :: forall m. Monad m => String -> ParserT String m String string str = do - input <- gets \(ParseState input _ _) -> input - case stripPrefix (wrap str) input of + ParseState input position _ <- get + case stripPrefix (Pattern str) input of Just remainder -> do - modify_ \(ParseState _ position _) -> - ParseState remainder - (updatePosString position str) - true + put $ ParseState remainder (updatePosString position str) true pure str _ -> fail ("Expected " <> show str) --- | Match any character. -anyChar :: forall s m. StringLike s => Monad m => ParserT s m Char -anyChar = do - input <- gets \(ParseState input _ _) -> input +-- | Match any BMP `Char`. +-- | Parser will fail if the character is not in the Basic Multilingual Plane. +anyChar :: forall m. Monad m => ParserT String m Char +anyChar = tryRethrow do + cp :: Int <- unCodePoint <$> anyCodePoint + -- the `fromCharCode` function doesn't check if this is beyond the + -- BMP, so we check that ourselves. + -- https://github.com/purescript/purescript-strings/issues/153 + if cp > 65535 -- BMP + then fail "Not a Char" + else case fromCharCode cp of + Nothing -> fail "Not a Char" + Just c -> pure c + +-- | Match any Unicode character. +-- | Always succeeds. +anyCodePoint :: forall m. Monad m => ParserT String m CodePoint +anyCodePoint = do + ParseState input position _ <- get case uncons input of Nothing -> fail "Unexpected EOF" Just { head, tail } -> do - modify_ \(ParseState _ position _) -> - ParseState tail - (updatePosString position (SCU.singleton head)) - true + put $ ParseState tail (updatePosSingle position head) true pure head --- | Match a character satisfying the specified predicate. -satisfy :: forall s m. StringLike s => Monad m => (Char -> Boolean) -> ParserT s m Char +-- | Match a BMP `Char` satisfying the predicate. +satisfy :: forall m. Monad m => (Char -> Boolean) -> ParserT String m Char satisfy f = tryRethrow do c <- anyChar - if f c then pure c - else fail $ "Character '" <> SCU.singleton c <> "' did not satisfy predicate" + if f c + then pure c + else fail "Predicate unsatisfied" --- | Match the specified character -char :: forall s m. StringLike s => Monad m => Char -> ParserT s m Char +-- | Match a Unicode character satisfying the predicate. +satisfyCodePoint :: forall m. Monad m => (CodePoint -> Boolean) -> ParserT String m CodePoint +satisfyCodePoint f = tryRethrow do + c <- anyCodePoint + if f c + then pure c + else fail "Predicate unsatisfied" + +-- | Match the specified BMP `Char`. +char :: forall m. Monad m => Char -> ParserT String m Char char c = satisfy (_ == c) show c --- | Match zero or more whitespace characters. -whiteSpace :: forall s m. StringLike s => Monad m => ParserT s m String -whiteSpace = do - cs <- many $ satisfy \c -> c == '\n' || c == '\r' || c == ' ' || c == '\t' - pure $ SCU.fromCharArray cs +-- | Match zero or more whitespace characters satisfying +-- | `Data.CodePoint.Unicode.isSpace`. +whiteSpace :: forall m. Monad m => ParserT String m String +whiteSpace = fst <$> match skipSpaces -- | Skip whitespace characters. -skipSpaces :: forall s m. StringLike s => Monad m => ParserT s m Unit -skipSpaces = void whiteSpace +skipSpaces :: forall m. Monad m => ParserT String m Unit +skipSpaces = skipMany (satisfyCodePoint isSpace) --- | Match one of the characters in the array. -oneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char +-- | Match one of the BMP `Char`s in the array. +oneOf :: forall m. Monad m => Array Char -> ParserT String m Char oneOf ss = satisfy (flip elem ss) ("one of " <> show ss) --- | Match any character not in the array. -noneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char +-- | Match any BMP `Char` not in the array. +noneOf :: forall m. Monad m => Array Char -> ParserT String m Char noneOf ss = satisfy (flip notElem ss) ("none of " <> show ss) + +-- | Updates a `Position` by adding the columns and lines in `String`. +updatePosString :: Position -> String -> Position +updatePosString pos str = case uncons str of + Nothing -> pos + Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive + +-- | Updates a `Position` by adding the columns and lines in a +-- | single `CodePoint`. +updatePosSingle :: Position -> CodePoint -> Position +updatePosSingle (Position {line,column}) cp = case unCodePoint cp of + 10 -> Position { line: line + 1, column: 1 } -- "\n" + 13 -> Position { line: line + 1, column: 1 } -- "\r" + 9 -> Position { line, column: column + 8 - ((column - 1) `mod` 8) } -- "\t" Who says that one tab is 8 columns? + _ -> Position { line, column: column + 1 } + +-- | Combinator which returns both the result of a parse and the portion of +-- | the input that was consumed while it was being parsed. +match :: forall m a. Monad m => ParserT String m a -> ParserT String m (Tuple String a) +match p = do + ParseState input1 _ _ <- get + x <- p + ParseState input2 _ _ <- get + -- We use the `SCU.length`, which is in units of “code units” + -- instead of `Data.String.length`. which is in units of “code points”. + -- This is more efficient, and it will be correct as long as we can assume + -- the invariant that the `ParseState input` always begins on a code point + -- boundary. + pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x + +-- | The CodePoint newtype constructor is not exported, so here's a helper. +-- | This will break at runtime if the definition of CodePoint ever changes +-- | to something other than `newtype CodePoint = CodePoint Int`. +unCodePoint :: CodePoint -> Int +unCodePoint = unsafeCoerce \ No newline at end of file diff --git a/src/Text/Parsing/Parser/Token.purs b/src/Text/Parsing/Parser/Token.purs index 970af21..0e852fc 100644 --- a/src/Text/Parsing/Parser/Token.purs +++ b/src/Text/Parsing/Parser/Token.purs @@ -21,17 +21,14 @@ module Text.Parsing.Parser.Token ) where -import Prelude hiding (when,between) +import Prelude hiding (when, between) import Control.Lazy (fix) import Control.Monad.State (gets, modify_) import Control.MonadPlus (guard, (<|>)) import Data.Array as Array -import Data.String.CodeUnits (toChar, singleton) as CodeUnits -import Data.String.CodePoints (CodePoint, codePointFromChar) import Data.Char (fromCharCode, toCharCode) -import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper, hexDigitToInt) -import Data.String.Unicode as Unicode +import Data.CodePoint.Unicode (hexDigitToInt, isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper) import Data.Either (Either(..)) import Data.Foldable (foldl, foldr) import Data.Identity (Identity) @@ -40,14 +37,17 @@ import Data.List (List(..)) import Data.List as List import Data.List.NonEmpty (NonEmptyList) import Data.Maybe (Maybe(..), maybe) -import Data.String (null, toLower) +import Data.String (CodePoint, null, toLower) +import Data.String.CodePoints (codePointFromChar) +import Data.String.CodeUnits (toChar, singleton) as CodeUnits import Data.String.CodeUnits as SCU +import Data.String.Unicode as Unicode import Data.Tuple (Tuple(..)) import Math (pow) import Text.Parsing.Parser (ParseState(..), ParserT, fail) import Text.Parsing.Parser.Combinators (skipMany1, try, tryRethrow, skipMany, notFollowedBy, option, choice, between, sepBy1, sepBy, (), ()) import Text.Parsing.Parser.Pos (Position) -import Text.Parsing.Parser.String (satisfy, oneOf, noneOf, string, char) +import Text.Parsing.Parser.String (char, noneOf, oneOf, satisfy, satisfyCodePoint, string) -- | Create a parser which Returns the first token in the stream. token :: forall m a. Monad m => (a -> Position) -> ParserT (List a) m a @@ -746,7 +746,7 @@ whiteSpace' langDef@(LanguageDef languageDef) skipMany (simpleSpace <|> oneLineComment langDef <|> multiLineComment langDef "") simpleSpace :: forall m . Monad m => ParserT String m Unit -simpleSpace = skipMany1 (satisfyCP isSpace) +simpleSpace = skipMany1 (satisfyCodePoint isSpace) oneLineComment :: forall m . Monad m => GenLanguageDef String m -> ParserT String m Unit oneLineComment (LanguageDef languageDef) = diff --git a/test/Main.purs b/test/Main.purs index 7c3ed2d..a581f3e 100644 --- a/test/Main.purs +++ b/test/Main.purs @@ -1,6 +1,6 @@ module Test.Main where -import Prelude hiding (between,when) +import Prelude hiding (between, when) import Control.Alt ((<|>)) import Control.Lazy (fix) @@ -9,7 +9,9 @@ import Data.Either (Either(..)) import Data.List (List(..), fromFoldable, many) import Data.List.NonEmpty (cons, cons') import Data.Maybe (Maybe(..)) +import Data.String.CodePoints as SCP import Data.String.CodeUnits (fromCharArray, singleton) +import Data.String.CodeUnits as SCU import Data.Tuple (Tuple(..)) import Effect (Effect) import Effect.Console (logShow) @@ -19,8 +21,8 @@ import Text.Parsing.Parser.Combinators (endBy1, sepBy1, optionMaybe, try, chainl import Text.Parsing.Parser.Expr (Assoc(..), Operator(..), buildExprParser) import Text.Parsing.Parser.Language (javaStyle, haskellStyle, haskellDef) import Text.Parsing.Parser.Pos (Position(..), initialPos) -import Text.Parsing.Parser.String (eof, string, char, satisfy, anyChar) -import Text.Parsing.Parser.Token (TokenParser, match, when, token, makeTokenParser) +import Text.Parsing.Parser.String (anyChar, anyCodePoint, char, eof, satisfy, string, whiteSpace) +import Text.Parsing.Parser.Token (TokenParser, letter, makeTokenParser, match, token, when) parens :: forall m a. Monad m => ParserT String m a -> ParserT String m a parens = between (string "(") (string ")") @@ -39,7 +41,7 @@ parseTest input expected p = case runParser input p of parseErrorTestPosition :: forall s a. Show a => Parser s a -> s -> Position -> Effect Unit parseErrorTestPosition p input expected = case runParser input p of - Right _ -> assert' "error: ParseError expected!" false + Right x -> assert' ("ParseError expected at " <> show expected <> " but parsed " <> show x) false Left err -> do let pos = parseErrorPosition err assert' ("expected: " <> show expected <> ", pos: " <> show pos) (expected == pos) @@ -448,6 +450,24 @@ main = do parseTest "1*2+3/4-5" (-3) exprTest parseTest "ab?" "ab" manySatisfyTest + parseErrorTestPosition + anyChar + "𝅘𝅥𝅯" + (Position {column:1,line:1}) + + parseTest "𝅘𝅥𝅘𝅥𝅮x𝅘𝅥𝅯" ["𝅘𝅥", "𝅘𝅥𝅮", "x", "𝅘𝅥𝅯"] do + quarter <- anyCodePoint + eighth <- (singleton <$> char 'x') <|> string "𝅘𝅥𝅮" + letterx <- string "𝅘𝅥𝅯" <|> string "x" + sixteenth <- string "𝅘𝅥𝅯" <|> (singleton <$> char 'x') + pure $ [SCP.singleton quarter, eighth, letterx, sixteenth] + + parseTest "aa bb" ["aa", " ", "bb"] do + aa <- SCU.fromCharArray <$> some letter + w <- whiteSpace + bb <- SCU.fromCharArray <$> some letter + pure [aa, w, bb] + let tokpos = const initialPos parseTest (fromFoldable [A, B]) A (token tokpos) parseTest (fromFoldable [B, A]) B (token tokpos) From 7454d6b1fc60abb3336d79f2c945414e61019c92 Mon Sep 17 00:00:00 2001 From: James Brock Date: Wed, 6 Oct 2021 15:33:12 +0900 Subject: [PATCH 07/12] Add license, repository to spago.dhall --- spago.dhall | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spago.dhall b/spago.dhall index fe22dd2..c20e881 100644 --- a/spago.dhall +++ b/spago.dhall @@ -1,4 +1,6 @@ { name = "parsing" +, license = "BSD3" +, repository = "https://github.com/purescript-contrib/purescript-parsing.git" , dependencies = [ "arrays" , "control" From 83b6c7abd4c5dc1d85543556084b308a0364c4ef Mon Sep 17 00:00:00 2001 From: James Brock Date: Wed, 6 Oct 2021 15:35:18 +0900 Subject: [PATCH 08/12] bower.json from spago bump-version --no-dry-run major --- bower.json | 71 +++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/bower.json b/bower.json index a3d9a7a..3f16baa 100644 --- a/bower.json +++ b/bower.json @@ -1,40 +1,35 @@ { - "name": "purescript-parsing", - "homepage": "https://github.com/purescript-contrib/purescript-parsing", - "description": "Parser combinator library", - "keywords": [ - "purescript" - ], - "license": "BSD-3-Clause", - "repository": { - "type": "git", - "url": "https://github.com/purescript-contrib/purescript-parsing.git" - }, - "ignore": [ - "**/.*", - "bower_components", - "node_modules", - "output", - "test", - "bower.json", - "package.json" - ], - "dependencies": { - "purescript-arrays": "^6.0.0", - "purescript-either": "^5.0.0", - "purescript-foldable-traversable": "^5.0.0", - "purescript-identity": "^5.0.0", - "purescript-integers": "^5.0.0", - "purescript-lists": "^6.0.0", - "purescript-maybe": "^5.0.0", - "purescript-prelude": "^5.0.0", - "purescript-strings": "^5.0.0", - "purescript-transformers": "^5.0.0", - "purescript-unicode": "^5.0.0" - }, - "devDependencies": { - "purescript-assert": "^5.0.0", - "purescript-console": "^5.0.0", - "purescript-effect": "^3.0.0" - } + "name": "purescript-parsing", + "license": [ + "BSD3" + ], + "repository": { + "type": "git", + "url": "https://github.com/purescript-contrib/purescript-parsing.git" + }, + "ignore": [ + "**/.*", + "node_modules", + "bower_components", + "output" + ], + "dependencies": { + "purescript-arrays": "^v6.0.1", + "purescript-control": "^v5.0.0", + "purescript-either": "^v5.0.0", + "purescript-foldable-traversable": "^v5.0.1", + "purescript-identity": "^v5.0.0", + "purescript-integers": "^v5.0.0", + "purescript-lists": "^v6.0.1", + "purescript-math": "^v3.0.0", + "purescript-maybe": "^v5.0.0", + "purescript-newtype": "^v4.0.0", + "purescript-prelude": "^v5.0.1", + "purescript-strings": "^v5.0.0", + "purescript-tailrec": "^v5.0.1", + "purescript-transformers": "^v5.1.0", + "purescript-tuples": "^v6.0.1", + "purescript-unicode": "^v5.0.0", + "purescript-unsafe-coerce": "^v5.0.0" + } } From 07438c366d0ab2cda99cfec9e4adf8704360f246 Mon Sep 17 00:00:00 2001 From: James Brock Date: Wed, 6 Oct 2021 15:35:45 +0900 Subject: [PATCH 09/12] =?UTF-8?q?v6.0.2=20=E2=86=92=20v7.0.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From 636f8970800143d803d7cd53986bb60b8e16618a Mon Sep 17 00:00:00 2001 From: James Brock Date: Wed, 6 Oct 2021 15:43:25 +0900 Subject: [PATCH 10/12] BSD-2-Clause --- spago.dhall | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spago.dhall b/spago.dhall index c20e881..7958795 100644 --- a/spago.dhall +++ b/spago.dhall @@ -1,5 +1,5 @@ { name = "parsing" -, license = "BSD3" +, license = "BSD-2-Clause" , repository = "https://github.com/purescript-contrib/purescript-parsing.git" , dependencies = [ "arrays" From 74197e4f156f5f4ad25ca6ab2c68d2d818f12808 Mon Sep 17 00:00:00 2001 From: James Brock Date: Wed, 6 Oct 2021 15:44:26 +0900 Subject: [PATCH 11/12] bower.json from spago bump-version --no-dry-run major --- bower.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bower.json b/bower.json index 3f16baa..e74bac6 100644 --- a/bower.json +++ b/bower.json @@ -1,7 +1,7 @@ { "name": "purescript-parsing", "license": [ - "BSD3" + "BSD-2-Clause" ], "repository": { "type": "git", From 297ad9e88cba9ddda70a249f468b6972477f097f Mon Sep 17 00:00:00 2001 From: James Brock Date: Wed, 6 Oct 2021 15:44:48 +0900 Subject: [PATCH 12/12] =?UTF-8?q?v6.0.2=20=E2=86=92=20v7.0.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit