From c2ee9ca3e02810c76f82bfc32a6643f2b0af0b84 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 30 Sep 2020 21:20:13 -0500 Subject: [PATCH 001/705] WIP - native _sre --- constants.rs | 114 ++++++++++++++++++++++++++++++++++++++++++++++ interp.rs | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 constants.rs create mode 100644 interp.rs diff --git a/constants.rs b/constants.rs new file mode 100644 index 0000000000..f6aeb3182f --- /dev/null +++ b/constants.rs @@ -0,0 +1,114 @@ +/* + * Secret Labs' Regular Expression Engine + * + * regular expression matching engine + * + * NOTE: This file is generated by sre_constants.py. If you need + * to change anything in here, edit sre_constants.py and run it. + * + * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. + * + * See the _sre.c file for information on usage and redistribution. + */ + +use bitflags::bitflags; + +pub const SRE_MAGIC: usize = 20140917; +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreOpcode { + FAILURE = 0, + SUCCESS = 1, + ANY = 2, + ANY_ALL = 3, + ASSERT = 4, + ASSERT_NOT = 5, + AT = 6, + BRANCH = 7, + CALL = 8, + CATEGORY = 9, + CHARSET = 10, + BIGCHARSET = 11, + GROUPREF = 12, + GROUPREF_EXISTS = 13, + GROUPREF_IGNORE = 14, + IN = 15, + IN_IGNORE = 16, + INFO = 17, + JUMP = 18, + LITERAL = 19, + LITERAL_IGNORE = 20, + MARK = 21, + MAX_UNTIL = 22, + MIN_UNTIL = 23, + NOT_LITERAL = 24, + NOT_LITERAL_IGNORE = 25, + NEGATE = 26, + RANGE = 27, + REPEAT = 28, + REPEAT_ONE = 29, + SUBPATTERN = 30, + MIN_REPEAT_ONE = 31, + RANGE_IGNORE = 32, +} +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreAtCode { + BEGINNING = 0, + BEGINNING_LINE = 1, + BEGINNING_STRING = 2, + BOUNDARY = 3, + NON_BOUNDARY = 4, + END = 5, + END_LINE = 6, + END_STRING = 7, + LOC_BOUNDARY = 8, + 
LOC_NON_BOUNDARY = 9, + UNI_BOUNDARY = 10, + UNI_NON_BOUNDARY = 11, +} +#[derive(num_enum::TryFromPrimitive, Debug)] +#[repr(u32)] +#[allow(non_camel_case_types)] +pub enum SreCatCode { + DIGIT = 0, + NOT_DIGIT = 1, + SPACE = 2, + NOT_SPACE = 3, + WORD = 4, + NOT_WORD = 5, + LINEBREAK = 6, + NOT_LINEBREAK = 7, + LOC_WORD = 8, + LOC_NOT_WORD = 9, + UNI_DIGIT = 10, + UNI_NOT_DIGIT = 11, + UNI_SPACE = 12, + UNI_NOT_SPACE = 13, + UNI_WORD = 14, + UNI_NOT_WORD = 15, + UNI_LINEBREAK = 16, + UNI_NOT_LINEBREAK = 17, +} +bitflags! { + pub struct SreFlag: u16 { + const TEMPLATE = 1; + const IGNORECASE = 2; + const LOCALE = 4; + const MULTILINE = 8; + const DOTALL = 16; + const UNICODE = 32; + const VERBOSE = 64; + const DEBUG = 128; + const ASCII = 256; + } +} +bitflags! { + pub struct SreInfo: u32 { + const PREFIX = 1; + const LITERAL = 2; + const CHARSET = 4; + } +} diff --git a/interp.rs b/interp.rs new file mode 100644 index 0000000000..7f93a82eb4 --- /dev/null +++ b/interp.rs @@ -0,0 +1,126 @@ +// good luck to those that follow; here be dragons + +use crate::builtins::PyStrRef; + +use super::constants::{SreFlag, SreOpcode}; + +use std::convert::TryFrom; +use std::{iter, slice}; + +pub struct State { + start: usize, + s_pos: usize, + end: usize, + pos: usize, + flags: SreFlag, + marks: Vec, + lastindex: isize, + marks_stack: Vec, + context_stack: Vec, + repeat: Option, + s: PyStrRef, +} + +// struct State1<'a> { +// state: &'a mut State, +// } + +struct MatchContext { + s_pos: usize, + code_pos: usize, +} + +// struct Context<'a> { +// context_stack: &mut Vec, +// } + +impl State { + pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self { + let end = std::cmp::min(end, s.char_len()); + Self { + start, + s_pos: start, + end, + pos: start, + flags, + marks: Vec::new(), + lastindex: -1, + marks_stack: Vec::new(), + context_stack: Vec::new(), + repeat: None, + s, + } + } +} + +// struct OpcodeDispatcher { +// executing_contexts: HashMap>, +// } + +pub 
struct BadSreCode; + +pub fn parse_ops(code: &[u32]) -> impl Iterator> + '_ { + let mut it = code.iter().copied(); + std::iter::from_fn(move || -> Option> { + let op = it.next()?; + let op = SreOpcode::try_from(op) + .ok() + .and_then(|op| extract_code(op, &mut it)); + Some(op) + }) + .map(|x| x.ok_or(BadSreCode)) +} + +type It<'a> = iter::Copied>; +fn extract_code(op: SreOpcode, it: &mut It) -> Option { + let skip = |it: &mut It| { + let skip = it.next()? as usize; + if skip > it.len() { + None + } else { + Some(skip) + } + }; + match op { + SreOpcode::FAILURE => {} + SreOpcode::SUCCESS => {} + SreOpcode::ANY => {} + SreOpcode::ANY_ALL => {} + SreOpcode::ASSERT => {} + SreOpcode::ASSERT_NOT => {} + SreOpcode::AT => {} + SreOpcode::BRANCH => {} + SreOpcode::CALL => {} + SreOpcode::CATEGORY => {} + SreOpcode::CHARSET => {} + SreOpcode::BIGCHARSET => {} + SreOpcode::GROUPREF => {} + SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::GROUPREF_IGNORE => {} + SreOpcode::IN => {} + SreOpcode::IN_IGNORE => {} + SreOpcode::INFO => { + // let skip = it.next()?; + } + SreOpcode::JUMP => {} + SreOpcode::LITERAL => {} + SreOpcode::LITERAL_IGNORE => {} + SreOpcode::MARK => {} + SreOpcode::MAX_UNTIL => {} + SreOpcode::MIN_UNTIL => {} + SreOpcode::NOT_LITERAL => {} + SreOpcode::NOT_LITERAL_IGNORE => {} + SreOpcode::NEGATE => {} + SreOpcode::RANGE => {} + SreOpcode::REPEAT => {} + SreOpcode::REPEAT_ONE => {} + SreOpcode::SUBPATTERN => {} + SreOpcode::MIN_REPEAT_ONE => {} + SreOpcode::RANGE_IGNORE => {} + } + todo!() +} + +pub enum Op { + Info {}, +} From e1362ead3c8c7462b0027a7ecb4b9bbd23e76b5a Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 1 Dec 2020 17:51:11 +0200 Subject: [PATCH 002/705] WIP structure --- constants.rs | 4 + interp.rs | 467 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 379 insertions(+), 92 deletions(-) diff --git a/constants.rs b/constants.rs index f6aeb3182f..a80534d70b 100644 --- a/constants.rs +++ b/constants.rs @@ -14,6 +14,10 @@ 
use bitflags::bitflags; pub const SRE_MAGIC: usize = 20140917; +pub const SRE_CODESIZE: usize = 4; +pub const SRE_MAXREPEAT: usize = usize::max_value(); +pub const SRE_MAXGROUPS: usize = usize::max_value() / std::mem::size_of::() / 2; + #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] #[allow(non_camel_case_types)] diff --git a/interp.rs b/interp.rs index 7f93a82eb4..43c2a54515 100644 --- a/interp.rs +++ b/interp.rs @@ -1,126 +1,409 @@ // good luck to those that follow; here be dragons +use super::constants::{SreFlag, SreOpcode, SRE_MAXREPEAT}; use crate::builtins::PyStrRef; - -use super::constants::{SreFlag, SreOpcode}; - +use rustpython_common::borrow::BorrowValue; +use std::collections::HashMap; use std::convert::TryFrom; -use std::{iter, slice}; -pub struct State { +pub struct State<'a> { + // py_string: PyStrRef, + string: &'a str, start: usize, - s_pos: usize, end: usize, - pos: usize, flags: SreFlag, + pattern_codes: Vec, marks: Vec, lastindex: isize, marks_stack: Vec, context_stack: Vec, repeat: Option, - s: PyStrRef, + string_position: usize, } -// struct State1<'a> { -// state: &'a mut State, -// } - -struct MatchContext { - s_pos: usize, - code_pos: usize, -} - -// struct Context<'a> { -// context_stack: &mut Vec, -// } - -impl State { - pub fn new(s: PyStrRef, start: usize, end: usize, flags: SreFlag) -> Self { - let end = std::cmp::min(end, s.char_len()); +impl<'a> State<'a> { + pub(crate) fn new( + // py_string: PyStrRef, + string: &'a str, + start: usize, + end: usize, + flags: SreFlag, + pattern_codes: Vec, + ) -> Self { + // let string = py_string.borrow_value(); Self { + // py_string, + string, start, - s_pos: start, end, - pos: start, flags, - marks: Vec::new(), + pattern_codes, lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat: None, - s, + marks: Vec::new(), + string_position: start, } } + + fn reset(&mut self) { + self.marks.clear(); + self.lastindex = -1; + self.marks_stack.clear(); + 
self.context_stack.clear(); + self.repeat = None; + } } -// struct OpcodeDispatcher { -// executing_contexts: HashMap>, -// } +pub(crate) fn pymatch(mut state: State) -> bool { + let ctx = MatchContext { + string_position: state.start, + code_position: 0, + has_matched: None, + }; + state.context_stack.push(ctx); -pub struct BadSreCode; + let mut has_matched = None; + loop { + if state.context_stack.is_empty() { + break; + } + let ctx_id = state.context_stack.len() - 1; + let mut drive = MatchContextDrive::drive(ctx_id, state); + let mut dispatcher = OpcodeDispatcher::new(); -pub fn parse_ops(code: &[u32]) -> impl Iterator> + '_ { - let mut it = code.iter().copied(); - std::iter::from_fn(move || -> Option> { - let op = it.next()?; - let op = SreOpcode::try_from(op) - .ok() - .and_then(|op| extract_code(op, &mut it)); - Some(op) - }) - .map(|x| x.ok_or(BadSreCode)) + has_matched = dispatcher.pymatch(&mut drive); + state = drive.take(); + if has_matched.is_some() { + state.context_stack.pop(); + } + } + has_matched.unwrap_or(false) } -type It<'a> = iter::Copied>; -fn extract_code(op: SreOpcode, it: &mut It) -> Option { - let skip = |it: &mut It| { - let skip = it.next()? 
as usize; - if skip > it.len() { - None - } else { - Some(skip) +#[derive(Debug, Copy, Clone)] +struct MatchContext { + string_position: usize, + code_position: usize, + has_matched: Option, +} + +struct MatchContextDrive<'a> { + state: State<'a>, + ctx_id: usize, +} + +impl<'a> MatchContextDrive<'a> { + fn id(&self) -> usize { + self.ctx_id + } + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.state.context_stack[self.ctx_id] + } + fn ctx(&self) -> &MatchContext { + &self.state.context_stack[self.ctx_id] + } + fn push_new_context(&mut self, pattern_offset: usize) -> usize { + let ctx = self.ctx(); + let child_ctx = MatchContext { + string_position: ctx.string_position, + code_position: ctx.code_position + pattern_offset, + has_matched: None, + }; + self.state.context_stack.push(child_ctx); + self.state.context_stack.len() - 1 + } + fn drive(ctx_id: usize, state: State<'a>) -> Self { + Self { state, ctx_id } + } + fn take(self) -> State<'a> { + self.state + } + fn str(&self) -> &str { + unsafe { + std::str::from_utf8_unchecked( + &self.state.string.as_bytes()[self.ctx().string_position..], + ) + } + } + fn peek_char(&self) -> char { + self.str().chars().next().unwrap() + } + fn peek_code(&self, peek: usize) -> u32 { + self.state.pattern_codes[self.ctx().code_position + peek] + } + fn skip_char(&mut self, skip_count: usize) { + let skipped = self.str().char_indices().nth(skip_count).unwrap().0; + self.ctx_mut().string_position += skipped; + } + fn skip_code(&mut self, skip_count: usize) { + self.ctx_mut().code_position += skip_count; + } + fn remaining_chars(&self) -> usize { + let end = self.state.end; + end - self.ctx().string_position + self.str().len() + } + fn remaining_codes(&self) -> usize { + self.state.pattern_codes.len() - self.ctx().code_position + } + fn at_beginning(&self) -> bool { + self.ctx().string_position == 0 + } + fn at_end(&self) -> bool { + self.str().is_empty() + } + fn at_linebreak(&self) -> bool { + match self.str().chars().next() 
{ + Some(c) => c == '\n', + None => false, } + } +} + +struct OpcodeDispatcher { + executing_contexts: HashMap>, +} + +macro_rules! once { + ($val:expr) => { + Box::new(OpEmpty {}) }; - match op { - SreOpcode::FAILURE => {} - SreOpcode::SUCCESS => {} - SreOpcode::ANY => {} - SreOpcode::ANY_ALL => {} - SreOpcode::ASSERT => {} - SreOpcode::ASSERT_NOT => {} - SreOpcode::AT => {} - SreOpcode::BRANCH => {} - SreOpcode::CALL => {} - SreOpcode::CATEGORY => {} - SreOpcode::CHARSET => {} - SreOpcode::BIGCHARSET => {} - SreOpcode::GROUPREF => {} - SreOpcode::GROUPREF_EXISTS => {} - SreOpcode::GROUPREF_IGNORE => {} - SreOpcode::IN => {} - SreOpcode::IN_IGNORE => {} - SreOpcode::INFO => { - // let skip = it.next()?; - } - SreOpcode::JUMP => {} - SreOpcode::LITERAL => {} - SreOpcode::LITERAL_IGNORE => {} - SreOpcode::MARK => {} - SreOpcode::MAX_UNTIL => {} - SreOpcode::MIN_UNTIL => {} - SreOpcode::NOT_LITERAL => {} - SreOpcode::NOT_LITERAL_IGNORE => {} - SreOpcode::NEGATE => {} - SreOpcode::RANGE => {} - SreOpcode::REPEAT => {} - SreOpcode::REPEAT_ONE => {} - SreOpcode::SUBPATTERN => {} - SreOpcode::MIN_REPEAT_ONE => {} - SreOpcode::RANGE_IGNORE => {} - } - todo!() -} - -pub enum Op { - Info {}, +} + +trait OpcodeExecutor { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; +} + +struct OpFailure {} +impl OpcodeExecutor for OpFailure { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + drive.ctx_mut().has_matched = Some(false); + None + } +} + +struct OpEmpty {} +impl OpcodeExecutor for OpEmpty { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + None + } +} + +struct OpOnce { + f: Option, +} +impl OpcodeExecutor for OpOnce { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + let f = self.f.take()?; + f(drive); + None + } +} +fn once(f: F) -> Box> { + Box::new(OpOnce { f: Some(f) }) +} + +struct OpMinRepeatOne { + trace_id: usize, + mincount: usize, + maxcount: usize, + count: usize, + child_ctx_id: usize, 
+} +impl OpcodeExecutor for OpMinRepeatOne { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + match self.trace_id { + 0 => self._0(drive), + _ => unreachable!(), + } + } +} +impl Default for OpMinRepeatOne { + fn default() -> Self { + OpMinRepeatOne { + trace_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpMinRepeatOne { + fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + drive.state.string_position = drive.ctx().string_position; + + self.count = if self.mincount == 0 { + 0 + } else { + let count = count_repetitions(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; + + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + // mark push + self.trace_id = 1; + self._1(drive) + } + fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.trace_id = 2; + return Some(()); + } + + // mark discard + drive.ctx_mut().has_matched = Some(false); + None + } + fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count_repetitions(drive, 1) == 0 { + self.trace_id = 3; + return self._3(drive); + } + 
drive.skip_char(1); + self.count += 1; + // marks pop keep + self.trace_id = 1; + self._1(drive) + } + fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + // mark discard + drive.ctx_mut().has_matched = Some(false); + None + } +} + +impl OpcodeDispatcher { + fn new() -> Self { + Self { + executing_contexts: HashMap::new(), + } + } + // Returns True if the current context matches, False if it doesn't and + // None if matching is not finished, ie must be resumed after child + // contexts have been matched. + fn pymatch(&mut self, drive: &mut MatchContextDrive) -> Option { + while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { + let code = drive.peek_code(0); + let opcode = SreOpcode::try_from(code).unwrap(); + self.dispatch(opcode, drive); + // self.drive = self.drive; + } + match drive.ctx().has_matched { + Some(matched) => Some(matched), + None => { + drive.ctx_mut().has_matched = Some(false); + Some(false) + } + } + } + + // Dispatches a context on a given opcode. Returns True if the context + // is done matching, False if it must be resumed when next encountered. 
+ fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { + let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { + Some((_, mut executor)) => executor, + None => self.dispatch_table(opcode, drive), + }; + if let Some(()) = executor.next(drive) { + self.executing_contexts.insert(drive.id(), executor); + false + } else { + true + } + } + + fn dispatch_table( + &mut self, + opcode: SreOpcode, + drive: &mut MatchContextDrive, + ) -> Box { + // move || { + match opcode { + SreOpcode::FAILURE => { + Box::new(OpFailure {}) + } + SreOpcode::SUCCESS => once(|drive| { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + }), + SreOpcode::ANY => once!(true), + SreOpcode::ANY_ALL => once!(true), + SreOpcode::ASSERT => once!(true), + SreOpcode::ASSERT_NOT => once!(true), + SreOpcode::AT => once!(true), + SreOpcode::BRANCH => once!(true), + SreOpcode::CALL => once!(true), + SreOpcode::CATEGORY => once!(true), + SreOpcode::CHARSET => once!(true), + SreOpcode::BIGCHARSET => once!(true), + SreOpcode::GROUPREF => once!(true), + SreOpcode::GROUPREF_EXISTS => once!(true), + SreOpcode::GROUPREF_IGNORE => once!(true), + SreOpcode::IN => once!(true), + SreOpcode::IN_IGNORE => once!(true), + SreOpcode::INFO => once!(true), + SreOpcode::JUMP => once!(true), + SreOpcode::LITERAL => { + if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_char(1); + } + drive.skip_code(2); + once!(true) + } + SreOpcode::LITERAL_IGNORE => once!(true), + SreOpcode::MARK => once!(true), + SreOpcode::MAX_UNTIL => once!(true), + SreOpcode::MIN_UNTIL => once!(true), + SreOpcode::NOT_LITERAL => once!(true), + SreOpcode::NOT_LITERAL_IGNORE => once!(true), + SreOpcode::NEGATE => once!(true), + SreOpcode::RANGE => once!(true), + SreOpcode::REPEAT => once!(true), + SreOpcode::REPEAT_ONE => once!(true), + SreOpcode::SUBPATTERN => 
once!(true), + SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), + SreOpcode::RANGE_IGNORE => once!(true), + } + } +} + +// Returns the number of repetitions of a single item, starting from the +// current string position. The code pointer is expected to point to a +// REPEAT_ONE operation (with the repeated 4 ahead). +fn count_repetitions(drive: &mut MatchContextDrive, maxcount: usize) -> usize { + let mut count = 0; + let mut real_maxcount = drive.state.end - drive.ctx().string_position; + if maxcount < real_maxcount && maxcount != SRE_MAXREPEAT { + real_maxcount = maxcount; + } + count } From 82922bf0d796f6c79f5799a1b14b9d1ad9c26431 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 23 Dec 2020 09:01:50 +0200 Subject: [PATCH 003/705] upgrade re version; implement helper functions; --- constants.rs | 52 ++-- interp.rs | 717 ++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 588 insertions(+), 181 deletions(-) diff --git a/constants.rs b/constants.rs index a80534d70b..f5ab92c531 100644 --- a/constants.rs +++ b/constants.rs @@ -13,11 +13,7 @@ use bitflags::bitflags; -pub const SRE_MAGIC: usize = 20140917; -pub const SRE_CODESIZE: usize = 4; -pub const SRE_MAXREPEAT: usize = usize::max_value(); -pub const SRE_MAXGROUPS: usize = usize::max_value() / std::mem::size_of::() / 2; - +pub const SRE_MAGIC: usize = 20171005; #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] #[allow(non_camel_case_types)] @@ -36,25 +32,33 @@ pub enum SreOpcode { BIGCHARSET = 11, GROUPREF = 12, GROUPREF_EXISTS = 13, - GROUPREF_IGNORE = 14, - IN = 15, - IN_IGNORE = 16, - INFO = 17, - JUMP = 18, - LITERAL = 19, - LITERAL_IGNORE = 20, - MARK = 21, - MAX_UNTIL = 22, - MIN_UNTIL = 23, - NOT_LITERAL = 24, - NOT_LITERAL_IGNORE = 25, - NEGATE = 26, - RANGE = 27, - REPEAT = 28, - REPEAT_ONE = 29, - SUBPATTERN = 30, - MIN_REPEAT_ONE = 31, - RANGE_IGNORE = 32, + IN = 14, + INFO = 15, + JUMP = 16, + LITERAL = 17, + MARK = 18, + MAX_UNTIL = 19, + MIN_UNTIL = 20, 
+ NOT_LITERAL = 21, + NEGATE = 22, + RANGE = 23, + REPEAT = 24, + REPEAT_ONE = 25, + SUBPATTERN = 26, + MIN_REPEAT_ONE = 27, + GROUPREF_IGNORE = 28, + IN_IGNORE = 29, + LITERAL_IGNORE = 30, + NOT_LITERAL_IGNORE = 31, + GROUPREF_LOC_IGNORE = 32, + IN_LOC_IGNORE = 33, + LITERAL_LOC_IGNORE = 34, + NOT_LITERAL_LOC_IGNORE = 35, + GROUPREF_UNI_IGNORE = 36, + IN_UNI_IGNORE = 37, + LITERAL_UNI_IGNORE = 38, + NOT_LITERAL_UNI_IGNORE = 39, + RANGE_UNI_IGNORE = 40, } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] diff --git a/interp.rs b/interp.rs index 43c2a54515..bc6753a5fe 100644 --- a/interp.rs +++ b/interp.rs @@ -1,14 +1,14 @@ // good luck to those that follow; here be dragons -use super::constants::{SreFlag, SreOpcode, SRE_MAXREPEAT}; -use crate::builtins::PyStrRef; -use rustpython_common::borrow::BorrowValue; +use super::_sre::MAXREPEAT; +use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use std::collections::HashMap; use std::convert::TryFrom; pub struct State<'a> { - // py_string: PyStrRef, string: &'a str, + // chars count + string_len: usize, start: usize, end: usize, flags: SreFlag, @@ -23,17 +23,18 @@ pub struct State<'a> { impl<'a> State<'a> { pub(crate) fn new( - // py_string: PyStrRef, string: &'a str, start: usize, end: usize, flags: SreFlag, pattern_codes: Vec, ) -> Self { - // let string = py_string.borrow_value(); + let string_len = string.chars().count(); + let end = std::cmp::min(end, string_len); + let start = std::cmp::min(start, end); Self { - // py_string, string, + string_len, start, end, flags, @@ -59,6 +60,7 @@ impl<'a> State<'a> { pub(crate) fn pymatch(mut state: State) -> bool { let ctx = MatchContext { string_position: state.start, + string_offset: state.string.char_indices().nth(state.start).unwrap().0, code_position: 0, has_matched: None, }; @@ -85,6 +87,7 @@ pub(crate) fn pymatch(mut state: State) -> bool { #[derive(Debug, Copy, Clone)] struct MatchContext { string_position: usize, + string_offset: usize, 
code_position: usize, has_matched: Option, } @@ -108,6 +111,7 @@ impl<'a> MatchContextDrive<'a> { let ctx = self.ctx(); let child_ctx = MatchContext { string_position: ctx.string_position, + string_offset: ctx.string_offset, code_position: ctx.code_position + pattern_offset, has_matched: None, }; @@ -122,9 +126,7 @@ impl<'a> MatchContextDrive<'a> { } fn str(&self) -> &str { unsafe { - std::str::from_utf8_unchecked( - &self.state.string.as_bytes()[self.ctx().string_position..], - ) + std::str::from_utf8_unchecked(&self.state.string.as_bytes()[self.ctx().string_offset..]) } } fn peek_char(&self) -> char { @@ -135,61 +137,90 @@ impl<'a> MatchContextDrive<'a> { } fn skip_char(&mut self, skip_count: usize) { let skipped = self.str().char_indices().nth(skip_count).unwrap().0; - self.ctx_mut().string_position += skipped; + self.ctx_mut().string_position += skip_count; + self.ctx_mut().string_offset += skipped; } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; } fn remaining_chars(&self) -> usize { - let end = self.state.end; - end - self.ctx().string_position + self.str().len() + self.state.end - self.ctx().string_position } fn remaining_codes(&self) -> usize { self.state.pattern_codes.len() - self.ctx().code_position } fn at_beginning(&self) -> bool { - self.ctx().string_position == 0 + self.ctx().string_position == self.state.start } fn at_end(&self) -> bool { - self.str().is_empty() + self.ctx().string_position == self.state.end } fn at_linebreak(&self) -> bool { - match self.str().chars().next() { - Some(c) => c == '\n', - None => false, + !self.at_end() && is_linebreak(self.peek_char()) + } + fn at_boundary bool>(&self, mut word_checker: F) -> bool { + if self.at_beginning() && self.at_end() { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char()); + let this = !self.at_end() && word_checker(self.peek_char()); + this != that + } + fn back_peek_offset(&self) -> usize { + let bytes = 
self.state.string.as_bytes(); + let mut offset = self.ctx().string_offset - 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + panic!("not utf-8 code point"); + } + } + } + } + offset + } + fn back_peek_char(&self) -> char { + let bytes = self.state.string.as_bytes(); + let offset = self.back_peek_offset(); + let current_offset = self.ctx().string_offset; + let code = match current_offset - offset { + 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), + 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), + 3 => u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]), + 4 => u32::from_ne_bytes([ + bytes[offset], + bytes[offset + 1], + bytes[offset + 2], + bytes[offset + 3], + ]), + _ => unreachable!(), + }; + unsafe { std::mem::transmute(code) } + } + fn back_skip_char(&mut self, skip_count: usize) { + self.ctx_mut().string_position -= skip_count; + for _ in 0..skip_count { + self.ctx_mut().string_offset = self.back_peek_offset(); } } -} - -struct OpcodeDispatcher { - executing_contexts: HashMap>, -} - -macro_rules! 
once { - ($val:expr) => { - Box::new(OpEmpty {}) - }; } trait OpcodeExecutor { fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; } -struct OpFailure {} -impl OpcodeExecutor for OpFailure { +struct OpUnimplemented {} +impl OpcodeExecutor for OpUnimplemented { fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { drive.ctx_mut().has_matched = Some(false); None } } -struct OpEmpty {} -impl OpcodeExecutor for OpEmpty { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - None - } -} - struct OpOnce { f: Option, } @@ -204,6 +235,10 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } +fn unimplemented() -> Box { + Box::new(OpUnimplemented {}) +} + struct OpMinRepeatOne { trace_id: usize, mincount: usize, @@ -213,10 +248,11 @@ struct OpMinRepeatOne { } impl OpcodeExecutor for OpMinRepeatOne { fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - match self.trace_id { - 0 => self._0(drive), - _ => unreachable!(), - } + None + // match self.trace_id { + // 0 => self._0(drive), + // _ => unreachable!(), + // } } } impl Default for OpMinRepeatOne { @@ -230,75 +266,78 @@ impl Default for OpMinRepeatOne { } } } -impl OpMinRepeatOne { - fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; +// impl OpMinRepeatOne { +// fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// self.mincount = drive.peek_code(2) as usize; +// self.maxcount = drive.peek_code(3) as usize; - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } +// if drive.remaining_chars() < self.mincount { +// drive.ctx_mut().has_matched = Some(false); +// return None; +// } - drive.state.string_position = drive.ctx().string_position; +// drive.state.string_position = drive.ctx().string_position; - self.count = if self.mincount == 0 { - 0 - } else { - let count = 
count_repetitions(drive, self.mincount); - if count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(count); - count - }; +// self.count = if self.mincount == 0 { +// 0 +// } else { +// let count = count_repetitions(drive, self.mincount); +// if count < self.mincount { +// drive.ctx_mut().has_matched = Some(false); +// return None; +// } +// drive.skip_char(count); +// count +// }; - if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } +// if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { +// drive.state.string_position = drive.ctx().string_position; +// drive.ctx_mut().has_matched = Some(true); +// return None; +// } - // mark push - self.trace_id = 1; - self._1(drive) - } - fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { - drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); - self.trace_id = 2; - return Some(()); - } +// // mark push +// self.trace_id = 1; +// self._1(drive) +// } +// fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// if self.maxcount == SRE_MAXREPEAT || self.count <= self.maxcount { +// drive.state.string_position = drive.ctx().string_position; +// self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); +// self.trace_id = 2; +// return Some(()); +// } - // mark discard - drive.ctx_mut().has_matched = Some(false); - None - } - fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - if 
count_repetitions(drive, 1) == 0 { - self.trace_id = 3; - return self._3(drive); - } - drive.skip_char(1); - self.count += 1; - // marks pop keep - self.trace_id = 1; - self._1(drive) - } - fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - // mark discard - drive.ctx_mut().has_matched = Some(false); - None - } -} +// // mark discard +// drive.ctx_mut().has_matched = Some(false); +// None +// } +// fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { +// drive.ctx_mut().has_matched = Some(true); +// return None; +// } +// drive.state.string_position = drive.ctx().string_position; +// if count_repetitions(drive, 1) == 0 { +// self.trace_id = 3; +// return self._3(drive); +// } +// drive.skip_char(1); +// self.count += 1; +// // marks pop keep +// self.trace_id = 1; +// self._1(drive) +// } +// fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +// // mark discard +// drive.ctx_mut().has_matched = Some(false); +// None +// } +// } +struct OpcodeDispatcher { + executing_contexts: HashMap>, +} impl OpcodeDispatcher { fn new() -> Self { Self { @@ -313,7 +352,6 @@ impl OpcodeDispatcher { let code = drive.peek_code(0); let opcode = SreOpcode::try_from(code).unwrap(); self.dispatch(opcode, drive); - // self.drive = self.drive; } match drive.ctx().has_matched { Some(matched) => Some(matched), @@ -328,8 +366,8 @@ impl OpcodeDispatcher { // is done matching, False if it must be resumed when next encountered. 
fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { - Some((_, mut executor)) => executor, - None => self.dispatch_table(opcode, drive), + Some((_, executor)) => executor, + None => self.dispatch_table(opcode), }; if let Some(()) = executor.next(drive) { self.executing_contexts.insert(drive.id(), executor); @@ -339,71 +377,436 @@ impl OpcodeDispatcher { } } - fn dispatch_table( - &mut self, - opcode: SreOpcode, - drive: &mut MatchContextDrive, - ) -> Box { - // move || { + fn dispatch_table(&mut self, opcode: SreOpcode) -> Box { match opcode { - SreOpcode::FAILURE => { - Box::new(OpFailure {}) - } + SreOpcode::FAILURE => once(|drive| { + drive.ctx_mut().has_matched = Some(false); + }), SreOpcode::SUCCESS => once(|drive| { drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); }), - SreOpcode::ANY => once!(true), - SreOpcode::ANY_ALL => once!(true), - SreOpcode::ASSERT => once!(true), - SreOpcode::ASSERT_NOT => once!(true), - SreOpcode::AT => once!(true), - SreOpcode::BRANCH => once!(true), - SreOpcode::CALL => once!(true), - SreOpcode::CATEGORY => once!(true), - SreOpcode::CHARSET => once!(true), - SreOpcode::BIGCHARSET => once!(true), - SreOpcode::GROUPREF => once!(true), - SreOpcode::GROUPREF_EXISTS => once!(true), - SreOpcode::GROUPREF_IGNORE => once!(true), - SreOpcode::IN => once!(true), - SreOpcode::IN_IGNORE => once!(true), - SreOpcode::INFO => once!(true), - SreOpcode::JUMP => once!(true), - SreOpcode::LITERAL => { - if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + SreOpcode::ANY => once(|drive| { + if drive.at_end() || drive.at_linebreak() { drive.ctx_mut().has_matched = Some(false); } else { + drive.skip_code(1); drive.skip_char(1); } + }), + SreOpcode::ANY_ALL => once(|drive| { + if drive.at_end() { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(1); + 
drive.skip_char(1); + } + }), + SreOpcode::ASSERT => Box::new(OpAssert::default()), + SreOpcode::ASSERT_NOT => unimplemented(), + SreOpcode::AT => once(|drive| { + let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); + if !at(drive, atcode) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(2); + } + }), + SreOpcode::BRANCH => unimplemented(), + SreOpcode::CALL => unimplemented(), + SreOpcode::CATEGORY => unimplemented(), + SreOpcode::CHARSET => unimplemented(), + SreOpcode::BIGCHARSET => unimplemented(), + SreOpcode::GROUPREF => unimplemented(), + SreOpcode::GROUPREF_EXISTS => unimplemented(), + SreOpcode::GROUPREF_IGNORE => unimplemented(), + SreOpcode::IN => unimplemented(), + SreOpcode::IN_IGNORE => unimplemented(), + SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { + drive.skip_code(drive.peek_code(1) as usize + 1); + }), + SreOpcode::LITERAL => once(|drive| { + if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } drive.skip_code(2); - once!(true) - } - SreOpcode::LITERAL_IGNORE => once!(true), - SreOpcode::MARK => once!(true), - SreOpcode::MAX_UNTIL => once!(true), - SreOpcode::MIN_UNTIL => once!(true), - SreOpcode::NOT_LITERAL => once!(true), - SreOpcode::NOT_LITERAL_IGNORE => once!(true), - SreOpcode::NEGATE => once!(true), - SreOpcode::RANGE => once!(true), - SreOpcode::REPEAT => once!(true), - SreOpcode::REPEAT_ONE => once!(true), - SreOpcode::SUBPATTERN => once!(true), + drive.skip_char(1); + }), + SreOpcode::LITERAL_IGNORE => once(|drive| { + let code = drive.peek_code(1); + let c = drive.peek_char(); + if drive.at_end() + || (c.to_ascii_lowercase() as u32 != code + && c.to_ascii_uppercase() as u32 != code) + { + drive.ctx_mut().has_matched = Some(false); + } + drive.skip_code(2); + drive.skip_char(1); + }), + SreOpcode::MARK => unimplemented(), + SreOpcode::MAX_UNTIL => unimplemented(), + SreOpcode::MIN_UNTIL => unimplemented(), + 
SreOpcode::NOT_LITERAL => once(|drive| { + if drive.at_end() || drive.peek_char() as u32 == drive.peek_code(1) { + drive.ctx_mut().has_matched = Some(false); + } + drive.skip_code(2); + drive.skip_char(1); + }), + SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { + let code = drive.peek_code(1); + let c = drive.peek_char(); + if drive.at_end() + || (c.to_ascii_lowercase() as u32 == code + || c.to_ascii_uppercase() as u32 == code) + { + drive.ctx_mut().has_matched = Some(false); + } + drive.skip_code(2); + drive.skip_char(1); + }), + SreOpcode::NEGATE => unimplemented(), + SreOpcode::RANGE => unimplemented(), + SreOpcode::REPEAT => unimplemented(), + SreOpcode::REPEAT_ONE => unimplemented(), + SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), - SreOpcode::RANGE_IGNORE => once!(true), + SreOpcode::GROUPREF_LOC_IGNORE => unimplemented(), + SreOpcode::IN_LOC_IGNORE => unimplemented(), + SreOpcode::LITERAL_LOC_IGNORE => unimplemented(), + SreOpcode::NOT_LITERAL_LOC_IGNORE => unimplemented(), + SreOpcode::GROUPREF_UNI_IGNORE => unimplemented(), + SreOpcode::IN_UNI_IGNORE => unimplemented(), + SreOpcode::LITERAL_UNI_IGNORE => unimplemented(), + SreOpcode::NOT_LITERAL_UNI_IGNORE => unimplemented(), + SreOpcode::RANGE_UNI_IGNORE => unimplemented(), } } + + // Returns the number of repetitions of a single item, starting from the + // current string position. The code pointer is expected to point to a + // REPEAT_ONE operation (with the repeated 4 ahead). 
+ fn count_repetitions(&mut self, drive: &mut MatchContextDrive, maxcount: usize) -> usize { + let mut count = 0; + let mut real_maxcount = drive.remaining_chars(); + if maxcount < real_maxcount && maxcount != MAXREPEAT { + real_maxcount = maxcount; + } + let code_position = drive.ctx().code_position; + let string_position = drive.ctx().string_position; + drive.skip_code(4); + let reset_position = drive.ctx().code_position; + while count < real_maxcount { + drive.ctx_mut().code_position = reset_position; + let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); + self.dispatch(opcode, drive); + if drive.ctx().has_matched == Some(false) { + break; + } + count += 1; + } + drive.ctx_mut().has_matched = None; + drive.ctx_mut().code_position = code_position; + drive.ctx_mut().string_position = string_position; + count + } +} + +fn at(drive: &mut MatchContextDrive, atcode: SreAtCode) -> bool { + match atcode { + SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), + SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), + SreAtCode::BOUNDARY => drive.at_boundary(is_word), + SreAtCode::NON_BOUNDARY => !drive.at_boundary(is_word), + SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), + SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), + SreAtCode::END_STRING => drive.at_end(), + SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => !drive.at_boundary(is_loc_word), + SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => !drive.at_boundary(is_uni_word), + } +} + +fn category(catcode: SreCatCode, c: char) -> bool { + match catcode { + SreCatCode::DIGIT => is_digit(c), + SreCatCode::NOT_DIGIT => !is_digit(c), + SreCatCode::SPACE => is_space(c), + SreCatCode::NOT_SPACE => !is_space(c), + SreCatCode::WORD => is_word(c), + SreCatCode::NOT_WORD => !is_word(c), + SreCatCode::LINEBREAK => 
is_linebreak(c), + SreCatCode::NOT_LINEBREAK => !is_linebreak(c), + SreCatCode::LOC_WORD => is_loc_word(c), + SreCatCode::LOC_NOT_WORD => !is_loc_word(c), + SreCatCode::UNI_DIGIT => is_uni_digit(c), + SreCatCode::UNI_NOT_DIGIT => !is_uni_digit(c), + SreCatCode::UNI_SPACE => is_uni_space(c), + SreCatCode::UNI_NOT_SPACE => !is_uni_space(c), + SreCatCode::UNI_WORD => is_uni_word(c), + SreCatCode::UNI_NOT_WORD => !is_uni_word(c), + SreCatCode::UNI_LINEBREAK => is_uni_linebreak(c), + SreCatCode::UNI_NOT_LINEBREAK => !is_uni_linebreak(c), + } +} + +fn charset(set: &[u32], c: char) -> bool { + /* check if character is a member of the given set */ + let ch = c as u32; + let mut ok = true; + let mut i = 0; + while i < set.len() { + let opcode = match SreOpcode::try_from(set[i]) { + Ok(code) => code, + Err(_) => { + break; + } + }; + match opcode { + SreOpcode::FAILURE => { + return !ok; + } + SreOpcode::CATEGORY => { + /* */ + let catcode = match SreCatCode::try_from(set[i + 1]) { + Ok(code) => code, + Err(_) => { + break; + } + }; + if category(catcode, c) { + return ok; + } + i += 2; + } + SreOpcode::CHARSET => { + /* */ + if ch < 256 && (set[(ch / 32) as usize] & (1 << (32 - 1))) != 0 { + return ok; + } + i += 8; + } + SreOpcode::BIGCHARSET => { + /* <256 blockindices> */ + let count = set[i + 1]; + if ch < 0x10000 { + let blockindices: &[u8] = unsafe { std::mem::transmute(&set[i + 2..]) }; + let block = blockindices[(ch >> 8) as usize]; + if set[2 + 64 + ((block as u32 * 256 + (ch & 255)) / 32) as usize] + & (1 << (ch & (32 - 1))) + != 0 + { + return ok; + } + } + i += 2 + 64 + count as usize * 8; + } + SreOpcode::LITERAL => { + /* */ + if ch == set[i + 1] { + return ok; + } + i += 2; + } + SreOpcode::NEGATE => { + ok = !ok; + i += 1; + } + SreOpcode::RANGE => { + /* */ + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + i += 3; + } + SreOpcode::RANGE_UNI_IGNORE => { + /* */ + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + let ch = 
upper_unicode(c) as u32; + if set[i + 1] <= ch && ch <= set[i + 2] { + return ok; + } + i += 3; + } + _ => { + break; + } + } + } + /* internal error -- there's not much we can do about it + here, so let's just pretend it didn't match... */ + false +} + +fn count(drive: MatchContextDrive, maxcount: usize) -> usize { + let string_position = drive.state.string_position; + let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + + let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); + match opcode { + SreOpcode::FAILURE => {} + SreOpcode::SUCCESS => {} + SreOpcode::ANY => {} + SreOpcode::ANY_ALL => {} + SreOpcode::ASSERT => {} + SreOpcode::ASSERT_NOT => {} + SreOpcode::AT => {} + SreOpcode::BRANCH => {} + SreOpcode::CALL => {} + SreOpcode::CATEGORY => {} + SreOpcode::CHARSET => {} + SreOpcode::BIGCHARSET => {} + SreOpcode::GROUPREF => {} + SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::IN => { + } + SreOpcode::INFO => {} + SreOpcode::JUMP => {} + SreOpcode::LITERAL => {} + SreOpcode::MARK => {} + SreOpcode::MAX_UNTIL => {} + SreOpcode::MIN_UNTIL => {} + SreOpcode::NOT_LITERAL => {} + SreOpcode::NEGATE => {} + SreOpcode::RANGE => {} + SreOpcode::REPEAT => {} + SreOpcode::REPEAT_ONE => {} + SreOpcode::SUBPATTERN => {} + SreOpcode::MIN_REPEAT_ONE => {} + SreOpcode::GROUPREF_IGNORE => {} + SreOpcode::IN_IGNORE => {} + SreOpcode::LITERAL_IGNORE => {} + SreOpcode::NOT_LITERAL_IGNORE => {} + SreOpcode::GROUPREF_LOC_IGNORE => {} + SreOpcode::IN_LOC_IGNORE => {} + SreOpcode::LITERAL_LOC_IGNORE => {} + SreOpcode::NOT_LITERAL_LOC_IGNORE => {} + SreOpcode::GROUPREF_UNI_IGNORE => {} + SreOpcode::IN_UNI_IGNORE => {} + SreOpcode::LITERAL_UNI_IGNORE => {} + SreOpcode::NOT_LITERAL_UNI_IGNORE => {} + SreOpcode::RANGE_UNI_IGNORE => {} + } +} + +fn eq_loc_ignore(code: u32, c: char) -> bool { + code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 } -// Returns the number of repetitions of a single item, starting from the -// current 
string position. The code pointer is expected to point to a -// REPEAT_ONE operation (with the repeated 4 ahead). -fn count_repetitions(drive: &mut MatchContextDrive, maxcount: usize) -> usize { - let mut count = 0; - let mut real_maxcount = drive.state.end - drive.ctx().string_position; - if maxcount < real_maxcount && maxcount != SRE_MAXREPEAT { - real_maxcount = maxcount; +fn is_word(c: char) -> bool { + c.is_ascii_alphanumeric() || c == '_' +} +fn is_space(c: char) -> bool { + c.is_ascii_whitespace() +} +fn is_digit(c: char) -> bool { + c.is_ascii_digit() +} +fn is_loc_alnum(c: char) -> bool { + // TODO: check with cpython + c.is_alphanumeric() +} +fn is_loc_word(c: char) -> bool { + is_loc_alnum(c) || c == '_' +} +fn is_linebreak(c: char) -> bool { + c == '\n' +} +pub(crate) fn lower_ascii(c: char) -> char { + c.to_ascii_lowercase() +} +fn lower_locate(c: char) -> char { + // TODO: check with cpython + // https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase + c.to_lowercase().next().unwrap() +} +fn upper_locate(c: char) -> char { + // TODO: check with cpython + // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase + c.to_uppercase().next().unwrap() +} +fn is_uni_digit(c: char) -> bool { + // TODO: check with cpython + c.is_digit(10) +} +fn is_uni_space(c: char) -> bool { + // TODO: check with cpython + c.is_whitespace() +} +fn is_uni_linebreak(c: char) -> bool { + match c { + '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{001C}' | '\u{001D}' + | '\u{001E}' | '\u{0085}' | '\u{2028}' | '\u{2029}' => true, + _ => false, + } +} +fn is_uni_alnum(c: char) -> bool { + // TODO: check with cpython + c.is_alphanumeric() +} +fn is_uni_word(c: char) -> bool { + is_uni_alnum(c) || c == '_' +} +pub(crate) fn lower_unicode(c: char) -> char { + // TODO: check with cpython + c.to_lowercase().next().unwrap() +} +pub(crate) fn upper_unicode(c: char) -> char { + // TODO: check with cpython + c.to_uppercase().next().unwrap() +} + +fn 
is_utf8_first_byte(b: u8) -> bool { + // In UTF-8, there are three kinds of byte... + // 0xxxxxxx : ASCII + // 10xxxxxx : 2nd, 3rd or 4th byte of code + // 11xxxxxx : 1st byte of multibyte code + (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) +} + +struct OpAssert { + child_ctx_id: usize, + jump_id: usize, +} +impl Default for OpAssert { + fn default() -> Self { + OpAssert { + child_ctx_id: 0, + jump_id: 0, + } + } +} +impl OpcodeExecutor for OpAssert { + fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + match self.jump_id { + 0 => self._0(drive), + 1 => self._1(drive), + _ => unreachable!(), + } + } +} +impl OpAssert { + fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.state.string_position = drive.ctx().string_position - back; + self.child_ctx_id = drive.push_new_context(3); + self.jump_id = 1; + Some(()) + } + fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { + drive.skip_code(drive.peek_code(1) as usize + 1); + } else { + drive.ctx_mut().has_matched = Some(false); + } + None } - count } From 4e03fb361f731116d392cfdda0820353454fcaac Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 25 Dec 2020 16:31:22 +0200 Subject: [PATCH 004/705] upgrade sre_parse.py; impl marks count --- interp.rs | 633 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 365 insertions(+), 268 deletions(-) diff --git a/interp.rs b/interp.rs index bc6753a5fe..8e0968cbf5 100644 --- a/interp.rs +++ b/interp.rs @@ -5,6 +5,7 @@ use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use std::collections::HashMap; use std::convert::TryFrom; +#[derive(Debug)] pub struct State<'a> { string: &'a str, // chars count @@ -13,9 +14,9 @@ pub struct State<'a> { end: usize, flags: SreFlag, 
pattern_codes: Vec, - marks: Vec, + marks: Vec>, lastindex: isize, - marks_stack: Vec, + marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, repeat: Option, string_position: usize, @@ -55,12 +56,47 @@ impl<'a> State<'a> { self.context_stack.clear(); self.repeat = None; } + + fn set_mark(&mut self, mark_nr: usize, position: usize) { + if mark_nr & 1 != 0 { + self.lastindex = mark_nr as isize / 2 + 1; + } + if mark_nr >= self.marks.len() { + self.marks.resize(mark_nr + 1, None); + } + self.marks[mark_nr] = Some(position); + } + fn get_marks(&self, group_index: usize) -> (Option, Option) { + let marks_index = 2 * group_index; + if marks_index + 1 < self.marks.len() { + (self.marks[marks_index], self.marks[marks_index + 1]) + } else { + (None, None) + } + } + fn marks_push(&mut self) { + self.marks_stack.push(self.marks.clone(), self.lastindex); + } + fn marks_pop(&mut self) { + (self.marks, self.lastindex) = self.marks_stack.pop().unwrap(); + } + fn marks_pop_keep(&mut self) { + (self.marks, self.lastindex) = self.marks_stack.last().unwrap(); + } + fn marks_pop_discard(&mut self) { + self.marks_stack.pop(); + } } pub(crate) fn pymatch(mut state: State) -> bool { let ctx = MatchContext { string_position: state.start, - string_offset: state.string.char_indices().nth(state.start).unwrap().0, + string_offset: state + .string + .char_indices() + .nth(state.start) + .map(|x| x.0) + .unwrap_or(0), code_position: 0, has_matched: None, }; @@ -72,7 +108,7 @@ pub(crate) fn pymatch(mut state: State) -> bool { break; } let ctx_id = state.context_stack.len() - 1; - let mut drive = MatchContextDrive::drive(ctx_id, state); + let mut drive = StackDrive::drive(ctx_id, state); let mut dispatcher = OpcodeDispatcher::new(); has_matched = dispatcher.pymatch(&mut drive); @@ -92,68 +128,52 @@ struct MatchContext { has_matched: Option, } -struct MatchContextDrive<'a> { - state: State<'a>, - ctx_id: usize, -} - -impl<'a> MatchContextDrive<'a> { - fn id(&self) -> usize { - self.ctx_id - } - fn 
ctx_mut(&mut self) -> &mut MatchContext { - &mut self.state.context_stack[self.ctx_id] - } - fn ctx(&self) -> &MatchContext { - &self.state.context_stack[self.ctx_id] - } - fn push_new_context(&mut self, pattern_offset: usize) -> usize { - let ctx = self.ctx(); - let child_ctx = MatchContext { - string_position: ctx.string_position, - string_offset: ctx.string_offset, - code_position: ctx.code_position + pattern_offset, - has_matched: None, - }; - self.state.context_stack.push(child_ctx); - self.state.context_stack.len() - 1 - } - fn drive(ctx_id: usize, state: State<'a>) -> Self { - Self { state, ctx_id } - } - fn take(self) -> State<'a> { - self.state - } +trait MatchContextDrive { + fn ctx_mut(&mut self) -> &mut MatchContext; + fn ctx(&self) -> &MatchContext; + fn state(&self) -> &State; fn str(&self) -> &str { unsafe { - std::str::from_utf8_unchecked(&self.state.string.as_bytes()[self.ctx().string_offset..]) + std::str::from_utf8_unchecked( + &self.state().string.as_bytes()[self.ctx().string_offset..], + ) } } + fn pattern(&self) -> &[u32] { + &self.state().pattern_codes[self.ctx().code_position..] 
+ } fn peek_char(&self) -> char { self.str().chars().next().unwrap() } fn peek_code(&self, peek: usize) -> u32 { - self.state.pattern_codes[self.ctx().code_position + peek] + self.state().pattern_codes[self.ctx().code_position + peek] } fn skip_char(&mut self, skip_count: usize) { - let skipped = self.str().char_indices().nth(skip_count).unwrap().0; - self.ctx_mut().string_position += skip_count; - self.ctx_mut().string_offset += skipped; + match self.str().char_indices().nth(skip_count).map(|x| x.0) { + Some(skipped) => { + self.ctx_mut().string_position += skip_count; + self.ctx_mut().string_offset += skipped; + } + None => { + self.ctx_mut().string_position = self.state().end; + self.ctx_mut().string_offset = self.state().string.len(); // bytes len + } + } } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; } fn remaining_chars(&self) -> usize { - self.state.end - self.ctx().string_position + self.state().end - self.ctx().string_position } fn remaining_codes(&self) -> usize { - self.state.pattern_codes.len() - self.ctx().code_position + self.state().pattern_codes.len() - self.ctx().code_position } fn at_beginning(&self) -> bool { - self.ctx().string_position == self.state.start + self.ctx().string_position == self.state().start } fn at_end(&self) -> bool { - self.ctx().string_position == self.state.end + self.ctx().string_position == self.state().end } fn at_linebreak(&self) -> bool { !self.at_end() && is_linebreak(self.peek_char()) @@ -167,7 +187,7 @@ impl<'a> MatchContextDrive<'a> { this != that } fn back_peek_offset(&self) -> usize { - let bytes = self.state.string.as_bytes(); + let bytes = self.state().string.as_bytes(); let mut offset = self.ctx().string_offset - 1; if !is_utf8_first_byte(bytes[offset]) { offset -= 1; @@ -184,7 +204,7 @@ impl<'a> MatchContextDrive<'a> { offset } fn back_peek_char(&self) -> char { - let bytes = self.state.string.as_bytes(); + let bytes = self.state().string.as_bytes(); let offset = 
self.back_peek_offset(); let current_offset = self.ctx().string_offset; let code = match current_offset - offset { @@ -209,13 +229,74 @@ impl<'a> MatchContextDrive<'a> { } } +struct StackDrive<'a> { + state: State<'a>, + ctx_id: usize, +} +impl<'a> StackDrive<'a> { + fn id(&self) -> usize { + self.ctx_id + } + fn drive(ctx_id: usize, state: State<'a>) -> Self { + Self { state, ctx_id } + } + fn take(self) -> State<'a> { + self.state + } + fn push_new_context(&mut self, pattern_offset: usize) -> usize { + let ctx = self.ctx(); + let child_ctx = MatchContext { + string_position: ctx.string_position, + string_offset: ctx.string_offset, + code_position: ctx.code_position + pattern_offset, + has_matched: None, + }; + self.state.context_stack.push(child_ctx); + self.state.context_stack.len() - 1 + } +} +impl MatchContextDrive for StackDrive<'_> { + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.state.context_stack[self.ctx_id] + } + fn ctx(&self) -> &MatchContext { + &self.state.context_stack[self.ctx_id] + } + fn state(&self) -> &State { + &self.state + } +} + +struct WrapDrive<'a> { + stack_drive: &'a StackDrive<'a>, + ctx: MatchContext, +} +impl<'a> WrapDrive<'a> { + fn drive(ctx: MatchContext, stack_drive: &'a StackDrive<'a>) -> Self { + Self { stack_drive, ctx } + } +} +impl MatchContextDrive for WrapDrive<'_> { + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.ctx + } + + fn ctx(&self) -> &MatchContext { + &self.ctx + } + + fn state(&self) -> &State { + self.stack_drive.state() + } +} + trait OpcodeExecutor { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()>; + fn next(&mut self, drive: &mut StackDrive) -> Option<()>; } struct OpUnimplemented {} impl OpcodeExecutor for OpUnimplemented { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { drive.ctx_mut().has_matched = Some(false); None } @@ -224,14 +305,14 @@ impl OpcodeExecutor for OpUnimplemented { 
struct OpOnce { f: Option, } -impl OpcodeExecutor for OpOnce { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { +impl OpcodeExecutor for OpOnce { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { let f = self.f.take()?; f(drive); None } } -fn once(f: F) -> Box> { +fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } @@ -239,102 +320,6 @@ fn unimplemented() -> Box { Box::new(OpUnimplemented {}) } -struct OpMinRepeatOne { - trace_id: usize, - mincount: usize, - maxcount: usize, - count: usize, - child_ctx_id: usize, -} -impl OpcodeExecutor for OpMinRepeatOne { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { - None - // match self.trace_id { - // 0 => self._0(drive), - // _ => unreachable!(), - // } - } -} -impl Default for OpMinRepeatOne { - fn default() -> Self { - OpMinRepeatOne { - trace_id: 0, - mincount: 0, - maxcount: 0, - count: 0, - child_ctx_id: 0, - } - } -} -// impl OpMinRepeatOne { -// fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// self.mincount = drive.peek_code(2) as usize; -// self.maxcount = drive.peek_code(3) as usize; - -// if drive.remaining_chars() < self.mincount { -// drive.ctx_mut().has_matched = Some(false); -// return None; -// } - -// drive.state.string_position = drive.ctx().string_position; - -// self.count = if self.mincount == 0 { -// 0 -// } else { -// let count = count_repetitions(drive, self.mincount); -// if count < self.mincount { -// drive.ctx_mut().has_matched = Some(false); -// return None; -// } -// drive.skip_char(count); -// count -// }; - -// if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { -// drive.state.string_position = drive.ctx().string_position; -// drive.ctx_mut().has_matched = Some(true); -// return None; -// } - -// // mark push -// self.trace_id = 1; -// self._1(drive) -// } -// fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// if self.maxcount == SRE_MAXREPEAT || self.count <= 
self.maxcount { -// drive.state.string_position = drive.ctx().string_position; -// self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); -// self.trace_id = 2; -// return Some(()); -// } - -// // mark discard -// drive.ctx_mut().has_matched = Some(false); -// None -// } -// fn _2(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { -// drive.ctx_mut().has_matched = Some(true); -// return None; -// } -// drive.state.string_position = drive.ctx().string_position; -// if count_repetitions(drive, 1) == 0 { -// self.trace_id = 3; -// return self._3(drive); -// } -// drive.skip_char(1); -// self.count += 1; -// // marks pop keep -// self.trace_id = 1; -// self._1(drive) -// } -// fn _3(&mut self, drive: &mut MatchContextDrive) -> Option<()> { -// // mark discard -// drive.ctx_mut().has_matched = Some(false); -// None -// } -// } - struct OpcodeDispatcher { executing_contexts: HashMap>, } @@ -347,7 +332,7 @@ impl OpcodeDispatcher { // Returns True if the current context matches, False if it doesn't and // None if matching is not finished, ie must be resumed after child // contexts have been matched. - fn pymatch(&mut self, drive: &mut MatchContextDrive) -> Option { + fn pymatch(&mut self, drive: &mut StackDrive) -> Option { while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { let code = drive.peek_code(0); let opcode = SreOpcode::try_from(code).unwrap(); @@ -364,7 +349,7 @@ impl OpcodeDispatcher { // Dispatches a context on a given opcode. Returns True if the context // is done matching, False if it must be resumed when next encountered. 
- fn dispatch(&mut self, opcode: SreOpcode, drive: &mut MatchContextDrive) -> bool { + fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { Some((_, executor)) => executor, None => self.dispatch_table(opcode), @@ -414,58 +399,67 @@ impl OpcodeDispatcher { }), SreOpcode::BRANCH => unimplemented(), SreOpcode::CALL => unimplemented(), - SreOpcode::CATEGORY => unimplemented(), - SreOpcode::CHARSET => unimplemented(), - SreOpcode::BIGCHARSET => unimplemented(), + SreOpcode::CATEGORY => once(|drive| { + let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); + if drive.at_end() || !category(catcode, drive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(2); + drive.skip_char(1); + } + }), + SreOpcode::CHARSET | SreOpcode::BIGCHARSET => unreachable!("unexpected opcode"), SreOpcode::GROUPREF => unimplemented(), SreOpcode::GROUPREF_EXISTS => unimplemented(), SreOpcode::GROUPREF_IGNORE => unimplemented(), - SreOpcode::IN => unimplemented(), - SreOpcode::IN_IGNORE => unimplemented(), + SreOpcode::IN => once(|drive| { + general_op_in(drive, |x| x); + }), + SreOpcode::IN_IGNORE => once(|drive| { + general_op_in(drive, lower_ascii); + }), + SreOpcode::IN_UNI_IGNORE => once(|drive| { + general_op_in(drive, lower_unicode); + }), + SreOpcode::IN_LOC_IGNORE => once(|drive| { + let skip = drive.peek_code(1) as usize; + if drive.at_end() || !charset_loc_ignore(&drive.pattern()[1..], drive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(skip + 1); + drive.skip_char(1); + } + }), SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { drive.skip_code(drive.peek_code(1) as usize + 1); }), SreOpcode::LITERAL => once(|drive| { - if drive.at_end() || drive.peek_char() as u32 != drive.peek_code(1) { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); + 
general_op_literal(drive, |code, c| code == c as u32); + }), + SreOpcode::NOT_LITERAL => once(|drive| { + general_op_literal(drive, |code, c| code != c as u32); }), SreOpcode::LITERAL_IGNORE => once(|drive| { - let code = drive.peek_code(1); - let c = drive.peek_char(); - if drive.at_end() - || (c.to_ascii_lowercase() as u32 != code - && c.to_ascii_uppercase() as u32 != code) - { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); + general_op_literal(drive, |code, c| code == lower_ascii(c) as u32); + }), + SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| code != lower_ascii(c) as u32); + }), + SreOpcode::LITERAL_UNI_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| code == lower_unicode(c) as u32); + }), + SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| code != lower_unicode(c) as u32); + }), + SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| char_loc_ignore(code, c)); + }), + SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { + general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); }), SreOpcode::MARK => unimplemented(), SreOpcode::MAX_UNTIL => unimplemented(), SreOpcode::MIN_UNTIL => unimplemented(), - SreOpcode::NOT_LITERAL => once(|drive| { - if drive.at_end() || drive.peek_char() as u32 == drive.peek_code(1) { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); - }), - SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { - let code = drive.peek_code(1); - let c = drive.peek_char(); - if drive.at_end() - || (c.to_ascii_lowercase() as u32 == code - || c.to_ascii_uppercase() as u32 == code) - { - drive.ctx_mut().has_matched = Some(false); - } - drive.skip_code(2); - drive.skip_char(1); - }), SreOpcode::NEGATE => unimplemented(), SreOpcode::RANGE => unimplemented(), SreOpcode::REPEAT => unimplemented(), @@ -473,47 +467,45 @@ impl OpcodeDispatcher { 
SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF_LOC_IGNORE => unimplemented(), - SreOpcode::IN_LOC_IGNORE => unimplemented(), - SreOpcode::LITERAL_LOC_IGNORE => unimplemented(), - SreOpcode::NOT_LITERAL_LOC_IGNORE => unimplemented(), SreOpcode::GROUPREF_UNI_IGNORE => unimplemented(), - SreOpcode::IN_UNI_IGNORE => unimplemented(), - SreOpcode::LITERAL_UNI_IGNORE => unimplemented(), - SreOpcode::NOT_LITERAL_UNI_IGNORE => unimplemented(), SreOpcode::RANGE_UNI_IGNORE => unimplemented(), } } +} - // Returns the number of repetitions of a single item, starting from the - // current string position. The code pointer is expected to point to a - // REPEAT_ONE operation (with the repeated 4 ahead). - fn count_repetitions(&mut self, drive: &mut MatchContextDrive, maxcount: usize) -> usize { - let mut count = 0; - let mut real_maxcount = drive.remaining_chars(); - if maxcount < real_maxcount && maxcount != MAXREPEAT { - real_maxcount = maxcount; - } - let code_position = drive.ctx().code_position; - let string_position = drive.ctx().string_position; - drive.skip_code(4); - let reset_position = drive.ctx().code_position; - while count < real_maxcount { - drive.ctx_mut().code_position = reset_position; - let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); - self.dispatch(opcode, drive); - if drive.ctx().has_matched == Some(false) { - break; - } - count += 1; - } - drive.ctx_mut().has_matched = None; - drive.ctx_mut().code_position = code_position; - drive.ctx_mut().string_position = string_position; - count +fn char_loc_ignore(code: u32, c: char) -> bool { + code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 +} + +fn charset_loc_ignore(set: &[u32], c: char) -> bool { + let lo = lower_locate(c); + if charset(set, c) { + return true; } + let up = upper_locate(c); + up != lo && charset(set, up) } -fn at(drive: &mut MatchContextDrive, atcode: SreAtCode) -> 
bool { +fn general_op_literal bool>(drive: &mut StackDrive, f: F) { + if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(2); + drive.skip_char(1); + } +} + +fn general_op_in char>(drive: &mut StackDrive, f: F) { + let skip = drive.peek_code(1) as usize; + if drive.at_end() || !charset(&drive.pattern()[1..], f(drive.peek_char())) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(skip + 1); + drive.skip_char(1); + } +} + +fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), @@ -642,54 +634,67 @@ fn charset(set: &[u32], c: char) -> bool { false } -fn count(drive: MatchContextDrive, maxcount: usize) -> usize { - let string_position = drive.state.string_position; +fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { + let drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + let opcode = match SreOpcode::try_from(drive.peek_code(1)) { + Ok(code) => code, + Err(_) => { + panic!("FIXME:COUNT1"); + } + }; - let opcode = SreOpcode::try_from(drive.peek_code(1)).unwrap(); match opcode { - SreOpcode::FAILURE => {} - SreOpcode::SUCCESS => {} - SreOpcode::ANY => {} - SreOpcode::ANY_ALL => {} - SreOpcode::ASSERT => {} - SreOpcode::ASSERT_NOT => {} - SreOpcode::AT => {} - SreOpcode::BRANCH => {} - SreOpcode::CALL => {} - SreOpcode::CATEGORY => {} - SreOpcode::CHARSET => {} - SreOpcode::BIGCHARSET => {} - SreOpcode::GROUPREF => {} - SreOpcode::GROUPREF_EXISTS => {} + SreOpcode::ANY => { + while !drive.at_end() && !drive.at_linebreak() { + drive.skip_char(1); + } + } + SreOpcode::ANY_ALL => { + drive.skip_char(drive.remaining_chars()); + } SreOpcode::IN => { + // TODO: pattern[2 or 1..]? 
+ while !drive.at_end() && charset(&drive.pattern()[2..], drive.peek_char()) { + drive.skip_char(1); + } + } + SreOpcode::LITERAL => { + general_count_literal(drive, |code, c| code == c as u32); + } + SreOpcode::NOT_LITERAL => { + general_count_literal(drive, |code, c| code != c as u32); + } + SreOpcode::LITERAL_IGNORE => { + general_count_literal(drive, |code, c| code == lower_ascii(c) as u32); + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_count_literal(drive, |code, c| code != lower_ascii(c) as u32); + } + SreOpcode::LITERAL_LOC_IGNORE => { + general_count_literal(drive, |code, c| char_loc_ignore(code, c)); + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_count_literal(drive, |code, c| !char_loc_ignore(code, c)); + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_count_literal(drive, |code, c| code == lower_unicode(c) as u32); + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_count_literal(drive, |code, c| code != lower_unicode(c) as u32); } - SreOpcode::INFO => {} - SreOpcode::JUMP => {} - SreOpcode::LITERAL => {} - SreOpcode::MARK => {} - SreOpcode::MAX_UNTIL => {} - SreOpcode::MIN_UNTIL => {} - SreOpcode::NOT_LITERAL => {} - SreOpcode::NEGATE => {} - SreOpcode::RANGE => {} - SreOpcode::REPEAT => {} - SreOpcode::REPEAT_ONE => {} - SreOpcode::SUBPATTERN => {} - SreOpcode::MIN_REPEAT_ONE => {} - SreOpcode::GROUPREF_IGNORE => {} - SreOpcode::IN_IGNORE => {} - SreOpcode::LITERAL_IGNORE => {} - SreOpcode::NOT_LITERAL_IGNORE => {} - SreOpcode::GROUPREF_LOC_IGNORE => {} - SreOpcode::IN_LOC_IGNORE => {} - SreOpcode::LITERAL_LOC_IGNORE => {} - SreOpcode::NOT_LITERAL_LOC_IGNORE => {} - SreOpcode::GROUPREF_UNI_IGNORE => {} - SreOpcode::IN_UNI_IGNORE => {} - SreOpcode::LITERAL_UNI_IGNORE => {} - SreOpcode::NOT_LITERAL_UNI_IGNORE => {} - SreOpcode::RANGE_UNI_IGNORE => {} + _ => { + panic!("TODO: Not Implemented."); + } + } + + drive.ctx().string_position - stack_drive.ctx().string_position +} + +fn general_count_literal bool>(drive: &mut WrapDrive, f: F) { + 
let ch = drive.peek_code(1); + while !drive.at_end() && f(ch, drive.peek_char()) { + drive.skip_char(1); } } @@ -781,7 +786,7 @@ impl Default for OpAssert { } } impl OpcodeExecutor for OpAssert { - fn next(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => self._0(drive), 1 => self._1(drive), @@ -790,7 +795,7 @@ impl OpcodeExecutor for OpAssert { } } impl OpAssert { - fn _0(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { let back = drive.peek_code(2) as usize; if back > drive.ctx().string_position { drive.ctx_mut().has_matched = Some(false); @@ -801,7 +806,7 @@ impl OpAssert { self.jump_id = 1; Some(()) } - fn _1(&mut self, drive: &mut MatchContextDrive) -> Option<()> { + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { drive.skip_code(drive.peek_code(1) as usize + 1); } else { @@ -810,3 +815,95 @@ impl OpAssert { None } } + +struct OpMinRepeatOne { + jump_id: usize, + mincount: usize, + maxcount: usize, + count: usize, + child_ctx_id: usize, +} +impl OpcodeExecutor for OpMinRepeatOne { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => self._0(drive), + 1 => self._1(drive), + 2 => self._2(drive), + _ => unreachable!(), + } + } +} +impl Default for OpMinRepeatOne { + fn default() -> Self { + OpMinRepeatOne { + jump_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpMinRepeatOne { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + drive.state.string_position = drive.ctx().string_position; + + self.count = if self.mincount == 0 { + 0 + 
} else { + let count = count(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; + + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + drive.state.marks_push(); + self.jump_id = 1; + self._1(drive) + } + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { + if self.maxcount == MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + return Some(()); + } + + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + None + } + fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { + if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count(drive, 1) == 0 { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(1); + self.count += 1; + drive.state.marks_pop_keep(); + self.jump_id = 1; + self._1(drive) + } +} From 78485fd8df79444283d3cb978654500da0ce0590 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 26 Dec 2020 16:59:30 +0200 Subject: [PATCH 005/705] create _sre.Match --- interp.rs | 68 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/interp.rs b/interp.rs index 8e0968cbf5..71d89cc0e8 100644 --- a/interp.rs +++ b/interp.rs @@ -1,21 +1,24 @@ // good luck to those that follow; here be dragons -use super::_sre::MAXREPEAT; +use rustpython_common::borrow::BorrowValue; + +use super::_sre::{Match, Pattern, MAXREPEAT}; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; 
+use crate::builtins::PyStrRef; use std::collections::HashMap; use std::convert::TryFrom; #[derive(Debug)] -pub struct State<'a> { +pub(crate) struct State<'a> { string: &'a str, // chars count string_len: usize, - start: usize, - end: usize, + pub start: usize, + pub end: usize, flags: SreFlag, - pattern_codes: Vec, + pattern_codes: &'a [u32], marks: Vec>, - lastindex: isize, + pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, repeat: Option, @@ -28,7 +31,7 @@ impl<'a> State<'a> { start: usize, end: usize, flags: SreFlag, - pattern_codes: Vec, + pattern_codes: &'a [u32], ) -> Self { let string_len = string.chars().count(); let end = std::cmp::min(end, string_len); @@ -75,20 +78,36 @@ impl<'a> State<'a> { } } fn marks_push(&mut self) { - self.marks_stack.push(self.marks.clone(), self.lastindex); + self.marks_stack.push((self.marks.clone(), self.lastindex)); } fn marks_pop(&mut self) { - (self.marks, self.lastindex) = self.marks_stack.pop().unwrap(); + let (marks, lastindex) = self.marks_stack.pop().unwrap(); + self.marks = marks; + self.lastindex = lastindex; } fn marks_pop_keep(&mut self) { - (self.marks, self.lastindex) = self.marks_stack.last().unwrap(); + let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); + self.marks = marks; + self.lastindex = lastindex; } fn marks_pop_discard(&mut self) { self.marks_stack.pop(); } } -pub(crate) fn pymatch(mut state: State) -> bool { +pub(crate) fn pymatch( + string: PyStrRef, + start: usize, + end: usize, + pattern: &Pattern, +) -> Option { + let mut state = State::new( + string.borrow_value(), + start, + end, + pattern.flags.clone(), + &pattern.code, + ); let ctx = MatchContext { string_position: state.start, string_offset: state @@ -117,7 +136,12 @@ pub(crate) fn pymatch(mut state: State) -> bool { state.context_stack.pop(); } } - has_matched.unwrap_or(false) + + if has_matched == None || has_matched == Some(false) { + return None; + } + + Some(Match::new(&state, 
pattern.pattern.clone(), string.clone())) } #[derive(Debug, Copy, Clone)] @@ -635,7 +659,7 @@ fn charset(set: &[u32], c: char) -> bool { } fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { - let drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); + let mut drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let opcode = match SreOpcode::try_from(drive.peek_code(1)) { Ok(code) => code, @@ -660,28 +684,28 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { } } SreOpcode::LITERAL => { - general_count_literal(drive, |code, c| code == c as u32); + general_count_literal(&mut drive, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(drive, |code, c| code != c as u32); + general_count_literal(&mut drive, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(drive, |code, c| code == lower_ascii(c) as u32); + general_count_literal(&mut drive, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(drive, |code, c| code != lower_ascii(c) as u32); + general_count_literal(&mut drive, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(drive, |code, c| char_loc_ignore(code, c)); + general_count_literal(&mut drive, |code, c| char_loc_ignore(code, c)); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(drive, |code, c| !char_loc_ignore(code, c)); + general_count_literal(&mut drive, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(drive, |code, c| code == lower_unicode(c) as u32); + general_count_literal(&mut drive, |code, c| code == lower_unicode(c) as u32); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(drive, |code, c| code != lower_unicode(c) as u32); + general_count_literal(&mut drive, |code, c| code != lower_unicode(c) 
as u32); } _ => { panic!("TODO: Not Implemented."); @@ -691,7 +715,7 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { drive.ctx().string_position - stack_drive.ctx().string_position } -fn general_count_literal bool>(drive: &mut WrapDrive, f: F) { +fn general_count_literal bool>(drive: &mut WrapDrive, mut f: F) { let ch = drive.peek_code(1); while !drive.at_end() && f(ch, drive.peek_char()) { drive.skip_char(1); From aa0f20b93e86e6b63eadefb1f17958b2d513ccd3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 26 Dec 2020 18:44:33 +0200 Subject: [PATCH 006/705] impl Pattern.fullmatch, Pattern.search --- interp.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/interp.rs b/interp.rs index 71d89cc0e8..f87cc85abc 100644 --- a/interp.rs +++ b/interp.rs @@ -1,10 +1,9 @@ // good luck to those that follow; here be dragons -use rustpython_common::borrow::BorrowValue; - use super::_sre::{Match, Pattern, MAXREPEAT}; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use crate::builtins::PyStrRef; +use rustpython_common::borrow::BorrowValue; use std::collections::HashMap; use std::convert::TryFrom; From 312e5b875677bbc57b085ffae98f230e56717d21 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 27 Dec 2020 21:27:06 +0200 Subject: [PATCH 007/705] impl opcode groupref and assert_not --- interp.rs | 143 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 124 insertions(+), 19 deletions(-) diff --git a/interp.rs b/interp.rs index f87cc85abc..88dbb034da 100644 --- a/interp.rs +++ b/interp.rs @@ -109,12 +109,7 @@ pub(crate) fn pymatch( ); let ctx = MatchContext { string_position: state.start, - string_offset: state - .string - .char_indices() - .nth(state.start) - .map(|x| x.0) - .unwrap_or(0), + string_offset: calc_string_offset(state.string, state.start), code_position: 0, has_matched: None, }; @@ -411,7 +406,7 @@ impl OpcodeDispatcher { } }), SreOpcode::ASSERT => Box::new(OpAssert::default()), - 
SreOpcode::ASSERT_NOT => unimplemented(), + SreOpcode::ASSERT_NOT => Box::new(OpAssertNot::default()), SreOpcode::AT => once(|drive| { let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); if !at(drive, atcode) { @@ -432,9 +427,6 @@ impl OpcodeDispatcher { } }), SreOpcode::CHARSET | SreOpcode::BIGCHARSET => unreachable!("unexpected opcode"), - SreOpcode::GROUPREF => unimplemented(), - SreOpcode::GROUPREF_EXISTS => unimplemented(), - SreOpcode::GROUPREF_IGNORE => unimplemented(), SreOpcode::IN => once(|drive| { general_op_in(drive, |x| x); }), @@ -480,7 +472,12 @@ impl OpcodeDispatcher { SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); }), - SreOpcode::MARK => unimplemented(), + SreOpcode::MARK => once(|drive| { + drive + .state + .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); + drive.skip_code(2); + }), SreOpcode::MAX_UNTIL => unimplemented(), SreOpcode::MIN_UNTIL => unimplemented(), SreOpcode::NEGATE => unimplemented(), @@ -489,13 +486,36 @@ impl OpcodeDispatcher { SreOpcode::REPEAT_ONE => unimplemented(), SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), - SreOpcode::GROUPREF_LOC_IGNORE => unimplemented(), - SreOpcode::GROUPREF_UNI_IGNORE => unimplemented(), + SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), + SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), + SreOpcode::GROUPREF_LOC_IGNORE => { + once(|drive| general_op_groupref(drive, lower_locate)) + } + SreOpcode::GROUPREF_UNI_IGNORE => { + once(|drive| general_op_groupref(drive, lower_unicode)) + } + SreOpcode::GROUPREF_EXISTS => once(|drive| { + let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); + match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => { + drive.skip_code(3); + } + _ => drive.skip_code(drive.peek_code(2) as usize + 1), + } + 
}), SreOpcode::RANGE_UNI_IGNORE => unimplemented(), } } } +fn calc_string_offset(string: &str, position: usize) -> usize { + string + .char_indices() + .nth(position) + .map(|(i, _)| i) + .unwrap_or(0) +} + fn char_loc_ignore(code: u32, c: char) -> bool { code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 } @@ -509,6 +529,40 @@ fn charset_loc_ignore(set: &[u32], c: char) -> bool { up != lo && charset(set, up) } +fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) { + let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); + let (group_start, group_end) = match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => (start, end), + _ => { + drive.ctx_mut().has_matched = Some(false); + return; + } + }; + let mut wdrive = WrapDrive::drive(*drive.ctx(), &drive); + let mut gdrive = WrapDrive::drive( + MatchContext { + string_position: group_start, + // TODO: cache the offset + string_offset: calc_string_offset(drive.state.string, group_start), + ..*drive.ctx() + }, + &drive, + ); + for _ in group_start..group_end { + if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { + drive.ctx_mut().has_matched = Some(false); + return; + } + wdrive.skip_char(1); + gdrive.skip_char(1); + } + let position = wdrive.ctx().string_position; + let offset = wdrive.ctx().string_offset; + drive.skip_code(2); + drive.ctx_mut().string_position = position; + drive.ctx_mut().string_offset = offset; +} + fn general_op_literal bool>(drive: &mut StackDrive, f: F) { if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); @@ -766,11 +820,19 @@ fn is_uni_space(c: char) -> bool { c.is_whitespace() } fn is_uni_linebreak(c: char) -> bool { - match c { - '\u{000A}' | '\u{000B}' | '\u{000C}' | '\u{000D}' | '\u{001C}' | '\u{001D}' - | '\u{001E}' | '\u{0085}' | '\u{2028}' | '\u{2029}' => true, - _ => false, - } + matches!( + c, + '\u{000A}' + | 
'\u{000B}' + | '\u{000C}' + | '\u{000D}' + | '\u{001C}' + | '\u{001D}' + | '\u{001E}' + | '\u{0085}' + | '\u{2028}' + | '\u{2029}' + ) } fn is_uni_alnum(c: char) -> bool { // TODO: check with cpython @@ -802,7 +864,7 @@ struct OpAssert { } impl Default for OpAssert { fn default() -> Self { - OpAssert { + Self { child_ctx_id: 0, jump_id: 0, } @@ -839,6 +901,49 @@ impl OpAssert { } } +struct OpAssertNot { + child_ctx_id: usize, + jump_id: usize, +} +impl Default for OpAssertNot { + fn default() -> Self { + Self { + child_ctx_id: 0, + jump_id: 0, + } + } +} +impl OpcodeExecutor for OpAssertNot { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => self._0(drive), + 1 => self._1(drive), + _ => unreachable!(), + } + } +} +impl OpAssertNot { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.skip_code(drive.peek_code(1) as usize + 1); + return None; + } + drive.state.string_position = drive.ctx().string_position - back; + self.child_ctx_id = drive.push_new_context(3); + self.jump_id = 1; + Some(()) + } + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { + if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(drive.peek_code(1) as usize + 1); + } + None + } +} + struct OpMinRepeatOne { jump_id: usize, mincount: usize, From 0b2c8d1fa256a0b40223f97b2a5de6b2747c3397 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 10:33:28 +0200 Subject: [PATCH 008/705] impl OpMaxUntil --- interp.rs | 133 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 122 insertions(+), 11 deletions(-) diff --git a/interp.rs b/interp.rs index 88dbb034da..85787ae516 100644 --- a/interp.rs +++ b/interp.rs @@ -20,7 +20,7 @@ pub(crate) struct State<'a> { pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, - 
repeat: Option, + repeat_stack: Vec, string_position: usize, } @@ -45,7 +45,7 @@ impl<'a> State<'a> { lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - repeat: None, + repeat_stack: Vec::new(), marks: Vec::new(), string_position: start, } @@ -56,7 +56,7 @@ impl<'a> State<'a> { self.lastindex = -1; self.marks_stack.clear(); self.context_stack.clear(); - self.repeat = None; + self.repeat_stack.clear(); } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -104,7 +104,7 @@ pub(crate) fn pymatch( string.borrow_value(), start, end, - pattern.flags.clone(), + pattern.flags, &pattern.code, ); let ctx = MatchContext { @@ -237,6 +237,7 @@ trait MatchContextDrive { ]), _ => unreachable!(), }; + // TODO: char::from_u32_unchecked is stable from 1.5.0 unsafe { std::mem::transmute(code) } } fn back_skip_char(&mut self, skip_count: usize) { @@ -426,7 +427,6 @@ impl OpcodeDispatcher { drive.skip_char(1); } }), - SreOpcode::CHARSET | SreOpcode::BIGCHARSET => unreachable!("unexpected opcode"), SreOpcode::IN => once(|drive| { general_op_in(drive, |x| x); }), @@ -467,7 +467,7 @@ impl OpcodeDispatcher { general_op_literal(drive, |code, c| code != lower_unicode(c) as u32); }), SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| char_loc_ignore(code, c)); + general_op_literal(drive, char_loc_ignore); }), SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); @@ -478,9 +478,8 @@ impl OpcodeDispatcher { .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); drive.skip_code(2); }), - SreOpcode::MAX_UNTIL => unimplemented(), + SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), SreOpcode::MIN_UNTIL => unimplemented(), - SreOpcode::NEGATE => unimplemented(), SreOpcode::RANGE => unimplemented(), SreOpcode::REPEAT => unimplemented(), SreOpcode::REPEAT_ONE => unimplemented(), @@ -504,6 +503,10 @@ impl OpcodeDispatcher { } }), SreOpcode::RANGE_UNI_IGNORE => 
unimplemented(), + _ => { + // TODO + unreachable!("unexpected opcode") + } } } } @@ -661,7 +664,7 @@ fn charset(set: &[u32], c: char) -> bool { /* <256 blockindices> */ let count = set[i + 1]; if ch < 0x10000 { - let blockindices: &[u8] = unsafe { std::mem::transmute(&set[i + 2..]) }; + let (_, blockindices, _) = unsafe { set[i + 2..].align_to::() }; let block = blockindices[(ch >> 8) as usize]; if set[2 + 64 + ((block as u32 * 256 + (ch & 255)) / 32) as usize] & (1 << (ch & (32 - 1))) @@ -712,7 +715,7 @@ fn charset(set: &[u32], c: char) -> bool { } fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { - let mut drive = WrapDrive::drive(stack_drive.ctx().clone(), stack_drive); + let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let opcode = match SreOpcode::try_from(drive.peek_code(1)) { Ok(code) => code, @@ -749,7 +752,7 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { general_count_literal(&mut drive, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, |code, c| char_loc_ignore(code, c)); + general_count_literal(&mut drive, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { general_count_literal(&mut drive, |code, c| !char_loc_ignore(code, c)); @@ -1035,3 +1038,111 @@ impl OpMinRepeatOne { self._1(drive) } } + +#[derive(Debug, Copy, Clone)] +struct RepeatContext { + skip: usize, + mincount: usize, + maxcount: usize, + count: isize, + last_position: isize, +} + +struct OpMaxUntil { + jump_id: usize, + count: isize, + save_last_position: isize, + child_ctx_id: usize, +} +impl Default for OpMaxUntil { + fn default() -> Self { + Self { + jump_id: 0, + count: 0, + save_last_position: -1, + child_ctx_id: 0, + } + } +} +impl OpcodeExecutor for OpMaxUntil { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + drive.state.string_position = 
drive.ctx().string_position; + let repeat = match drive.state.repeat_stack.last_mut() { + Some(repeat) => repeat, + None => { + todo!("Internal re error: MAX_UNTIL without REPEAT."); + } + }; + self.count = repeat.count + 1; + + if self.count < repeat.mincount as isize { + // not enough matches + repeat.count = self.count; + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } + + if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) + && (drive.state.string_position as isize != repeat.last_position) + { + // we may have enough matches, if we can match another item, do so + repeat.count = self.count; + self.save_last_position = repeat.last_position; + repeat.last_position = drive.state.string_position as isize; + drive.state.marks_push(); + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 2; + return Some(()); + } + + self.child_ctx_id = drive.push_new_context(1); + + self.jump_id = 3; + Some(()) + } + 1 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + repeat.count = self.count - 1; + } + None + } + 2 => { + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + repeat.last_position = drive.state.string_position as isize; + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + repeat.count = self.count - 1; + drive.state.marks_pop(); + drive.state.string_position = drive.ctx().string_position; + + self.child_ctx_id = drive.push_new_context(1); + + self.jump_id = 3; + Some(()) + } + 3 => { + // cannot match more repeated items here. 
make sure the tail matches + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + } else { + drive.state.repeat_stack.pop(); + } + None + } + _ => unreachable!(), + } + } +} From 93c2b8b55513989982156136e6b92a1f07e02c24 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 13:02:43 +0200 Subject: [PATCH 009/705] impl OpBranch --- interp.rs | 49 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index 85787ae516..03cfdfa96a 100644 --- a/interp.rs +++ b/interp.rs @@ -417,7 +417,6 @@ impl OpcodeDispatcher { } }), SreOpcode::BRANCH => unimplemented(), - SreOpcode::CALL => unimplemented(), SreOpcode::CATEGORY => once(|drive| { let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); if drive.at_end() || !category(catcode, drive.peek_char()) { @@ -1146,3 +1145,51 @@ impl OpcodeExecutor for OpMaxUntil { } } } + +struct OpBranch { + jump_id: usize, + child_ctx_id: usize, + current_branch_length: usize, +} +impl Default for OpBranch { + fn default() -> Self { + Self { jump_id: 0, child_ctx_id: 0, current_branch_length: 0 } + } +} +impl OpcodeExecutor for OpBranch { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + drive.state.marks_push(); + // jump out the head + self.current_branch_length = 1; + self.jump_id = 1; + self.next(drive) + } + 1 => { + drive.skip_code(self.current_branch_length); + self.current_branch_length = drive.peek_code(0) as usize; + if self.current_branch_length == 0 { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(1); + self.jump_id = 2; + Some(()) + } + 2 => { + let child_ctx = 
&drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.marks_pop_keep(); + self.jump_id = 1; + Some(()) + } + _ => unreachable!() + } + } +} \ No newline at end of file From 5a4459856ca57886279f0cc4f1abc33c7cf4a397 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 13:56:37 +0200 Subject: [PATCH 010/705] impl OpRepeat --- interp.rs | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/interp.rs b/interp.rs index 03cfdfa96a..fbcbf260e5 100644 --- a/interp.rs +++ b/interp.rs @@ -416,7 +416,7 @@ impl OpcodeDispatcher { drive.skip_code(2); } }), - SreOpcode::BRANCH => unimplemented(), + SreOpcode::BRANCH => Box::new(OpBranch::default()), SreOpcode::CATEGORY => once(|drive| { let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); if drive.at_end() || !category(catcode, drive.peek_char()) { @@ -1146,6 +1146,39 @@ impl OpcodeExecutor for OpMaxUntil { } } +struct OpMinUntil { + jump_id: usize, + count: isize, + child_ctx_id: usize, +} +impl Default for OpMinUntil { + fn default() -> Self { + Self { + jump_id: 0, + count: 0, + child_ctx_id: 0, + } + } +} +impl OpcodeExecutor for OpMinUntil { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + drive.state.string_position = drive.ctx().string_position; + let repeat = match drive.state.repeat_stack.last_mut() { + Some(repeat) => repeat, + None => { + todo!("Internal re error: MAX_UNTIL without REPEAT."); + } + }; + self.count = repeat.count + 1; + None + } + _ => unreachable!(), + } + } +} + struct OpBranch { jump_id: usize, child_ctx_id: usize, @@ -1153,7 +1186,11 @@ struct OpBranch { } impl Default for OpBranch { fn default() -> Self { - Self { jump_id: 0, child_ctx_id: 0, current_branch_length: 0 } + Self { + jump_id: 0, + child_ctx_id: 0, + current_branch_length: 0, + } } } impl 
OpcodeExecutor for OpBranch { @@ -1189,7 +1226,46 @@ impl OpcodeExecutor for OpBranch { self.jump_id = 1; Some(()) } - _ => unreachable!() + _ => unreachable!(), + } + } +} + +struct OpRepeat { + jump_id: usize, + child_ctx_id: usize, +} +impl Default for OpRepeat { + fn default() -> Self { + Self { + jump_id: 0, + child_ctx_id: 0, + } + } +} +impl OpcodeExecutor for OpRepeat { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + let repeat = RepeatContext { + skip: drive.peek_code(1), + mincount: drive.peek_code(2), + maxcount: drive.peek_code(3), + count: -1, + last_position: -1, + }; + drive.state.repeat_stack.push(repeat); + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 1; + Some(()) + } + 1 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + None + } + _ => unreachable!(), } } -} \ No newline at end of file +} From af7901dcb21cba20bcfa8635b3c86d84574a988e Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 16:48:30 +0200 Subject: [PATCH 011/705] impl OpMinUntil --- interp.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/interp.rs b/interp.rs index fbcbf260e5..ec6d41eade 100644 --- a/interp.rs +++ b/interp.rs @@ -478,11 +478,9 @@ impl OpcodeDispatcher { drive.skip_code(2); }), SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => unimplemented(), - SreOpcode::RANGE => unimplemented(), - SreOpcode::REPEAT => unimplemented(), + SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), + SreOpcode::REPEAT => Box::new(OpRepeat::default()), SreOpcode::REPEAT_ONE => unimplemented(), - SreOpcode::SUBPATTERN => unimplemented(), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| 
general_op_groupref(drive, |x| x)), SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), @@ -501,9 +499,8 @@ impl OpcodeDispatcher { _ => drive.skip_code(drive.peek_code(2) as usize + 1), } }), - SreOpcode::RANGE_UNI_IGNORE => unimplemented(), _ => { - // TODO + // TODO error expcetion unreachable!("unexpected opcode") } } @@ -1172,8 +1169,52 @@ impl OpcodeExecutor for OpMinUntil { } }; self.count = repeat.count + 1; + + if self.count < repeat.mincount as isize { + // not enough matches + repeat.count = self.count; + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } + + // see if the tail matches + drive.state.marks_push(); + self.child_ctx_id = drive.push_new_context(1); + self.jump_id = 2; + Some(()) + } + 1 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + repeat.count = self.count - 1; + } None } + 2 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.state.repeat_stack.pop(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + drive.state.marks_pop(); + + // match more until tail matches + let repeat = drive.state.repeat_stack.last_mut().unwrap(); + if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { + drive.ctx_mut().has_matched = Some(false); + return None; + } + repeat.count = self.count; + self.child_ctx_id = drive.push_new_context(4); + self.jump_id = 1; + Some(()) + } _ => unreachable!(), } } @@ -1248,9 +1289,9 @@ impl OpcodeExecutor for OpRepeat { match self.jump_id { 0 => { let repeat = RepeatContext { - skip: drive.peek_code(1), - mincount: drive.peek_code(2), - maxcount: 
drive.peek_code(3), + skip: drive.peek_code(1) as usize, + mincount: drive.peek_code(2) as usize, + maxcount: drive.peek_code(3) as usize, count: -1, last_position: -1, }; From fa2adaf2ff9bf8acf642ad210480e686848d4061 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 29 Dec 2020 17:53:14 +0200 Subject: [PATCH 012/705] Impl OpRepeatONe --- interp.rs | 132 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 103 insertions(+), 29 deletions(-) diff --git a/interp.rs b/interp.rs index ec6d41eade..64d70216e3 100644 --- a/interp.rs +++ b/interp.rs @@ -313,14 +313,6 @@ trait OpcodeExecutor { fn next(&mut self, drive: &mut StackDrive) -> Option<()>; } -struct OpUnimplemented {} -impl OpcodeExecutor for OpUnimplemented { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - drive.ctx_mut().has_matched = Some(false); - None - } -} - struct OpOnce { f: Option, } @@ -335,10 +327,6 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } -fn unimplemented() -> Box { - Box::new(OpUnimplemented {}) -} - struct OpcodeDispatcher { executing_contexts: HashMap>, } @@ -480,7 +468,7 @@ impl OpcodeDispatcher { SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), SreOpcode::REPEAT => Box::new(OpRepeat::default()), - SreOpcode::REPEAT_ONE => unimplemented(), + SreOpcode::REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), @@ -500,7 +488,7 @@ impl OpcodeDispatcher { } }), _ => { - // TODO error expcetion + // TODO python expcetion? 
unreachable!("unexpected opcode") } } @@ -713,6 +701,7 @@ fn charset(set: &[u32], c: char) -> bool { fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + let end = drive.ctx().string_position + maxcount; let opcode = match SreOpcode::try_from(drive.peek_code(1)) { Ok(code) => code, Err(_) => { @@ -722,54 +711,56 @@ fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { match opcode { SreOpcode::ANY => { - while !drive.at_end() && !drive.at_linebreak() { + while !drive.ctx().string_position < end && !drive.at_linebreak() { drive.skip_char(1); } } SreOpcode::ANY_ALL => { - drive.skip_char(drive.remaining_chars()); + drive.skip_char(maxcount); } SreOpcode::IN => { // TODO: pattern[2 or 1..]? - while !drive.at_end() && charset(&drive.pattern()[2..], drive.peek_char()) { + while !drive.ctx().string_position < end + && charset(&drive.pattern()[2..], drive.peek_char()) + { drive.skip_char(1); } } SreOpcode::LITERAL => { - general_count_literal(&mut drive, |code, c| code == c as u32); + general_count_literal(&mut drive, end, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(&mut drive, |code, c| code != c as u32); + general_count_literal(&mut drive, end, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(&mut drive, |code, c| code == lower_ascii(c) as u32); + general_count_literal(&mut drive, end, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(&mut drive, |code, c| code != lower_ascii(c) as u32); + general_count_literal(&mut drive, end, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, char_loc_ignore); + general_count_literal(&mut drive, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, 
|code, c| !char_loc_ignore(code, c)); + general_count_literal(&mut drive, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, |code, c| code == lower_unicode(c) as u32); + general_count_literal(&mut drive, end, |code, c| code == lower_unicode(c) as u32); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, |code, c| code != lower_unicode(c) as u32); + general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32); } _ => { - panic!("TODO: Not Implemented."); + todo!("repeated single character pattern?"); } } - drive.ctx().string_position - stack_drive.ctx().string_position + drive.ctx().string_position - drive.state().string_position } -fn general_count_literal bool>(drive: &mut WrapDrive, mut f: F) { +fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { let ch = drive.peek_code(1); - while !drive.at_end() && f(ch, drive.peek_char()) { + while !drive.ctx().string_position < end && f(ch, drive.peek_char()) { drive.skip_char(1); } } @@ -1310,3 +1301,86 @@ impl OpcodeExecutor for OpRepeat { } } } + +struct OpRepeatOne { + jump_id: usize, + child_ctx_id: usize, + mincount: usize, + maxcount: usize, + count: usize, +} +impl Default for OpRepeatOne { + fn default() -> Self { + Self { + jump_id: 0, + child_ctx_id: 0, + mincount: 0, + maxcount: 0, + count: 0, + } + } +} +impl OpcodeExecutor for OpRepeatOne { + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + match self.jump_id { + 0 => { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; + + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.state.string_position = drive.ctx().string_position; + self.count = count(drive, self.maxcount); + drive.skip_char(self.count); + if self.count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + + let 
next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 { + // tail is empty. we're finished + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } + + drive.state.marks_push(); + // TODO: + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + self.jump_id = 1; + self.next(drive) + } + 1 => { + // General case: backtracking + if self.count >= self.mincount { + drive.state.string_position = drive.ctx().string_position; + self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + return Some(()); + } + + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + None + } + 2 => { + let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.back_skip_char(1); + self.count -= 1; + drive.state.marks_pop_keep(); + + self.jump_id = 1; + Some(()) + } + _ => unreachable!(), + } + } +} From ae44580371afb3e347f31647368c2a7a8b1a1578 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 30 Dec 2020 14:14:13 +0200 Subject: [PATCH 013/705] general case for count --- interp.rs | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/interp.rs b/interp.rs index 64d70216e3..99574443c0 100644 --- a/interp.rs +++ b/interp.rs @@ -3,6 +3,7 @@ use super::_sre::{Match, Pattern, MAXREPEAT}; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use crate::builtins::PyStrRef; +use crate::pyobject::PyRef; use rustpython_common::borrow::BorrowValue; use std::collections::HashMap; use std::convert::TryFrom; @@ -98,7 +99,7 @@ pub(crate) fn pymatch( string: PyStrRef, start: usize, end: usize, - pattern: &Pattern, + pattern: PyRef, ) -> Option { let mut state = State::new( 
string.borrow_value(), @@ -135,7 +136,7 @@ pub(crate) fn pymatch( return None; } - Some(Match::new(&state, pattern.pattern.clone(), string.clone())) + Some(Match::new(&state, pattern.clone().into_object(), string.clone())) } #[derive(Debug, Copy, Clone)] @@ -425,7 +426,7 @@ impl OpcodeDispatcher { }), SreOpcode::IN_LOC_IGNORE => once(|drive| { let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset_loc_ignore(&drive.pattern()[1..], drive.peek_char()) { + if drive.at_end() || !charset_loc_ignore(&drive.pattern()[2..], drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(skip + 1); @@ -561,7 +562,7 @@ fn general_op_literal bool>(drive: &mut StackDrive, f: F fn general_op_in char>(drive: &mut StackDrive, f: F) { let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset(&drive.pattern()[1..], f(drive.peek_char())) { + if drive.at_end() || !charset(&drive.pattern()[2..], f(drive.peek_char())) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(skip + 1); @@ -698,7 +699,28 @@ fn charset(set: &[u32], c: char) -> bool { false } -fn count(stack_drive: &StackDrive, maxcount: usize) -> usize { +fn count(drive: &mut StackDrive, maxcount: usize) -> usize { + let mut count = 0; + let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + + let save_ctx = *drive.ctx(); + drive.skip_code(4); + let reset_position = drive.ctx().code_position; + + let mut dispatcher = OpcodeDispatcher::new(); + while count < maxcount { + drive.ctx_mut().code_position = reset_position; + dispatcher.dispatch(SreOpcode::try_from(drive.peek_code(0)).unwrap(), drive); + if drive.ctx().has_matched == Some(false) { + break; + } + count += 1; + } + *drive.ctx_mut() = save_ctx; + count +} + +fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let end = 
drive.ctx().string_position + maxcount; From f7287553e9ed42df638190b56ce4474eb7783c38 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 30 Dec 2020 18:04:00 +0200 Subject: [PATCH 014/705] impl re.Match object --- interp.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/interp.rs b/interp.rs index 99574443c0..ac0eadb4ae 100644 --- a/interp.rs +++ b/interp.rs @@ -17,12 +17,12 @@ pub(crate) struct State<'a> { pub end: usize, flags: SreFlag, pattern_codes: &'a [u32], - marks: Vec>, + pub marks: Vec>, pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, repeat_stack: Vec, - string_position: usize, + pub string_position: usize, } impl<'a> State<'a> { @@ -136,7 +136,7 @@ pub(crate) fn pymatch( return None; } - Some(Match::new(&state, pattern.clone().into_object(), string.clone())) + Some(Match::new(&state, pattern.clone(), string.clone())) } #[derive(Debug, Copy, Clone)] From 04bb80f157128d59e2a5e25261311ee1be8c143e Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 31 Dec 2020 11:22:17 +0200 Subject: [PATCH 015/705] impl Match.group --- interp.rs | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/interp.rs b/interp.rs index ac0eadb4ae..f9b0898a31 100644 --- a/interp.rs +++ b/interp.rs @@ -115,6 +115,7 @@ pub(crate) fn pymatch( has_matched: None, }; state.context_stack.push(ctx); + let mut dispatcher = OpcodeDispatcher::new(); let mut has_matched = None; loop { @@ -123,7 +124,6 @@ pub(crate) fn pymatch( } let ctx_id = state.context_stack.len() - 1; let mut drive = StackDrive::drive(ctx_id, state); - let mut dispatcher = OpcodeDispatcher::new(); has_matched = dispatcher.pymatch(&mut drive); state = drive.take(); @@ -132,11 +132,11 @@ pub(crate) fn pymatch( } } - if has_matched == None || has_matched == Some(false) { - return None; + if has_matched != Some(true) { + None + } else { + Some(Match::new(&state, pattern.clone(), string.clone())) } - - Some(Match::new(&state, 
pattern.clone(), string.clone())) } #[derive(Debug, Copy, Clone)] @@ -344,7 +344,9 @@ impl OpcodeDispatcher { while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { let code = drive.peek_code(0); let opcode = SreOpcode::try_from(code).unwrap(); - self.dispatch(opcode, drive); + if !self.dispatch(opcode, drive) { + return None; + } } match drive.ctx().has_matched { Some(matched) => Some(matched), @@ -469,7 +471,7 @@ impl OpcodeDispatcher { SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), SreOpcode::REPEAT => Box::new(OpRepeat::default()), - SreOpcode::REPEAT_ONE => Box::new(OpMinRepeatOne::default()), + SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), @@ -1329,7 +1331,7 @@ struct OpRepeatOne { child_ctx_id: usize, mincount: usize, maxcount: usize, - count: usize, + count: isize, } impl Default for OpRepeatOne { fn default() -> Self { @@ -1354,9 +1356,9 @@ impl OpcodeExecutor for OpRepeatOne { return None; } drive.state.string_position = drive.ctx().string_position; - self.count = count(drive, self.maxcount); - drive.skip_char(self.count); - if self.count < self.mincount { + self.count = count(drive, self.maxcount) as isize; + drive.skip_char(self.count as usize); + if self.count < self.mincount as isize { drive.ctx_mut().has_matched = Some(false); return None; } @@ -1378,7 +1380,7 @@ impl OpcodeExecutor for OpRepeatOne { } 1 => { // General case: backtracking - if self.count >= self.mincount { + if self.count >= self.mincount as isize { drive.state.string_position = drive.ctx().string_position; self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); self.jump_id = 2; From 8c442f599bb67f4a9878cad9e4df180ab52b39b7 Mon Sep 17 00:00:00 
2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 10:30:05 +0200 Subject: [PATCH 016/705] rework OpMaxUntil; restruct popping context; add tests; --- interp.rs | 660 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 364 insertions(+), 296 deletions(-) diff --git a/interp.rs b/interp.rs index f9b0898a31..cb06ee0b8a 100644 --- a/interp.rs +++ b/interp.rs @@ -23,6 +23,7 @@ pub(crate) struct State<'a> { context_stack: Vec, repeat_stack: Vec, pub string_position: usize, + popped_context: Option, } impl<'a> State<'a> { @@ -49,6 +50,7 @@ impl<'a> State<'a> { repeat_stack: Vec::new(), marks: Vec::new(), string_position: start, + popped_context: None, } } @@ -58,6 +60,7 @@ impl<'a> State<'a> { self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); + self.popped_context = None; } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -128,7 +131,7 @@ pub(crate) fn pymatch( has_matched = dispatcher.pymatch(&mut drive); state = drive.take(); if has_matched.is_some() { - state.context_stack.pop(); + state.popped_context = state.context_stack.pop(); } } @@ -151,6 +154,9 @@ trait MatchContextDrive { fn ctx_mut(&mut self) -> &mut MatchContext; fn ctx(&self) -> &MatchContext; fn state(&self) -> &State; + fn repeat_ctx(&self) -> &RepeatContext { + self.state().repeat_stack.last().unwrap() + } fn str(&self) -> &str { unsafe { std::str::from_utf8_unchecked( @@ -181,6 +187,9 @@ trait MatchContextDrive { } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; + if self.ctx().code_position > self.state().pattern_codes.len() { + self.ctx_mut().code_position = self.state().pattern_codes.len(); + } } fn remaining_chars(&self) -> usize { self.state().end - self.ctx().string_position @@ -263,16 +272,17 @@ impl<'a> StackDrive<'a> { fn take(self) -> State<'a> { self.state } - fn push_new_context(&mut self, pattern_offset: usize) -> usize { + fn push_new_context(&mut self, pattern_offset: usize) { let ctx = 
self.ctx(); - let child_ctx = MatchContext { - string_position: ctx.string_position, - string_offset: ctx.string_offset, - code_position: ctx.code_position + pattern_offset, - has_matched: None, - }; + let mut child_ctx = MatchContext { ..*ctx }; + child_ctx.code_position += pattern_offset; + if child_ctx.code_position > self.state.pattern_codes.len() { + child_ctx.code_position = self.state.pattern_codes.len(); + } self.state.context_stack.push(child_ctx); - self.state.context_stack.len() - 1 + } + fn repeat_ctx_mut(&mut self) -> &mut RepeatContext { + self.state.repeat_stack.last_mut().unwrap() } } impl MatchContextDrive for StackDrive<'_> { @@ -328,6 +338,39 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } +// F1 F2 are same identical, but workaround for closure +struct OpTwice { + f1: Option, + f2: Option, +} +impl OpcodeExecutor for OpTwice +where + F1: FnOnce(&mut StackDrive), + F2: FnOnce(&mut StackDrive), +{ + fn next(&mut self, drive: &mut StackDrive) -> Option<()> { + if let Some(f1) = self.f1.take() { + f1(drive); + Some(()) + } else if let Some(f2) = self.f2.take() { + f2(drive); + None + } else { + unreachable!() + } + } +} +fn twice(f1: F1, f2: F2) -> Box> +where + F1: FnOnce(&mut StackDrive), + F2: FnOnce(&mut StackDrive), +{ + Box::new(OpTwice { + f1: Some(f1), + f2: Some(f2), + }) +} + struct OpcodeDispatcher { executing_contexts: HashMap>, } @@ -397,8 +440,44 @@ impl OpcodeDispatcher { drive.skip_char(1); } }), - SreOpcode::ASSERT => Box::new(OpAssert::default()), - SreOpcode::ASSERT_NOT => Box::new(OpAssertNot::default()), + SreOpcode::ASSERT => twice( + |drive| { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.ctx_mut().has_matched = Some(false); + return; + } + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); + }, + |drive| { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + 
drive.skip_code(drive.peek_code(1) as usize + 1); + } else { + drive.ctx_mut().has_matched = Some(false); + } + }, + ), + SreOpcode::ASSERT_NOT => twice( + |drive| { + let back = drive.peek_code(2) as usize; + if back > drive.ctx().string_position { + drive.skip_code(drive.peek_code(1) as usize + 1); + return; + } + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); + }, + |drive| { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.skip_code(drive.peek_code(1) as usize + 1); + } + }, + ), SreOpcode::AT => once(|drive| { let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); if !at(drive, atcode) { @@ -468,9 +547,29 @@ impl OpcodeDispatcher { .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); drive.skip_code(2); }), + SreOpcode::REPEAT => twice( + // create repeat context. all the hard work is done by the UNTIL + // operator (MAX_UNTIL, MIN_UNTIL) + // <1=min> <2=max> item tail + |drive| { + let repeat = RepeatContext { + count: -1, + code_position: drive.ctx().code_position, + last_position: std::usize::MAX, + }; + drive.state.repeat_stack.push(repeat); + drive.state.string_position = drive.ctx().string_position; + // execute UNTIL operator + drive.push_new_context(drive.peek_code(1) as usize + 1); + }, + |drive| { + drive.state.repeat_stack.pop(); + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + }, + ), SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), - SreOpcode::REPEAT => Box::new(OpRepeat::default()), + SreOpcode::MIN_UNTIL => todo!("min until"), SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), @@ -872,90 
+971,12 @@ fn is_utf8_first_byte(b: u8) -> bool { (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) } -struct OpAssert { - child_ctx_id: usize, - jump_id: usize, -} -impl Default for OpAssert { - fn default() -> Self { - Self { - child_ctx_id: 0, - jump_id: 0, - } - } -} -impl OpcodeExecutor for OpAssert { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - _ => unreachable!(), - } - } -} -impl OpAssert { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.state.string_position = drive.ctx().string_position - back; - self.child_ctx_id = drive.push_new_context(3); - self.jump_id = 1; - Some(()) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - if drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { - drive.skip_code(drive.peek_code(1) as usize + 1); - } else { - drive.ctx_mut().has_matched = Some(false); - } - None - } -} - -struct OpAssertNot { - child_ctx_id: usize, - jump_id: usize, -} -impl Default for OpAssertNot { - fn default() -> Self { - Self { - child_ctx_id: 0, - jump_id: 0, - } - } -} -impl OpcodeExecutor for OpAssertNot { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - _ => unreachable!(), - } - } -} -impl OpAssertNot { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { - drive.skip_code(drive.peek_code(1) as usize + 1); - return None; - } - drive.state.string_position = drive.ctx().string_position - back; - self.child_ctx_id = drive.push_new_context(3); - self.jump_id = 1; - Some(()) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - if 
drive.state.context_stack[self.child_ctx_id].has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(drive.peek_code(1) as usize + 1); - } - None - } +#[derive(Debug, Copy, Clone)] +struct RepeatContext { + count: isize, + code_position: usize, + // zero-width match protection + last_position: usize, } struct OpMinRepeatOne { @@ -963,7 +984,6 @@ struct OpMinRepeatOne { mincount: usize, maxcount: usize, count: usize, - child_ctx_id: usize, } impl OpcodeExecutor for OpMinRepeatOne { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { @@ -982,7 +1002,6 @@ impl Default for OpMinRepeatOne { mincount: 0, maxcount: 0, count: 0, - child_ctx_id: 0, } } } @@ -1023,7 +1042,7 @@ impl OpMinRepeatOne { fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { if self.maxcount == MAXREPEAT || self.count <= self.maxcount { drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + drive.push_new_context(drive.peek_code(1) as usize + 1); self.jump_id = 2; return Some(()); } @@ -1033,7 +1052,8 @@ impl OpMinRepeatOne { None } fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - if let Some(true) = drive.state.context_stack[self.child_ctx_id].has_matched { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; } @@ -1050,201 +1070,290 @@ impl OpMinRepeatOne { } } -#[derive(Debug, Copy, Clone)] -struct RepeatContext { - skip: usize, - mincount: usize, - maxcount: usize, - count: isize, - last_position: isize, -} - +// Everything is stored in RepeatContext struct OpMaxUntil { jump_id: usize, count: isize, - save_last_position: isize, - child_ctx_id: usize, + save_last_position: usize, } impl Default for OpMaxUntil { fn default() -> Self { - Self { + OpMaxUntil { jump_id: 0, count: 0, - save_last_position: -1, - child_ctx_id: 0, + save_last_position: 0, } 
} } impl OpcodeExecutor for OpMaxUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { - 0 => { - drive.state.string_position = drive.ctx().string_position; - let repeat = match drive.state.repeat_stack.last_mut() { - Some(repeat) => repeat, - None => { - todo!("Internal re error: MAX_UNTIL without REPEAT."); - } - }; - self.count = repeat.count + 1; - - if self.count < repeat.mincount as isize { - // not enough matches - repeat.count = self.count; - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 1; - return Some(()); - } - - if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) - && (drive.state.string_position as isize != repeat.last_position) - { - // we may have enough matches, if we can match another item, do so - repeat.count = self.count; - self.save_last_position = repeat.last_position; - repeat.last_position = drive.state.string_position as isize; - drive.state.marks_push(); - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 2; - return Some(()); - } - - self.child_ctx_id = drive.push_new_context(1); - - self.jump_id = 3; - Some(()) - } - 1 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - repeat.count = self.count - 1; - } - None - } - 2 => { - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - repeat.last_position = drive.state.string_position as isize; - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - if child_ctx.has_matched == Some(true) { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - repeat.count = self.count - 1; - drive.state.marks_pop(); - drive.state.string_position = drive.ctx().string_position; + 0 => self._0(drive), + 1 => self._1(drive), 
+ 2 => self._2(drive), + 3 => self._3(drive), + 4 => self._4(drive), + _ => unreachable!(), + } + } +} +impl OpMaxUntil { + fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { + let RepeatContext { + count, + code_position, + last_position, + } = *drive.repeat_ctx(); + drive.ctx_mut().code_position = code_position; + let mincount = drive.peek_code(2) as usize; + let maxcount = drive.peek_code(3) as usize; + self.count = count + 1; + + if (self.count as usize) < mincount { + // not enough matches + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } - self.child_ctx_id = drive.push_new_context(1); + if ((count as usize) < maxcount || maxcount == MAXREPEAT) + && drive.state.string_position != last_position + { + // we may have enough matches, if we can match another item, do so + drive.repeat_ctx_mut().count = self.count; + drive.state.marks_push(); + // self.save_last_position = last_position; + // drive.repeat_ctx_mut().last_position = drive.state.string_position; + drive.push_new_context(4); + self.jump_id = 2; + return Some(()); + } - self.jump_id = 3; - Some(()) - } - 3 => { - // cannot match more repeated items here. 
make sure the tail matches - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - } else { - drive.state.repeat_stack.pop(); - } - None - } - _ => unreachable!(), + self.jump_id = 3; + self.next(drive) + } + fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; } + None + } + fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { + // drive.repeat_ctx_mut().last_position = self.save_last_position; + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.marks_pop(); + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + self.jump_id = 3; + self.next(drive) + } + fn _3(&mut self, drive: &mut StackDrive) -> Option<()> { + // cannot match more repeated items here. 
make sure the tail matches + drive.skip_code(drive.peek_code(1) as usize + 1); + drive.push_new_context(1); + self.jump_id = 4; + Some(()) + } + fn _4(&mut self, drive: &mut StackDrive) -> Option<()> { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + } + None } } +// struct OpMaxUntil { +// jump_id: usize, +// count: isize, +// save_last_position: isize, +// } +// impl Default for OpMaxUntil { +// fn default() -> Self { +// Self { +// jump_id: 0, +// count: 0, +// save_last_position: -1, +// } +// } +// } +// impl OpcodeExecutor for OpMaxUntil { +// fn next(&mut self, drive: &mut StackDrive) -> Option<()> { +// match self.jump_id { +// 0 => { +// drive.state.string_position = drive.ctx().string_position; +// let repeat = match drive.state.repeat_stack.last_mut() { +// Some(repeat) => repeat, +// None => { +// panic!("Internal re error: MAX_UNTIL without REPEAT."); +// } +// }; +// self.count = repeat.count + 1; + +// if self.count < repeat.mincount as isize { +// // not enough matches +// repeat.count = self.count; +// drive.push_new_context(4); +// self.jump_id = 1; +// return Some(()); +// } + +// if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) +// && (drive.state.string_position as isize != repeat.last_position) +// { +// // we may have enough matches, if we can match another item, do so +// repeat.count = self.count; +// self.save_last_position = repeat.last_position; +// repeat.last_position = drive.state.string_position as isize; +// drive.state.marks_push(); +// drive.push_new_context(4); +// self.jump_id = 2; +// return Some(()); +// } + +// drive.push_new_context(1); + +// self.jump_id = 3; +// Some(()) +// } +// 1 => { +// let child_ctx = drive.state.popped_context.unwrap(); +// drive.ctx_mut().has_matched = child_ctx.has_matched; +// if drive.ctx().has_matched 
!= Some(true) { +// drive.state.string_position = drive.ctx().string_position; +// let repeat = drive.state.repeat_stack.last_mut().unwrap(); +// repeat.count = self.count - 1; +// } +// None +// } +// 2 => { +// let repeat = drive.state.repeat_stack.last_mut().unwrap(); +// repeat.last_position = drive.state.string_position as isize; +// let child_ctx = drive.state.popped_context.unwrap(); +// if child_ctx.has_matched == Some(true) { +// drive.state.marks_pop_discard(); +// drive.ctx_mut().has_matched = Some(true); +// return None; +// } +// repeat.count = self.count - 1; +// drive.state.marks_pop(); +// drive.state.string_position = drive.ctx().string_position; + +// drive.push_new_context(1); + +// self.jump_id = 3; +// Some(()) +// } +// 3 => { +// // cannot match more repeated items here. make sure the tail matches +// let child_ctx = drive.state.popped_context.unwrap(); +// drive.ctx_mut().has_matched = child_ctx.has_matched; +// if drive.ctx().has_matched != Some(true) { +// drive.state.string_position = drive.ctx().string_position; +// } else { +// drive.state.repeat_stack.pop(); +// } +// None +// } +// _ => unreachable!(), +// } +// } +// } + struct OpMinUntil { jump_id: usize, count: isize, - child_ctx_id: usize, } impl Default for OpMinUntil { fn default() -> Self { Self { jump_id: 0, count: 0, - child_ctx_id: 0, } } } impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - drive.state.string_position = drive.ctx().string_position; - let repeat = match drive.state.repeat_stack.last_mut() { - Some(repeat) => repeat, - None => { - todo!("Internal re error: MAX_UNTIL without REPEAT."); - } - }; - self.count = repeat.count + 1; - - if self.count < repeat.mincount as isize { - // not enough matches - repeat.count = self.count; - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 1; - return Some(()); - } - - // see if the tail matches - drive.state.marks_push(); - 
self.child_ctx_id = drive.push_new_context(1); - self.jump_id = 2; - Some(()) - } - 1 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - repeat.count = self.count - 1; - } - None - } - 2 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - if child_ctx.has_matched == Some(true) { - drive.state.repeat_stack.pop(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - drive.state.marks_pop(); - - // match more until tail matches - let repeat = drive.state.repeat_stack.last_mut().unwrap(); - if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { - drive.ctx_mut().has_matched = Some(false); - return None; - } - repeat.count = self.count; - self.child_ctx_id = drive.push_new_context(4); - self.jump_id = 1; - Some(()) - } - _ => unreachable!(), - } + None + // match self.jump_id { + // 0 => { + // drive.state.string_position = drive.ctx().string_position; + // let repeat = match drive.state.repeat_stack.last_mut() { + // Some(repeat) => repeat, + // None => { + // todo!("Internal re error: MAX_UNTIL without REPEAT."); + // } + // }; + // self.count = repeat.count + 1; + + // if self.count < repeat.mincount as isize { + // // not enough matches + // repeat.count = self.count; + // drive.push_new_context(4); + // self.jump_id = 1; + // return Some(()); + // } + + // // see if the tail matches + // drive.state.marks_push(); + // drive.push_new_context(1); + // self.jump_id = 2; + // Some(()) + // } + // 1 => { + // let child_ctx = drive.state.popped_context.unwrap(); + // drive.ctx_mut().has_matched = child_ctx.has_matched; + // if drive.ctx().has_matched != Some(true) { + // drive.state.string_position = 
drive.ctx().string_position; + // let repeat = drive.state.repeat_stack.last_mut().unwrap(); + // repeat.count = self.count - 1; + // } + // None + // } + // 2 => { + // let child_ctx = drive.state.popped_context.unwrap(); + // if child_ctx.has_matched == Some(true) { + // drive.state.repeat_stack.pop(); + // drive.ctx_mut().has_matched = Some(true); + // return None; + // } + // drive.state.string_position = drive.ctx().string_position; + // drive.state.marks_pop(); + + // // match more until tail matches + // let repeat = drive.state.repeat_stack.last_mut().unwrap(); + // if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { + // drive.ctx_mut().has_matched = Some(false); + // return None; + // } + // repeat.count = self.count; + // drive.push_new_context(4); + // self.jump_id = 1; + // Some(()) + // } + // _ => unreachable!(), + // } } } struct OpBranch { jump_id: usize, - child_ctx_id: usize, current_branch_length: usize, } impl Default for OpBranch { fn default() -> Self { Self { jump_id: 0, - child_ctx_id: 0, current_branch_length: 0, } } @@ -1268,12 +1377,12 @@ impl OpcodeExecutor for OpBranch { return None; } drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(1); + drive.push_new_context(1); self.jump_id = 2; Some(()) } 2 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; @@ -1287,48 +1396,8 @@ impl OpcodeExecutor for OpBranch { } } -struct OpRepeat { - jump_id: usize, - child_ctx_id: usize, -} -impl Default for OpRepeat { - fn default() -> Self { - Self { - jump_id: 0, - child_ctx_id: 0, - } - } -} -impl OpcodeExecutor for OpRepeat { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - let repeat = RepeatContext { - skip: drive.peek_code(1) as usize, - mincount: 
drive.peek_code(2) as usize, - maxcount: drive.peek_code(3) as usize, - count: -1, - last_position: -1, - }; - drive.state.repeat_stack.push(repeat); - drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 1; - Some(()) - } - 1 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; - drive.ctx_mut().has_matched = child_ctx.has_matched; - None - } - _ => unreachable!(), - } - } -} - struct OpRepeatOne { jump_id: usize, - child_ctx_id: usize, mincount: usize, maxcount: usize, count: isize, @@ -1337,7 +1406,6 @@ impl Default for OpRepeatOne { fn default() -> Self { Self { jump_id: 0, - child_ctx_id: 0, mincount: 0, maxcount: 0, count: 0, @@ -1382,7 +1450,7 @@ impl OpcodeExecutor for OpRepeatOne { // General case: backtracking if self.count >= self.mincount as isize { drive.state.string_position = drive.ctx().string_position; - self.child_ctx_id = drive.push_new_context(drive.peek_code(1) as usize + 1); + drive.push_new_context(drive.peek_code(1) as usize + 1); self.jump_id = 2; return Some(()); } @@ -1392,7 +1460,7 @@ impl OpcodeExecutor for OpRepeatOne { None } 2 => { - let child_ctx = &drive.state.context_stack[self.child_ctx_id]; + let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; From 8fba935bba46ade66107a3c7aabb65e4eae00dcb Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 10:54:44 +0200 Subject: [PATCH 017/705] OpMaxUntil zero-width protection --- interp.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/interp.rs b/interp.rs index cb06ee0b8a..4a015187e9 100644 --- a/interp.rs +++ b/interp.rs @@ -1107,6 +1107,7 @@ impl OpMaxUntil { drive.ctx_mut().code_position = code_position; let mincount = drive.peek_code(2) as usize; let maxcount = drive.peek_code(3) as usize; + drive.state.string_position = 
drive.ctx().string_position; self.count = count + 1; if (self.count as usize) < mincount { @@ -1123,8 +1124,8 @@ impl OpMaxUntil { // we may have enough matches, if we can match another item, do so drive.repeat_ctx_mut().count = self.count; drive.state.marks_push(); - // self.save_last_position = last_position; - // drive.repeat_ctx_mut().last_position = drive.state.string_position; + self.save_last_position = last_position; + drive.repeat_ctx_mut().last_position = drive.state.string_position; drive.push_new_context(4); self.jump_id = 2; return Some(()); @@ -1143,7 +1144,7 @@ impl OpMaxUntil { None } fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - // drive.repeat_ctx_mut().last_position = self.save_last_position; + drive.repeat_ctx_mut().last_position = self.save_last_position; let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.state.marks_pop_discard(); From af1a53cb0530f27804442f86d3e1e05a4abcacef Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 11:34:40 +0200 Subject: [PATCH 018/705] impl Match.groups() --- interp.rs | 249 ++++++++++++++++-------------------------------------- 1 file changed, 72 insertions(+), 177 deletions(-) diff --git a/interp.rs b/interp.rs index 4a015187e9..ad0790a31b 100644 --- a/interp.rs +++ b/interp.rs @@ -1070,7 +1070,6 @@ impl OpMinRepeatOne { } } -// Everything is stored in RepeatContext struct OpMaxUntil { jump_id: usize, count: isize, @@ -1088,189 +1087,85 @@ impl Default for OpMaxUntil { impl OpcodeExecutor for OpMaxUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - 2 => self._2(drive), - 3 => self._3(drive), - 4 => self._4(drive), - _ => unreachable!(), - } - } -} -impl OpMaxUntil { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - let RepeatContext { - count, - code_position, - last_position, - } = *drive.repeat_ctx(); - drive.ctx_mut().code_position = 
code_position; - let mincount = drive.peek_code(2) as usize; - let maxcount = drive.peek_code(3) as usize; - drive.state.string_position = drive.ctx().string_position; - self.count = count + 1; + 0 => { + let RepeatContext { + count, + code_position, + last_position, + } = *drive.repeat_ctx(); + drive.ctx_mut().code_position = code_position; + let mincount = drive.peek_code(2) as usize; + let maxcount = drive.peek_code(3) as usize; + drive.state.string_position = drive.ctx().string_position; + self.count = count + 1; - if (self.count as usize) < mincount { - // not enough matches - drive.repeat_ctx_mut().count = self.count; - drive.push_new_context(4); - self.jump_id = 1; - return Some(()); - } + if (self.count as usize) < mincount { + // not enough matches + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context(4); + self.jump_id = 1; + return Some(()); + } - if ((count as usize) < maxcount || maxcount == MAXREPEAT) - && drive.state.string_position != last_position - { - // we may have enough matches, if we can match another item, do so - drive.repeat_ctx_mut().count = self.count; - drive.state.marks_push(); - self.save_last_position = last_position; - drive.repeat_ctx_mut().last_position = drive.state.string_position; - drive.push_new_context(4); - self.jump_id = 2; - return Some(()); - } + if ((count as usize) < maxcount || maxcount == MAXREPEAT) + && drive.state.string_position != last_position + { + // we may have enough matches, if we can match another item, do so + drive.repeat_ctx_mut().count = self.count; + drive.state.marks_push(); + self.save_last_position = last_position; + drive.repeat_ctx_mut().last_position = drive.state.string_position; + drive.push_new_context(4); + self.jump_id = 2; + return Some(()); + } - self.jump_id = 3; - self.next(drive) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if 
drive.ctx().has_matched != Some(true) { - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - } - None - } - fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - drive.repeat_ctx_mut().last_position = self.save_last_position; - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.marks_pop(); - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - self.jump_id = 3; - self.next(drive) - } - fn _3(&mut self, drive: &mut StackDrive) -> Option<()> { - // cannot match more repeated items here. make sure the tail matches - drive.skip_code(drive.peek_code(1) as usize + 1); - drive.push_new_context(1); - self.jump_id = 4; - Some(()) - } - fn _4(&mut self, drive: &mut StackDrive) -> Option<()> { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; + self.jump_id = 3; + self.next(drive) + } + 1 => { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + } + None + } + 2 => { + drive.repeat_ctx_mut().last_position = self.save_last_position; + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.marks_pop(); + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + self.jump_id = 3; + self.next(drive) + } + 3 => { + // cannot match 
more repeated items here. make sure the tail matches + drive.skip_code(drive.peek_code(1) as usize + 1); + drive.push_new_context(1); + self.jump_id = 4; + Some(()) + } + 4 => { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.state.string_position = drive.ctx().string_position; + } + None + } + _ => unreachable!(), } - None } } -// struct OpMaxUntil { -// jump_id: usize, -// count: isize, -// save_last_position: isize, -// } -// impl Default for OpMaxUntil { -// fn default() -> Self { -// Self { -// jump_id: 0, -// count: 0, -// save_last_position: -1, -// } -// } -// } -// impl OpcodeExecutor for OpMaxUntil { -// fn next(&mut self, drive: &mut StackDrive) -> Option<()> { -// match self.jump_id { -// 0 => { -// drive.state.string_position = drive.ctx().string_position; -// let repeat = match drive.state.repeat_stack.last_mut() { -// Some(repeat) => repeat, -// None => { -// panic!("Internal re error: MAX_UNTIL without REPEAT."); -// } -// }; -// self.count = repeat.count + 1; - -// if self.count < repeat.mincount as isize { -// // not enough matches -// repeat.count = self.count; -// drive.push_new_context(4); -// self.jump_id = 1; -// return Some(()); -// } - -// if (self.count < repeat.maxcount as isize || repeat.maxcount == MAXREPEAT) -// && (drive.state.string_position as isize != repeat.last_position) -// { -// // we may have enough matches, if we can match another item, do so -// repeat.count = self.count; -// self.save_last_position = repeat.last_position; -// repeat.last_position = drive.state.string_position as isize; -// drive.state.marks_push(); -// drive.push_new_context(4); -// self.jump_id = 2; -// return Some(()); -// } - -// drive.push_new_context(1); - -// self.jump_id = 3; -// Some(()) -// } -// 1 => { -// let child_ctx = drive.state.popped_context.unwrap(); -// drive.ctx_mut().has_matched = child_ctx.has_matched; -// if 
drive.ctx().has_matched != Some(true) { -// drive.state.string_position = drive.ctx().string_position; -// let repeat = drive.state.repeat_stack.last_mut().unwrap(); -// repeat.count = self.count - 1; -// } -// None -// } -// 2 => { -// let repeat = drive.state.repeat_stack.last_mut().unwrap(); -// repeat.last_position = drive.state.string_position as isize; -// let child_ctx = drive.state.popped_context.unwrap(); -// if child_ctx.has_matched == Some(true) { -// drive.state.marks_pop_discard(); -// drive.ctx_mut().has_matched = Some(true); -// return None; -// } -// repeat.count = self.count - 1; -// drive.state.marks_pop(); -// drive.state.string_position = drive.ctx().string_position; - -// drive.push_new_context(1); - -// self.jump_id = 3; -// Some(()) -// } -// 3 => { -// // cannot match more repeated items here. make sure the tail matches -// let child_ctx = drive.state.popped_context.unwrap(); -// drive.ctx_mut().has_matched = child_ctx.has_matched; -// if drive.ctx().has_matched != Some(true) { -// drive.state.string_position = drive.ctx().string_position; -// } else { -// drive.state.repeat_stack.pop(); -// } -// None -// } -// _ => unreachable!(), -// } -// } -// } - struct OpMinUntil { jump_id: usize, count: isize, From db84f329816d812349b49fa843b4d0480674aba2 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 13:10:19 +0200 Subject: [PATCH 019/705] fix Opcode::CHARSET --- interp.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index ad0790a31b..289dbc2ec9 100644 --- a/interp.rs +++ b/interp.rs @@ -741,7 +741,8 @@ fn charset(set: &[u32], c: char) -> bool { } SreOpcode::CHARSET => { /* */ - if ch < 256 && (set[(ch / 32) as usize] & (1 << (32 - 1))) != 0 { + let set = &set[1..]; + if ch < 256 && ((set[(ch >> 5) as usize] & (1u32 << (ch & 31))) != 0) { return ok; } i += 8; From 36433a9f4d026df404e419d604e67295fa4db758 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 1 Jan 2021 20:13:06 +0200 
Subject: [PATCH 020/705] fix Opcode::BIGCHARSET --- interp.rs | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/interp.rs b/interp.rs index 289dbc2ec9..ca5efd9e4f 100644 --- a/interp.rs +++ b/interp.rs @@ -497,22 +497,16 @@ impl OpcodeDispatcher { } }), SreOpcode::IN => once(|drive| { - general_op_in(drive, |x| x); + general_op_in(drive, |set, c| charset(set, c)); }), SreOpcode::IN_IGNORE => once(|drive| { - general_op_in(drive, lower_ascii); + general_op_in(drive, |set, c| charset(set, lower_ascii(c))); }), SreOpcode::IN_UNI_IGNORE => once(|drive| { - general_op_in(drive, lower_unicode); + general_op_in(drive, |set, c| charset(set, lower_unicode(c))); }), SreOpcode::IN_LOC_IGNORE => once(|drive| { - let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset_loc_ignore(&drive.pattern()[2..], drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(skip + 1); - drive.skip_char(1); - } + general_op_in(drive, |set, c| charset_loc_ignore(set, c)); }), SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { drive.skip_code(drive.peek_code(1) as usize + 1); @@ -661,9 +655,9 @@ fn general_op_literal bool>(drive: &mut StackDrive, f: F } } -fn general_op_in char>(drive: &mut StackDrive, f: F) { +fn general_op_in bool>(drive: &mut StackDrive, f: F) { let skip = drive.peek_code(1) as usize; - if drive.at_end() || !charset(&drive.pattern()[2..], f(drive.peek_char())) { + if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(skip + 1); @@ -749,18 +743,20 @@ fn charset(set: &[u32], c: char) -> bool { } SreOpcode::BIGCHARSET => { /* <256 blockindices> */ - let count = set[i + 1]; + let count = set[i + 1] as usize; if ch < 0x10000 { - let (_, blockindices, _) = unsafe { set[i + 2..].align_to::() }; - let block = blockindices[(ch >> 8) as usize]; - if set[2 + 64 + ((block as u32 * 256 + (ch & 255)) / 32) 
as usize] - & (1 << (ch & (32 - 1))) + let set = &set[2..]; + let block_index = ch >> 8; + let (_, blockindices, _) = unsafe { set.align_to::() }; + let blocks = &set[64..]; + let block = blockindices[block_index as usize]; + if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] & (1u32 << (ch & 31)) != 0 { return ok; } } - i += 2 + 64 + count as usize * 8; + i += 2 + 64 + count * 8; } SreOpcode::LITERAL => { /* */ From f05f6cb44df000ad7cffcd79b6447c97756d15dd Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 3 Jan 2021 19:44:40 +0200 Subject: [PATCH 021/705] impl Pattern.sub --- interp.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index ca5efd9e4f..98844ee9ca 100644 --- a/interp.rs +++ b/interp.rs @@ -142,6 +142,27 @@ pub(crate) fn pymatch( } } +pub(crate) fn search( + string: PyStrRef, + start: usize, + end: usize, + pattern: PyRef, +) -> Option { + // TODO: optimize by op info and skip prefix + let end = std::cmp::min(end, string.char_len()); + for i in start..end { + if let Some(m) = pymatch( + string.clone(), + i, + end, + pattern.clone(), + ) { + return Some(m); + } + } + None +} + #[derive(Debug, Copy, Clone)] struct MatchContext { string_position: usize, @@ -750,7 +771,8 @@ fn charset(set: &[u32], c: char) -> bool { let (_, blockindices, _) = unsafe { set.align_to::() }; let blocks = &set[64..]; let block = blockindices[block_index as usize]; - if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] & (1u32 << (ch & 31)) + if blocks[((block as u32 * 256 + (ch & 255)) / 32) as usize] + & (1u32 << (ch & 31)) != 0 { return ok; From 817eb66167810a163eb889ce2626418b51bd6afb Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 4 Jan 2021 16:01:17 +0200 Subject: [PATCH 022/705] fix OpMinUntil --- interp.rs | 100 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 20 deletions(-) diff --git a/interp.rs b/interp.rs index 98844ee9ca..a3263eddce 
100644 --- a/interp.rs +++ b/interp.rs @@ -151,12 +151,7 @@ pub(crate) fn search( // TODO: optimize by op info and skip prefix let end = std::cmp::min(end, string.char_len()); for i in start..end { - if let Some(m) = pymatch( - string.clone(), - i, - end, - pattern.clone(), - ) { + if let Some(m) = pymatch(string.clone(), i, end, pattern.clone()) { return Some(m); } } @@ -294,12 +289,13 @@ impl<'a> StackDrive<'a> { self.state } fn push_new_context(&mut self, pattern_offset: usize) { - let ctx = self.ctx(); - let mut child_ctx = MatchContext { ..*ctx }; + let mut child_ctx = MatchContext { ..*self.ctx() }; child_ctx.code_position += pattern_offset; - if child_ctx.code_position > self.state.pattern_codes.len() { - child_ctx.code_position = self.state.pattern_codes.len(); - } + self.state.context_stack.push(child_ctx); + } + fn push_new_context_at(&mut self, code_position: usize) { + let mut child_ctx = MatchContext { ..*self.ctx() }; + child_ctx.code_position = code_position; self.state.context_stack.push(child_ctx); } fn repeat_ctx_mut(&mut self) -> &mut RepeatContext { @@ -571,6 +567,8 @@ impl OpcodeDispatcher { count: -1, code_position: drive.ctx().code_position, last_position: std::usize::MAX, + mincount: drive.peek_code(2) as usize, + maxcount: drive.peek_code(3) as usize, }; drive.state.repeat_stack.push(repeat); drive.state.string_position = drive.ctx().string_position; @@ -584,7 +582,7 @@ impl OpcodeDispatcher { }, ), SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => todo!("min until"), + SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), @@ -996,6 +994,8 @@ struct RepeatContext { code_position: usize, // zero-width match protection last_position: usize, + mincount: usize, + maxcount: usize, } struct OpMinRepeatOne { @@ 
-1111,22 +1111,22 @@ impl OpcodeExecutor for OpMaxUntil { count, code_position, last_position, + mincount, + maxcount, } = *drive.repeat_ctx(); - drive.ctx_mut().code_position = code_position; - let mincount = drive.peek_code(2) as usize; - let maxcount = drive.peek_code(3) as usize; + drive.state.string_position = drive.ctx().string_position; self.count = count + 1; if (self.count as usize) < mincount { // not enough matches drive.repeat_ctx_mut().count = self.count; - drive.push_new_context(4); + drive.push_new_context_at(code_position + 4); self.jump_id = 1; return Some(()); } - if ((count as usize) < maxcount || maxcount == MAXREPEAT) + if ((self.count as usize) < maxcount || maxcount == MAXREPEAT) && drive.state.string_position != last_position { // we may have enough matches, if we can match another item, do so @@ -1134,7 +1134,7 @@ impl OpcodeExecutor for OpMaxUntil { drive.state.marks_push(); self.save_last_position = last_position; drive.repeat_ctx_mut().last_position = drive.state.string_position; - drive.push_new_context(4); + drive.push_new_context_at(code_position + 4); self.jump_id = 2; return Some(()); } @@ -1167,7 +1167,6 @@ impl OpcodeExecutor for OpMaxUntil { } 3 => { // cannot match more repeated items here. 
make sure the tail matches - drive.skip_code(drive.peek_code(1) as usize + 1); drive.push_new_context(1); self.jump_id = 4; Some(()) @@ -1188,18 +1187,79 @@ impl OpcodeExecutor for OpMaxUntil { struct OpMinUntil { jump_id: usize, count: isize, + save_repeat: Option, } impl Default for OpMinUntil { fn default() -> Self { Self { jump_id: 0, count: 0, + save_repeat: None, } } } impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - None + match self.jump_id { + 0 => { + let RepeatContext { + count, + code_position, + last_position: _, + mincount, + maxcount: _, + } = *drive.repeat_ctx(); + drive.state.string_position = drive.ctx().string_position; + self.count = count + 1; + + if (self.count as usize) < mincount { + // not enough matches + drive.repeat_ctx_mut().count = self.count; + drive.push_new_context_at(code_position + 4); + self.jump_id = 1; + return Some(()); + } + + // see if the tail matches + drive.state.marks_push(); + self.save_repeat = drive.state.repeat_stack.pop(); + drive.push_new_context(1); + self.jump_id = 2; + Some(()) + } + 1 => { + let child_ctx = drive.state.popped_context.unwrap(); + drive.ctx_mut().has_matched = child_ctx.has_matched; + if drive.ctx().has_matched != Some(true) { + drive.repeat_ctx_mut().count = self.count - 1; + drive.state.string_position = drive.ctx().string_position; + } + None + } + 2 => { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.repeat_stack.push(self.save_repeat.unwrap()); + drive.state.string_position = drive.ctx().string_position; + drive.state.marks_pop(); + + // match more unital tail matches + let maxcount = drive.repeat_ctx().maxcount; + let code_position = drive.repeat_ctx().code_position; + if self.count as usize >= maxcount && maxcount != MAXREPEAT { + drive.ctx_mut().has_matched = Some(false); + return None; + } + 
drive.repeat_ctx_mut().count = self.count; + drive.push_new_context_at(code_position + 4); + self.jump_id = 1; + Some(()) + } + _ => unreachable!(), + } // match self.jump_id { // 0 => { // drive.state.string_position = drive.ctx().string_position; From 33ef82364516422776beaa498db8d96a167c7b95 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 6 Jan 2021 10:30:57 +0200 Subject: [PATCH 023/705] impl Match.groupdict --- interp.rs | 58 ------------------------------------------------------- 1 file changed, 58 deletions(-) diff --git a/interp.rs b/interp.rs index a3263eddce..c118984b38 100644 --- a/interp.rs +++ b/interp.rs @@ -1260,64 +1260,6 @@ impl OpcodeExecutor for OpMinUntil { } _ => unreachable!(), } - // match self.jump_id { - // 0 => { - // drive.state.string_position = drive.ctx().string_position; - // let repeat = match drive.state.repeat_stack.last_mut() { - // Some(repeat) => repeat, - // None => { - // todo!("Internal re error: MAX_UNTIL without REPEAT."); - // } - // }; - // self.count = repeat.count + 1; - - // if self.count < repeat.mincount as isize { - // // not enough matches - // repeat.count = self.count; - // drive.push_new_context(4); - // self.jump_id = 1; - // return Some(()); - // } - - // // see if the tail matches - // drive.state.marks_push(); - // drive.push_new_context(1); - // self.jump_id = 2; - // Some(()) - // } - // 1 => { - // let child_ctx = drive.state.popped_context.unwrap(); - // drive.ctx_mut().has_matched = child_ctx.has_matched; - // if drive.ctx().has_matched != Some(true) { - // drive.state.string_position = drive.ctx().string_position; - // let repeat = drive.state.repeat_stack.last_mut().unwrap(); - // repeat.count = self.count - 1; - // } - // None - // } - // 2 => { - // let child_ctx = drive.state.popped_context.unwrap(); - // if child_ctx.has_matched == Some(true) { - // drive.state.repeat_stack.pop(); - // drive.ctx_mut().has_matched = Some(true); - // return None; - // } - // drive.state.string_position = 
drive.ctx().string_position; - // drive.state.marks_pop(); - - // // match more until tail matches - // let repeat = drive.state.repeat_stack.last_mut().unwrap(); - // if self.count >= repeat.maxcount as isize && repeat.maxcount != MAXREPEAT { - // drive.ctx_mut().has_matched = Some(false); - // return None; - // } - // repeat.count = self.count; - // drive.push_new_context(4); - // self.jump_id = 1; - // Some(()) - // } - // _ => unreachable!(), - // } } } From 76c95abbb5ecd91e05f216adefdd437b0f87b79b Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 7 Jan 2021 17:37:18 +0200 Subject: [PATCH 024/705] impl Match.lastgroup --- interp.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index c118984b38..febbfa2437 100644 --- a/interp.rs +++ b/interp.rs @@ -150,7 +150,7 @@ pub(crate) fn search( ) -> Option { // TODO: optimize by op info and skip prefix let end = std::cmp::min(end, string.char_len()); - for i in start..end { + for i in start..end + 1 { if let Some(m) = pymatch(string.clone(), i, end, pattern.clone()) { return Some(m); } @@ -1382,6 +1382,13 @@ impl OpcodeExecutor for OpRepeatOne { drive.ctx_mut().has_matched = Some(true); return None; } + if self.count <= self.mincount as isize { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + return None; + } + + // TODO: unnesscary double check drive.back_skip_char(1); self.count -= 1; drive.state.marks_pop_keep(); From 13a8b6cc4e38927273774ab96565f2184840f900 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 17 Jan 2021 19:55:20 +0200 Subject: [PATCH 025/705] add bytes support and refactor --- interp.rs | 457 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 266 insertions(+), 191 deletions(-) diff --git a/interp.rs b/interp.rs index febbfa2437..7c9246142a 100644 --- a/interp.rs +++ b/interp.rs @@ -1,18 +1,18 @@ // good luck to those that follow; here be dragons -use super::_sre::{Match, Pattern, MAXREPEAT}; 
+use super::_sre::MAXREPEAT; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; -use crate::builtins::PyStrRef; -use crate::pyobject::PyRef; -use rustpython_common::borrow::BorrowValue; +use crate::builtins::PyBytes; +use crate::bytesinner::is_py_ascii_whitespace; +use crate::pyobject::{IntoPyObject, PyObjectRef}; +use crate::VirtualMachine; use std::collections::HashMap; use std::convert::TryFrom; +use std::unreachable; #[derive(Debug)] pub(crate) struct State<'a> { - string: &'a str, - // chars count - string_len: usize, + pub string: StrDrive<'a>, pub start: usize, pub end: usize, flags: SreFlag, @@ -24,22 +24,21 @@ pub(crate) struct State<'a> { repeat_stack: Vec, pub string_position: usize, popped_context: Option, + pub has_matched: Option, } impl<'a> State<'a> { pub(crate) fn new( - string: &'a str, + string: StrDrive<'a>, start: usize, end: usize, flags: SreFlag, pattern_codes: &'a [u32], ) -> Self { - let string_len = string.chars().count(); - let end = std::cmp::min(end, string_len); + let end = std::cmp::min(end, string.count()); let start = std::cmp::min(start, end); Self { string, - string_len, start, end, flags, @@ -51,16 +50,19 @@ impl<'a> State<'a> { marks: Vec::new(), string_position: start, popped_context: None, + has_matched: None, } } - fn reset(&mut self) { - self.marks.clear(); + pub fn reset(&mut self) { self.lastindex = -1; self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); + self.marks.clear(); + self.string_position = self.start; self.popped_context = None; + self.has_matched = None; } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -96,66 +98,133 @@ impl<'a> State<'a> { fn marks_pop_discard(&mut self) { self.marks_stack.pop(); } + + pub fn pymatch(mut self) -> Self { + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + }; + self.context_stack.push(ctx); + + let mut dispatcher = 
OpcodeDispatcher::new(); + let mut has_matched = None; + + loop { + if self.context_stack.is_empty() { + break; + } + let ctx_id = self.context_stack.len() - 1; + let mut drive = StackDrive::drive(ctx_id, self); + + has_matched = dispatcher.pymatch(&mut drive); + self = drive.take(); + if has_matched.is_some() { + self.popped_context = self.context_stack.pop(); + } + } + + self.has_matched = has_matched; + self + } + + pub fn search(mut self) -> Self { + // TODO: optimize by op info and skip prefix + loop { + self = self.pymatch(); + + if self.has_matched == Some(true) { + return self; + } + self.start += 1; + if self.start > self.end { + return self; + } + self.reset(); + } + } } -pub(crate) fn pymatch( - string: PyStrRef, - start: usize, - end: usize, - pattern: PyRef, -) -> Option { - let mut state = State::new( - string.borrow_value(), - start, - end, - pattern.flags, - &pattern.code, - ); - let ctx = MatchContext { - string_position: state.start, - string_offset: calc_string_offset(state.string, state.start), - code_position: 0, - has_matched: None, - }; - state.context_stack.push(ctx); - let mut dispatcher = OpcodeDispatcher::new(); +#[derive(Debug, Clone, Copy)] +pub(crate) enum StrDrive<'a> { + Str(&'a str), + Bytes(&'a [u8]), +} +impl<'a> StrDrive<'a> { + fn offset(&self, offset: usize, skip: usize) -> usize { + match *self { + StrDrive::Str(s) => s + .get(offset..) 
+ .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) + .unwrap_or_else(|| s.len()), + StrDrive::Bytes(b) => std::cmp::min(offset + skip, b.len()), + } + } - let mut has_matched = None; - loop { - if state.context_stack.is_empty() { - break; + pub fn count(&self) -> usize { + match *self { + StrDrive::Str(s) => s.chars().count(), + StrDrive::Bytes(b) => b.len(), } - let ctx_id = state.context_stack.len() - 1; - let mut drive = StackDrive::drive(ctx_id, state); + } - has_matched = dispatcher.pymatch(&mut drive); - state = drive.take(); - if has_matched.is_some() { - state.popped_context = state.context_stack.pop(); + fn peek(&self, offset: usize) -> u32 { + match *self { + StrDrive::Str(s) => unsafe { s.get_unchecked(offset..) }.chars().next().unwrap() as u32, + StrDrive::Bytes(b) => b[offset] as u32, } } - if has_matched != Some(true) { - None - } else { - Some(Match::new(&state, pattern.clone(), string.clone())) + fn back_peek(&self, offset: usize) -> u32 { + match *self { + StrDrive::Str(s) => { + let bytes = s.as_bytes(); + let back_offset = utf8_back_peek_offset(bytes, offset); + match offset - back_offset { + 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), + 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), + 3 => { + u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]) + } + 4 => u32::from_ne_bytes([ + bytes[offset], + bytes[offset + 1], + bytes[offset + 2], + bytes[offset + 3], + ]), + _ => unreachable!(), + } + } + StrDrive::Bytes(b) => b[offset - 1] as u32, + } } -} -pub(crate) fn search( - string: PyStrRef, - start: usize, - end: usize, - pattern: PyRef, -) -> Option { - // TODO: optimize by op info and skip prefix - let end = std::cmp::min(end, string.char_len()); - for i in start..end + 1 { - if let Some(m) = pymatch(string.clone(), i, end, pattern.clone()) { - return Some(m); + fn back_offset(&self, offset: usize, skip: usize) -> usize { + match *self { + StrDrive::Str(s) => { + let bytes = 
s.as_bytes(); + let mut back_offset = offset; + for _ in 0..skip { + back_offset = utf8_back_peek_offset(bytes, back_offset); + } + back_offset + } + StrDrive::Bytes(_) => offset - skip, + } + } + + pub fn slice_to_pyobject(&self, start: usize, end: usize, vm: &VirtualMachine) -> PyObjectRef { + match *self { + StrDrive::Str(s) => s + .chars() + .take(end) + .skip(start) + .collect::() + .into_pyobject(vm), + StrDrive::Bytes(b) => PyBytes::from(b[start..end].to_vec()).into_pyobject(vm), } } - None } #[derive(Debug, Copy, Clone)] @@ -173,33 +242,21 @@ trait MatchContextDrive { fn repeat_ctx(&self) -> &RepeatContext { self.state().repeat_stack.last().unwrap() } - fn str(&self) -> &str { - unsafe { - std::str::from_utf8_unchecked( - &self.state().string.as_bytes()[self.ctx().string_offset..], - ) - } - } fn pattern(&self) -> &[u32] { &self.state().pattern_codes[self.ctx().code_position..] } - fn peek_char(&self) -> char { - self.str().chars().next().unwrap() + fn peek_char(&self) -> u32 { + self.state().string.peek(self.ctx().string_offset) } fn peek_code(&self, peek: usize) -> u32 { self.state().pattern_codes[self.ctx().code_position + peek] } fn skip_char(&mut self, skip_count: usize) { - match self.str().char_indices().nth(skip_count).map(|x| x.0) { - Some(skipped) => { - self.ctx_mut().string_position += skip_count; - self.ctx_mut().string_offset += skipped; - } - None => { - self.ctx_mut().string_position = self.state().end; - self.ctx_mut().string_offset = self.state().string.len(); // bytes len - } - } + self.ctx_mut().string_offset = self + .state() + .string + .offset(self.ctx().string_offset, skip_count); + self.ctx_mut().string_position += skip_count; } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; @@ -222,7 +279,7 @@ trait MatchContextDrive { fn at_linebreak(&self) -> bool { !self.at_end() && is_linebreak(self.peek_char()) } - fn at_boundary bool>(&self, mut word_checker: F) -> bool { + fn at_boundary bool>(&self, 
mut word_checker: F) -> bool { if self.at_beginning() && self.at_end() { return false; } @@ -230,47 +287,15 @@ trait MatchContextDrive { let this = !self.at_end() && word_checker(self.peek_char()); this != that } - fn back_peek_offset(&self) -> usize { - let bytes = self.state().string.as_bytes(); - let mut offset = self.ctx().string_offset - 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - panic!("not utf-8 code point"); - } - } - } - } - offset - } - fn back_peek_char(&self) -> char { - let bytes = self.state().string.as_bytes(); - let offset = self.back_peek_offset(); - let current_offset = self.ctx().string_offset; - let code = match current_offset - offset { - 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), - 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), - 3 => u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]), - 4 => u32::from_ne_bytes([ - bytes[offset], - bytes[offset + 1], - bytes[offset + 2], - bytes[offset + 3], - ]), - _ => unreachable!(), - }; - // TODO: char::from_u32_unchecked is stable from 1.5.0 - unsafe { std::mem::transmute(code) } + fn back_peek_char(&self) -> u32 { + self.state().string.back_peek(self.ctx().string_offset) } fn back_skip_char(&mut self, skip_count: usize) { self.ctx_mut().string_position -= skip_count; - for _ in 0..skip_count { - self.ctx_mut().string_offset = self.back_peek_offset(); - } + self.ctx_mut().string_offset = self + .state() + .string + .back_offset(self.ctx().string_offset, skip_count); } } @@ -529,22 +554,22 @@ impl OpcodeDispatcher { drive.skip_code(drive.peek_code(1) as usize + 1); }), SreOpcode::LITERAL => once(|drive| { - general_op_literal(drive, |code, c| code == c as u32); + general_op_literal(drive, |code, c| code == c); }), SreOpcode::NOT_LITERAL => once(|drive| { - 
general_op_literal(drive, |code, c| code != c as u32); + general_op_literal(drive, |code, c| code != c); }), SreOpcode::LITERAL_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code == lower_ascii(c) as u32); + general_op_literal(drive, |code, c| code == lower_ascii(c)); }), SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code != lower_ascii(c) as u32); + general_op_literal(drive, |code, c| code != lower_ascii(c)); }), SreOpcode::LITERAL_UNI_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code == lower_unicode(c) as u32); + general_op_literal(drive, |code, c| code == lower_unicode(c)); }), SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| { - general_op_literal(drive, |code, c| code != lower_unicode(c) as u32); + general_op_literal(drive, |code, c| code != lower_unicode(c)); }), SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { general_op_literal(drive, char_loc_ignore); @@ -610,19 +635,11 @@ impl OpcodeDispatcher { } } -fn calc_string_offset(string: &str, position: usize) -> usize { - string - .char_indices() - .nth(position) - .map(|(i, _)| i) - .unwrap_or(0) -} - -fn char_loc_ignore(code: u32, c: char) -> bool { - code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 +fn char_loc_ignore(code: u32, c: u32) -> bool { + code == c || code == lower_locate(c) || code == upper_locate(c) } -fn charset_loc_ignore(set: &[u32], c: char) -> bool { +fn charset_loc_ignore(set: &[u32], c: u32) -> bool { let lo = lower_locate(c); if charset(set, c) { return true; @@ -631,7 +648,7 @@ fn charset_loc_ignore(set: &[u32], c: char) -> bool { up != lo && charset(set, up) } -fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) { +fn general_op_groupref u32>(drive: &mut StackDrive, mut f: F) { let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); let (group_start, group_end) = match (group_start, group_end) { (Some(start), Some(end)) if start <= end => 
(start, end), @@ -645,7 +662,7 @@ fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) MatchContext { string_position: group_start, // TODO: cache the offset - string_offset: calc_string_offset(drive.state.string, group_start), + string_offset: drive.state.string.offset(0, group_start), ..*drive.ctx() }, &drive, @@ -665,7 +682,7 @@ fn general_op_groupref char>(drive: &mut StackDrive, mut f: F) drive.ctx_mut().string_offset = offset; } -fn general_op_literal bool>(drive: &mut StackDrive, f: F) { +fn general_op_literal bool>(drive: &mut StackDrive, f: F) { if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); } else { @@ -674,7 +691,7 @@ fn general_op_literal bool>(drive: &mut StackDrive, f: F } } -fn general_op_in bool>(drive: &mut StackDrive, f: F) { +fn general_op_in bool>(drive: &mut StackDrive, f: F) { let skip = drive.peek_code(1) as usize; if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); @@ -700,7 +717,7 @@ fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { } } -fn category(catcode: SreCatCode, c: char) -> bool { +fn category(catcode: SreCatCode, c: u32) -> bool { match catcode { SreCatCode::DIGIT => is_digit(c), SreCatCode::NOT_DIGIT => !is_digit(c), @@ -723,9 +740,8 @@ fn category(catcode: SreCatCode, c: char) -> bool { } } -fn charset(set: &[u32], c: char) -> bool { +fn charset(set: &[u32], ch: u32) -> bool { /* check if character is a member of the given set */ - let ch = c as u32; let mut ok = true; let mut i = 0; while i < set.len() { @@ -747,7 +763,7 @@ fn charset(set: &[u32], c: char) -> bool { break; } }; - if category(catcode, c) { + if category(catcode, ch) { return ok; } i += 2; @@ -801,7 +817,7 @@ fn charset(set: &[u32], c: char) -> bool { if set[i + 1] <= ch && ch <= set[i + 2] { return ok; } - let ch = upper_unicode(c) as u32; + let ch = upper_unicode(ch); if set[i + 1] <= ch && ch <= set[i + 2] { return ok; } @@ 
-898,86 +914,128 @@ fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { drive.ctx().string_position - drive.state().string_position } -fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { +fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { let ch = drive.peek_code(1); while !drive.ctx().string_position < end && f(ch, drive.peek_char()) { drive.skip_char(1); } } -fn eq_loc_ignore(code: u32, c: char) -> bool { - code == c as u32 || code == lower_locate(c) as u32 || code == upper_locate(c) as u32 +fn eq_loc_ignore(code: u32, ch: u32) -> bool { + code == ch || code == lower_locate(ch) || code == upper_locate(ch) } -fn is_word(c: char) -> bool { - c.is_ascii_alphanumeric() || c == '_' +fn is_word(ch: u32) -> bool { + ch == '_' as u32 + || u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) } -fn is_space(c: char) -> bool { - c.is_ascii_whitespace() +fn is_space(ch: u32) -> bool { + u8::try_from(ch) + .map(is_py_ascii_whitespace) + .unwrap_or(false) } -fn is_digit(c: char) -> bool { - c.is_ascii_digit() +fn is_digit(ch: u32) -> bool { + u8::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) } -fn is_loc_alnum(c: char) -> bool { +fn is_loc_alnum(ch: u32) -> bool { // TODO: check with cpython - c.is_alphanumeric() + u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) } -fn is_loc_word(c: char) -> bool { - is_loc_alnum(c) || c == '_' +fn is_loc_word(ch: u32) -> bool { + ch == '_' as u32 || is_loc_alnum(ch) } -fn is_linebreak(c: char) -> bool { - c == '\n' +fn is_linebreak(ch: u32) -> bool { + ch == '\n' as u32 } -pub(crate) fn lower_ascii(c: char) -> char { - c.to_ascii_lowercase() +pub(crate) fn lower_ascii(ch: u32) -> u32 { + u8::try_from(ch) + .map(|x| x.to_ascii_lowercase() as u32) + .unwrap_or(ch) } -fn lower_locate(c: char) -> char { +fn lower_locate(ch: u32) -> u32 { // TODO: check with cpython // 
https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase - c.to_lowercase().next().unwrap() + lower_ascii(ch) } -fn upper_locate(c: char) -> char { +fn upper_locate(ch: u32) -> u32 { // TODO: check with cpython // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase - c.to_uppercase().next().unwrap() + u8::try_from(ch) + .map(|x| x.to_ascii_uppercase() as u32) + .unwrap_or(ch) } -fn is_uni_digit(c: char) -> bool { +fn is_uni_digit(ch: u32) -> bool { // TODO: check with cpython - c.is_digit(10) + char::try_from(ch).map(|x| x.is_digit(10)).unwrap_or(false) } -fn is_uni_space(c: char) -> bool { +fn is_uni_space(ch: u32) -> bool { // TODO: check with cpython - c.is_whitespace() -} -fn is_uni_linebreak(c: char) -> bool { + is_space(ch) + || matches!( + ch, + 0x0009 + | 0x000A + | 0x000B + | 0x000C + | 0x000D + | 0x001C + | 0x001D + | 0x001E + | 0x001F + | 0x0020 + | 0x0085 + | 0x00A0 + | 0x1680 + | 0x2000 + | 0x2001 + | 0x2002 + | 0x2003 + | 0x2004 + | 0x2005 + | 0x2006 + | 0x2007 + | 0x2008 + | 0x2009 + | 0x200A + | 0x2028 + | 0x2029 + | 0x202F + | 0x205F + | 0x3000 + ) +} +fn is_uni_linebreak(ch: u32) -> bool { matches!( - c, - '\u{000A}' - | '\u{000B}' - | '\u{000C}' - | '\u{000D}' - | '\u{001C}' - | '\u{001D}' - | '\u{001E}' - | '\u{0085}' - | '\u{2028}' - | '\u{2029}' + ch, + 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 ) } -fn is_uni_alnum(c: char) -> bool { +fn is_uni_alnum(ch: u32) -> bool { // TODO: check with cpython - c.is_alphanumeric() + char::try_from(ch) + .map(|x| x.is_alphanumeric()) + .unwrap_or(false) } -fn is_uni_word(c: char) -> bool { - is_uni_alnum(c) || c == '_' +fn is_uni_word(ch: u32) -> bool { + ch == '_' as u32 || is_uni_alnum(ch) } -pub(crate) fn lower_unicode(c: char) -> char { +pub(crate) fn lower_unicode(ch: u32) -> u32 { // TODO: check with cpython - c.to_lowercase().next().unwrap() + char::try_from(ch) + .map(|x| x.to_lowercase().next().unwrap() as u32) + 
.unwrap_or(ch) } -pub(crate) fn upper_unicode(c: char) -> char { +pub(crate) fn upper_unicode(ch: u32) -> u32 { // TODO: check with cpython - c.to_uppercase().next().unwrap() + char::try_from(ch) + .map(|x| x.to_uppercase().next().unwrap() as u32) + .unwrap_or(ch) } fn is_utf8_first_byte(b: u8) -> bool { @@ -988,6 +1046,23 @@ fn is_utf8_first_byte(b: u8) -> bool { (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) } +fn utf8_back_peek_offset(bytes: &[u8], offset: usize) -> usize { + let mut offset = offset - 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + offset -= 1; + if !is_utf8_first_byte(bytes[offset]) { + panic!("not utf-8 code point"); + } + } + } + } + offset +} + #[derive(Debug, Copy, Clone)] struct RepeatContext { count: isize, From 97000fc4e046c5f9d7da42357d8b3290e33972db Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 19 Jan 2021 17:14:00 +0200 Subject: [PATCH 026/705] fix multiple bugs; skip crash tests --- interp.rs | 220 ++++++++++++++++++++++-------------------------------- 1 file changed, 88 insertions(+), 132 deletions(-) diff --git a/interp.rs b/interp.rs index 7c9246142a..dfb2e04e47 100644 --- a/interp.rs +++ b/interp.rs @@ -131,18 +131,17 @@ impl<'a> State<'a> { pub fn search(mut self) -> Self { // TODO: optimize by op info and skip prefix - loop { + while self.start <= self.end { self = self.pymatch(); if self.has_matched == Some(true) { return self; } self.start += 1; - if self.start > self.end { - return self; - } self.reset(); } + + self } } @@ -182,16 +181,19 @@ impl<'a> StrDrive<'a> { let bytes = s.as_bytes(); let back_offset = utf8_back_peek_offset(bytes, offset); match offset - back_offset { - 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset]]), - 2 => u32::from_ne_bytes([0, 0, bytes[offset], bytes[offset + 1]]), - 3 => { - u32::from_ne_bytes([0, bytes[offset], bytes[offset + 1], bytes[offset + 2]]) - } + 1 => 
u32::from_ne_bytes([0, 0, 0, bytes[offset - 1]]), + 2 => u32::from_ne_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), + 3 => u32::from_ne_bytes([ + 0, + bytes[offset - 3], + bytes[offset - 2], + bytes[offset - 1], + ]), 4 => u32::from_ne_bytes([ - bytes[offset], - bytes[offset + 1], - bytes[offset + 2], - bytes[offset + 3], + bytes[offset - 4], + bytes[offset - 3], + bytes[offset - 2], + bytes[offset - 1], ]), _ => unreachable!(), } @@ -222,7 +224,10 @@ impl<'a> StrDrive<'a> { .skip(start) .collect::() .into_pyobject(vm), - StrDrive::Bytes(b) => PyBytes::from(b[start..end].to_vec()).into_pyobject(vm), + StrDrive::Bytes(b) => { + PyBytes::from(b.iter().take(end).skip(start).cloned().collect::>()) + .into_pyobject(vm) + } } } } @@ -256,13 +261,11 @@ trait MatchContextDrive { .state() .string .offset(self.ctx().string_offset, skip_count); - self.ctx_mut().string_position += skip_count; + self.ctx_mut().string_position = + std::cmp::min(self.ctx().string_position + skip_count, self.state().end); } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; - if self.ctx().code_position > self.state().pattern_codes.len() { - self.ctx_mut().code_position = self.state().pattern_codes.len(); - } } fn remaining_chars(&self) -> usize { self.state().end - self.ctx().string_position @@ -314,9 +317,7 @@ impl<'a> StackDrive<'a> { self.state } fn push_new_context(&mut self, pattern_offset: usize) { - let mut child_ctx = MatchContext { ..*self.ctx() }; - child_ctx.code_position += pattern_offset; - self.state.context_stack.push(child_ctx); + self.push_new_context_at(self.ctx().code_position + pattern_offset); } fn push_new_context_at(&mut self, code_position: usize) { let mut child_ctx = MatchContext { ..*self.ctx() }; @@ -352,11 +353,9 @@ impl MatchContextDrive for WrapDrive<'_> { fn ctx_mut(&mut self) -> &mut MatchContext { &mut self.ctx } - fn ctx(&self) -> &MatchContext { &self.ctx } - fn state(&self) -> &State { self.stack_drive.state() } @@ 
-833,6 +832,7 @@ fn charset(set: &[u32], ch: u32) -> bool { false } +/* General case */ fn count(drive: &mut StackDrive, maxcount: usize) -> usize { let mut count = 0; let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); @@ -854,6 +854,8 @@ fn count(drive: &mut StackDrive, maxcount: usize) -> usize { count } +/* TODO: check literal cases should improve the perfermance + fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); @@ -924,6 +926,7 @@ fn general_count_literal bool>(drive: &mut WrapDrive, end: fn eq_loc_ignore(code: u32, ch: u32) -> bool { code == ch || code == lower_locate(ch) || code == upper_locate(ch) } +*/ fn is_word(ch: u32) -> bool { ch == '_' as u32 @@ -1073,6 +1076,7 @@ struct RepeatContext { maxcount: usize, } +#[derive(Default)] struct OpMinRepeatOne { jump_id: usize, mincount: usize, @@ -1082,102 +1086,79 @@ struct OpMinRepeatOne { impl OpcodeExecutor for OpMinRepeatOne { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { - 0 => self._0(drive), - 1 => self._1(drive), - 2 => self._2(drive), - _ => unreachable!(), - } - } -} -impl Default for OpMinRepeatOne { - fn default() -> Self { - OpMinRepeatOne { - jump_id: 0, - mincount: 0, - maxcount: 0, - count: 0, - } - } -} -impl OpMinRepeatOne { - fn _0(&mut self, drive: &mut StackDrive) -> Option<()> { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; + 0 => { + self.mincount = drive.peek_code(2) as usize; + self.maxcount = drive.peek_code(3) as usize; - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } + if drive.remaining_chars() < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } - drive.state.string_position = drive.ctx().string_position; + drive.state.string_position = 
drive.ctx().string_position; - self.count = if self.mincount == 0 { - 0 - } else { - let count = count(drive, self.mincount); - if count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(count); - count - }; + self.count = if self.mincount == 0 { + 0 + } else { + let count = count(drive, self.mincount); + if count < self.mincount { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(count); + count + }; - if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } + if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + return None; + } - drive.state.marks_push(); - self.jump_id = 1; - self._1(drive) - } - fn _1(&mut self, drive: &mut StackDrive) -> Option<()> { - if self.maxcount == MAXREPEAT || self.count <= self.maxcount { - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - return Some(()); - } + drive.state.marks_push(); + self.jump_id = 1; + self.next(drive) + } + 1 => { + if self.maxcount == MAXREPEAT || self.count <= self.maxcount { + drive.state.string_position = drive.ctx().string_position; + drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + return Some(()); + } - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - None - } - fn _2(&mut self, drive: &mut StackDrive) -> Option<()> { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - if count(drive, 1) == 0 { - drive.ctx_mut().has_matched 
= Some(false); - return None; + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + None + } + 2 => { + let child_ctx = drive.state.popped_context.unwrap(); + if child_ctx.has_matched == Some(true) { + drive.ctx_mut().has_matched = Some(true); + return None; + } + drive.state.string_position = drive.ctx().string_position; + if count(drive, 1) == 0 { + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.skip_char(1); + self.count += 1; + drive.state.marks_pop_keep(); + self.jump_id = 1; + self.next(drive) + } + _ => unreachable!(), } - drive.skip_char(1); - self.count += 1; - drive.state.marks_pop_keep(); - self.jump_id = 1; - self._1(drive) } } +#[derive(Default)] struct OpMaxUntil { jump_id: usize, count: isize, save_last_position: usize, } -impl Default for OpMaxUntil { - fn default() -> Self { - OpMaxUntil { - jump_id: 0, - count: 0, - save_last_position: 0, - } - } -} impl OpcodeExecutor for OpMaxUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1259,20 +1240,12 @@ impl OpcodeExecutor for OpMaxUntil { } } +#[derive(Default)] struct OpMinUntil { jump_id: usize, count: isize, save_repeat: Option, } -impl Default for OpMinUntil { - fn default() -> Self { - Self { - jump_id: 0, - count: 0, - save_repeat: None, - } - } -} impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1338,18 +1311,11 @@ impl OpcodeExecutor for OpMinUntil { } } +#[derive(Default)] struct OpBranch { jump_id: usize, current_branch_length: usize, } -impl Default for OpBranch { - fn default() -> Self { - Self { - jump_id: 0, - current_branch_length: 0, - } - } -} impl OpcodeExecutor for OpBranch { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1388,22 +1354,13 @@ impl OpcodeExecutor for OpBranch { } } +#[derive(Default)] struct OpRepeatOne { jump_id: usize, mincount: usize, maxcount: usize, count: isize, } 
-impl Default for OpRepeatOne { - fn default() -> Self { - Self { - jump_id: 0, - mincount: 0, - maxcount: 0, - count: 0, - } - } -} impl OpcodeExecutor for OpRepeatOne { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { @@ -1413,7 +1370,6 @@ impl OpcodeExecutor for OpRepeatOne { if drive.remaining_chars() < self.mincount { drive.ctx_mut().has_matched = Some(false); - return None; } drive.state.string_position = drive.ctx().string_position; self.count = count(drive, self.maxcount) as isize; From 84113cba2cd66b83106f4564b03ae8cb999197c3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 20 Jan 2021 16:33:30 +0200 Subject: [PATCH 027/705] fix zero width repeat --- interp.rs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/interp.rs b/interp.rs index dfb2e04e47..8b5a023ffd 100644 --- a/interp.rs +++ b/interp.rs @@ -1245,6 +1245,7 @@ struct OpMinUntil { jump_id: usize, count: isize, save_repeat: Option, + save_last_position: usize, } impl OpcodeExecutor for OpMinUntil { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { @@ -1280,6 +1281,7 @@ impl OpcodeExecutor for OpMinUntil { drive.ctx_mut().has_matched = child_ctx.has_matched; if drive.ctx().has_matched != Some(true) { drive.repeat_ctx_mut().count = self.count - 1; + drive.repeat_ctx_mut().last_position = self.save_last_position; drive.state.string_position = drive.ctx().string_position; } None @@ -1295,13 +1297,26 @@ impl OpcodeExecutor for OpMinUntil { drive.state.marks_pop(); // match more unital tail matches - let maxcount = drive.repeat_ctx().maxcount; - let code_position = drive.repeat_ctx().code_position; - if self.count as usize >= maxcount && maxcount != MAXREPEAT { + let RepeatContext { + count: _, + code_position, + last_position, + mincount: _, + maxcount, + } = *drive.repeat_ctx(); + + if self.count as usize >= maxcount && maxcount != MAXREPEAT + || drive.state.string_position == last_position + { 
drive.ctx_mut().has_matched = Some(false); return None; } drive.repeat_ctx_mut().count = self.count; + + /* zero-width match protection */ + self.save_last_position = last_position; + drive.repeat_ctx_mut().last_position = drive.state.string_position; + drive.push_new_context_at(code_position + 4); self.jump_id = 1; Some(()) From f2311b56fcbc87431eb0e291b68f44bb3658c2c1 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 21 Jan 2021 12:48:32 +0200 Subject: [PATCH 028/705] fix op branch --- interp.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/interp.rs b/interp.rs index 8b5a023ffd..5fa7ca8c0e 100644 --- a/interp.rs +++ b/interp.rs @@ -1329,28 +1329,30 @@ impl OpcodeExecutor for OpMinUntil { #[derive(Default)] struct OpBranch { jump_id: usize, - current_branch_length: usize, + branch_offset: usize, } impl OpcodeExecutor for OpBranch { + // alternation + // <0=skip> code ... fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => { drive.state.marks_push(); // jump out the head - self.current_branch_length = 1; + self.branch_offset = 1; self.jump_id = 1; self.next(drive) } 1 => { - drive.skip_code(self.current_branch_length); - self.current_branch_length = drive.peek_code(0) as usize; - if self.current_branch_length == 0 { + let next_branch_length = drive.peek_code(self.branch_offset) as usize; + if next_branch_length == 0 { drive.state.marks_pop_discard(); drive.ctx_mut().has_matched = Some(false); return None; } drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(1); + drive.push_new_context(self.branch_offset + 1); + self.branch_offset += next_branch_length; self.jump_id = 2; Some(()) } From 6a792324b027bea56c39cb04fe072f719fd1efed Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 21 Jan 2021 16:14:23 +0200 Subject: [PATCH 029/705] fix at_beginning --- interp.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/interp.rs b/interp.rs index 
5fa7ca8c0e..11010c7152 100644 --- a/interp.rs +++ b/interp.rs @@ -274,7 +274,8 @@ trait MatchContextDrive { self.state().pattern_codes.len() - self.ctx().code_position } fn at_beginning(&self) -> bool { - self.ctx().string_position == self.state().start + // self.ctx().string_position == self.state().start + self.ctx().string_position == 0 } fn at_end(&self) -> bool { self.ctx().string_position == self.state().end From 7f0dad7901751abcc09ac8308742c91627401c81 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 21 Jan 2021 19:24:10 +0200 Subject: [PATCH 030/705] fix back_peek_char --- interp.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interp.rs b/interp.rs index 11010c7152..beb96f1f09 100644 --- a/interp.rs +++ b/interp.rs @@ -181,15 +181,15 @@ impl<'a> StrDrive<'a> { let bytes = s.as_bytes(); let back_offset = utf8_back_peek_offset(bytes, offset); match offset - back_offset { - 1 => u32::from_ne_bytes([0, 0, 0, bytes[offset - 1]]), - 2 => u32::from_ne_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), - 3 => u32::from_ne_bytes([ + 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), + 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), + 3 => u32::from_be_bytes([ 0, bytes[offset - 3], bytes[offset - 2], bytes[offset - 1], ]), - 4 => u32::from_ne_bytes([ + 4 => u32::from_be_bytes([ bytes[offset - 4], bytes[offset - 3], bytes[offset - 2], From 4416158eceb9dd6931ca25cfc610f50582dd83dc Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 22 Jan 2021 16:40:33 +0200 Subject: [PATCH 031/705] fix multiple bugs; pass tests --- interp.rs | 97 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 27 deletions(-) diff --git a/interp.rs b/interp.rs index beb96f1f09..71e3ead143 100644 --- a/interp.rs +++ b/interp.rs @@ -25,6 +25,7 @@ pub(crate) struct State<'a> { pub string_position: usize, popped_context: Option, pub has_matched: Option, + pub match_all: bool, } impl<'a> State<'a> { 
@@ -51,6 +52,7 @@ impl<'a> State<'a> { string_position: start, popped_context: None, has_matched: None, + match_all: false, } } @@ -105,6 +107,7 @@ impl<'a> State<'a> { string_offset: self.string.offset(0, self.start), code_position: 0, has_matched: None, + toplevel: true, }; self.context_stack.push(ctx); @@ -132,6 +135,7 @@ impl<'a> State<'a> { pub fn search(mut self) -> Self { // TODO: optimize by op info and skip prefix while self.start <= self.end { + self.match_all = false; self = self.pymatch(); if self.has_matched == Some(true) { @@ -232,12 +236,13 @@ impl<'a> StrDrive<'a> { } } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Clone, Copy)] struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, + toplevel: bool, } trait MatchContextDrive { @@ -463,8 +468,12 @@ impl OpcodeDispatcher { drive.ctx_mut().has_matched = Some(false); }), SreOpcode::SUCCESS => once(|drive| { - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); + if drive.ctx().toplevel && drive.state.match_all && !drive.at_end() { + drive.ctx_mut().has_matched = Some(false); + } else { + drive.state.string_position = drive.ctx().string_position; + drive.ctx_mut().has_matched = Some(true); + } }), SreOpcode::ANY => once(|drive| { if drive.at_end() || drive.at_linebreak() { @@ -482,15 +491,19 @@ impl OpcodeDispatcher { drive.skip_char(1); } }), + /* assert subpattern */ + /* */ SreOpcode::ASSERT => twice( |drive| { let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { + let passed = drive.ctx().string_position - drive.state.start; + if passed < back { drive.ctx_mut().has_matched = Some(false); return; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); + drive.state.context_stack.last_mut().unwrap().toplevel = false; }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -504,12 +517,14 @@ impl OpcodeDispatcher { 
SreOpcode::ASSERT_NOT => twice( |drive| { let back = drive.peek_code(2) as usize; - if back > drive.ctx().string_position { + let passed = drive.ctx().string_position - drive.state.start; + if passed < back { drive.skip_code(drive.peek_code(1) as usize + 1); return; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); + drive.state.context_stack.last_mut().unwrap().toplevel = false; }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -770,17 +785,17 @@ fn charset(set: &[u32], ch: u32) -> bool { } SreOpcode::CHARSET => { /* */ - let set = &set[1..]; + let set = &set[i + 1..]; if ch < 256 && ((set[(ch >> 5) as usize] & (1u32 << (ch & 31))) != 0) { return ok; } - i += 8; + i += 1 + 8; } SreOpcode::BIGCHARSET => { /* <256 blockindices> */ let count = set[i + 1] as usize; if ch < 0x10000 { - let set = &set[2..]; + let set = &set[i + 2..]; let block_index = ch >> 8; let (_, blockindices, _) = unsafe { set.align_to::() }; let blocks = &set[64..]; @@ -1085,6 +1100,7 @@ struct OpMinRepeatOne { count: usize, } impl OpcodeExecutor for OpMinRepeatOne { + /* <1=min> <2=max> item tail */ fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => { @@ -1110,7 +1126,11 @@ impl OpcodeExecutor for OpMinRepeatOne { count }; - if drive.peek_code(drive.peek_code(1) as usize + 1) == SreOpcode::SUCCESS as u32 { + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 + && !(drive.ctx().toplevel && drive.state.match_all && !drive.at_end()) + { + // tail is empty. 
we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); return None; @@ -1377,9 +1397,18 @@ struct OpRepeatOne { jump_id: usize, mincount: usize, maxcount: usize, - count: isize, + count: usize, + following_literal: Option, } impl OpcodeExecutor for OpRepeatOne { + /* match repeated sequence (maximizing regexp) */ + + /* this operator only works if the repeated item is + exactly one character wide, and we're not already + collecting backtracking points. for other cases, + use the MAX_REPEAT operator */ + + /* <1=min> <2=max> item tail */ fn next(&mut self, drive: &mut StackDrive) -> Option<()> { match self.jump_id { 0 => { @@ -1388,17 +1417,21 @@ impl OpcodeExecutor for OpRepeatOne { if drive.remaining_chars() < self.mincount { drive.ctx_mut().has_matched = Some(false); + return None; } + drive.state.string_position = drive.ctx().string_position; - self.count = count(drive, self.maxcount) as isize; - drive.skip_char(self.count as usize); - if self.count < self.mincount as isize { + + self.count = count(drive, self.maxcount); + drive.skip_char(self.count); + if self.count < self.mincount { drive.ctx_mut().has_matched = Some(false); return None; } let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 { + if next_code == SreOpcode::SUCCESS as u32 && drive.at_end() && !drive.ctx().toplevel + { // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); @@ -1406,24 +1439,34 @@ impl OpcodeExecutor for OpRepeatOne { } drive.state.marks_push(); - // TODO: + // Special case: Tail starts with a literal. Skip positions where // the rest of the pattern cannot possibly match. 
+ if next_code == SreOpcode::LITERAL as u32 { + self.following_literal = Some(drive.peek_code(drive.peek_code(1) as usize + 2)) + } + self.jump_id = 1; self.next(drive) } 1 => { - // General case: backtracking - if self.count >= self.mincount as isize { - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - return Some(()); + if let Some(c) = self.following_literal { + while drive.at_end() || drive.peek_char() != c { + if self.count <= self.mincount { + drive.state.marks_pop_discard(); + drive.ctx_mut().has_matched = Some(false); + return None; + } + drive.back_skip_char(1); + self.count -= 1; + } } - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - None + // General case: backtracking + drive.state.string_position = drive.ctx().string_position; + drive.push_new_context(drive.peek_code(1) as usize + 1); + self.jump_id = 2; + Some(()) } 2 => { let child_ctx = drive.state.popped_context.unwrap(); @@ -1431,19 +1474,19 @@ impl OpcodeExecutor for OpRepeatOne { drive.ctx_mut().has_matched = Some(true); return None; } - if self.count <= self.mincount as isize { + if self.count <= self.mincount { drive.state.marks_pop_discard(); drive.ctx_mut().has_matched = Some(false); return None; } - // TODO: unnesscary double check drive.back_skip_char(1); self.count -= 1; + drive.state.marks_pop_keep(); self.jump_id = 1; - Some(()) + self.next(drive) } _ => unreachable!(), } From db5bd646b928048c7183f46ede40a114f19a82e4 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 27 Jan 2021 18:54:38 -0600 Subject: [PATCH 032/705] Initial commit to switch to new repo --- .gitignore | 2 ++ Cargo.toml | 7 +++++++ constants.rs => src/constants.rs | 0 interp.rs => src/engine.rs | 0 src/lib.rs | 2 ++ 5 files changed, 11 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml rename constants.rs => src/constants.rs (100%) 
rename interp.rs => src/engine.rs (100%) create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..96ef6c0b94 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000000..f9f504966b --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "sre-engine" +version = "0.1.0" +authors = ["Kangzhi Shi ", "RustPython Team"] +edition = "2018" + +[dependencies] diff --git a/constants.rs b/src/constants.rs similarity index 100% rename from constants.rs rename to src/constants.rs diff --git a/interp.rs b/src/engine.rs similarity index 100% rename from interp.rs rename to src/engine.rs diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000000..f305aa094a --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,2 @@ +pub mod constants; +pub mod engine; From 9c95994dab20fea56004940ba50323ee7e327f81 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 27 Jan 2021 19:18:42 -0600 Subject: [PATCH 033/705] Modify to work outside of rustpython-vm --- Cargo.toml | 2 ++ src/engine.rs | 38 +++++++++++--------------------------- src/lib.rs | 12 ++++++++++++ 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f9f504966b..3a82ba73f1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,3 +5,5 @@ authors = ["Kangzhi Shi ", "RustPython Team"] edition = "2018" [dependencies] +num_enum = "0.5" +bitflags = "1.2" diff --git a/src/engine.rs b/src/engine.rs index 71e3ead143..9aff2dcc91 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,17 +1,16 @@ // good luck to those that follow; here be dragons -use super::_sre::MAXREPEAT; use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; -use crate::builtins::PyBytes; -use crate::bytesinner::is_py_ascii_whitespace; -use crate::pyobject::{IntoPyObject, PyObjectRef}; -use crate::VirtualMachine; +use super::MAXREPEAT; use 
std::collections::HashMap; use std::convert::TryFrom; -use std::unreachable; + +const fn is_py_ascii_whitespace(b: u8) -> bool { + matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') +} #[derive(Debug)] -pub(crate) struct State<'a> { +pub struct State<'a> { pub string: StrDrive<'a>, pub start: usize, pub end: usize, @@ -29,7 +28,7 @@ pub(crate) struct State<'a> { } impl<'a> State<'a> { - pub(crate) fn new( + pub fn new( string: StrDrive<'a>, start: usize, end: usize, @@ -150,7 +149,7 @@ impl<'a> State<'a> { } #[derive(Debug, Clone, Copy)] -pub(crate) enum StrDrive<'a> { +pub enum StrDrive<'a> { Str(&'a str), Bytes(&'a [u8]), } @@ -219,21 +218,6 @@ impl<'a> StrDrive<'a> { StrDrive::Bytes(_) => offset - skip, } } - - pub fn slice_to_pyobject(&self, start: usize, end: usize, vm: &VirtualMachine) -> PyObjectRef { - match *self { - StrDrive::Str(s) => s - .chars() - .take(end) - .skip(start) - .collect::() - .into_pyobject(vm), - StrDrive::Bytes(b) => { - PyBytes::from(b.iter().take(end).skip(start).cloned().collect::>()) - .into_pyobject(vm) - } - } - } } #[derive(Debug, Clone, Copy)] @@ -972,7 +956,7 @@ fn is_loc_word(ch: u32) -> bool { fn is_linebreak(ch: u32) -> bool { ch == '\n' as u32 } -pub(crate) fn lower_ascii(ch: u32) -> u32 { +pub fn lower_ascii(ch: u32) -> u32 { u8::try_from(ch) .map(|x| x.to_ascii_lowercase() as u32) .unwrap_or(ch) @@ -1044,13 +1028,13 @@ fn is_uni_alnum(ch: u32) -> bool { fn is_uni_word(ch: u32) -> bool { ch == '_' as u32 || is_uni_alnum(ch) } -pub(crate) fn lower_unicode(ch: u32) -> u32 { +pub fn lower_unicode(ch: u32) -> u32 { // TODO: check with cpython char::try_from(ch) .map(|x| x.to_lowercase().next().unwrap() as u32) .unwrap_or(ch) } -pub(crate) fn upper_unicode(ch: u32) -> u32 { +pub fn upper_unicode(ch: u32) -> u32 { // TODO: check with cpython char::try_from(ch) .map(|x| x.to_uppercase().next().unwrap() as u32) diff --git a/src/lib.rs b/src/lib.rs index f305aa094a..4a3ed1b754 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ 
-1,2 +1,14 @@ pub mod constants; pub mod engine; + +pub const CODESIZE: usize = 4; + +#[cfg(target_pointer_width = "32")] +pub const MAXREPEAT: usize = usize::MAX; +#[cfg(target_pointer_width = "64")] +pub const MAXREPEAT: usize = u32::MAX as usize; + +#[cfg(target_pointer_width = "32")] +pub const MAXGROUPS: usize = MAXREPEAT / 4 / 2; +#[cfg(target_pointer_width = "64")] +pub const MAXGROUPS: usize = MAXREPEAT / 2; From 2592067f95f530850db8451fd45982dc3bf161e0 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Wed, 27 Jan 2021 19:19:31 -0600 Subject: [PATCH 034/705] Add LICENSE --- Cargo.toml | 1 + LICENSE | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 LICENSE diff --git a/Cargo.toml b/Cargo.toml index 3a82ba73f1..03db7aba4f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ name = "sre-engine" version = "0.1.0" authors = ["Kangzhi Shi ", "RustPython Team"] +license = "MIT" edition = "2018" [dependencies] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..7213274e0f --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 RustPython Team + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From 2473c3e49f6ba539549a8bf4f87db9146044fc52 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 1 Feb 2021 20:25:45 +0200 Subject: [PATCH 035/705] add tests; fix OpAssert panic --- src/engine.rs | 15 ++++++++------- src/lib.rs | 2 ++ src/tests.rs | 19 +++++++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) create mode 100644 src/tests.rs diff --git a/src/engine.rs b/src/engine.rs index 9aff2dcc91..f6a6092f9b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -369,20 +369,18 @@ fn once(f: F) -> Box> { Box::new(OpOnce { f: Some(f) }) } -// F1 F2 are same identical, but workaround for closure struct OpTwice { f1: Option, f2: Option, } impl OpcodeExecutor for OpTwice where - F1: FnOnce(&mut StackDrive), + F1: FnOnce(&mut StackDrive) -> Option<()>, F2: FnOnce(&mut StackDrive), { fn next(&mut self, drive: &mut StackDrive) -> Option<()> { if let Some(f1) = self.f1.take() { - f1(drive); - Some(()) + f1(drive) } else if let Some(f2) = self.f2.take() { f2(drive); None @@ -393,7 +391,7 @@ where } fn twice(f1: F1, f2: F2) -> Box> where - F1: FnOnce(&mut StackDrive), + F1: FnOnce(&mut StackDrive) -> Option<()>, F2: FnOnce(&mut StackDrive), { Box::new(OpTwice { @@ -483,11 +481,12 @@ impl OpcodeDispatcher { let passed = drive.ctx().string_position - drive.state.start; if passed < back { drive.ctx_mut().has_matched = Some(false); - return; + return None; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); drive.state.context_stack.last_mut().unwrap().toplevel = false; + Some(()) }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -504,11 +503,12 @@ impl OpcodeDispatcher { let passed = drive.ctx().string_position - drive.state.start; if passed < 
back { drive.skip_code(drive.peek_code(1) as usize + 1); - return; + return None; } drive.state.string_position = drive.ctx().string_position - back; drive.push_new_context(3); drive.state.context_stack.last_mut().unwrap().toplevel = false; + Some(()) }, |drive| { let child_ctx = drive.state.popped_context.unwrap(); @@ -598,6 +598,7 @@ impl OpcodeDispatcher { drive.state.string_position = drive.ctx().string_position; // execute UNTIL operator drive.push_new_context(drive.peek_code(1) as usize + 1); + Some(()) }, |drive| { drive.state.repeat_stack.pop(); diff --git a/src/lib.rs b/src/lib.rs index 4a3ed1b754..eae3be617d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,7 @@ pub mod constants; pub mod engine; +#[cfg(test)] +mod tests; pub const CODESIZE: usize = 4; diff --git a/src/tests.rs b/src/tests.rs new file mode 100644 index 0000000000..95922e88a7 --- /dev/null +++ b/src/tests.rs @@ -0,0 +1,19 @@ +use engine::{State, StrDrive}; + +use super::*; + +#[test] +fn test_2427() { + let str_drive = StrDrive::Str("x"); + // r'(? 
= vec![15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1]; + let mut state = State::new( + str_drive, + 0, + std::usize::MAX, + constants::SreFlag::UNICODE, + &code, + ); + state = state.pymatch(); + assert!(state.has_matched == Some(true)); +} From cc4441b50ffa361c8f52eeecad54a9459c471e9b Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Mon, 1 Feb 2021 14:27:01 -0600 Subject: [PATCH 036/705] Compile regex patterns for tests with a script --- generate_tests.py | 37 +++++++++++++++++++++++++++++++++++++ src/engine.rs | 12 ++++++++++++ src/lib.rs | 2 -- src/tests.rs | 19 ------------------- tests/lookbehind.py | 1 + tests/lookbehind.re | 2 ++ tests/tests.rs | 26 ++++++++++++++++++++++++++ 7 files changed, 78 insertions(+), 21 deletions(-) create mode 100644 generate_tests.py delete mode 100644 src/tests.rs create mode 100644 tests/lookbehind.py create mode 100644 tests/lookbehind.re create mode 100644 tests/tests.rs diff --git a/generate_tests.py b/generate_tests.py new file mode 100644 index 0000000000..49a24792be --- /dev/null +++ b/generate_tests.py @@ -0,0 +1,37 @@ +import os +from pathlib import Path +import re +import sre_constants +import sre_compile +import sre_parse +import json + +m = re.search(r"const SRE_MAGIC: usize = (\d+);", open("src/constants.rs").read()) +sre_engine_magic = int(m.group(1)) +del m + +assert sre_constants.MAGIC == sre_engine_magic + +class CompiledPattern: + @classmethod + def compile(cls, pattern, flags=0): + p = sre_parse.parse(pattern) + code = sre_compile._code(p, flags) + self = cls() + self.pattern = pattern + self.code = code + self.flags = re.RegexFlag(flags | p.state.flags) + return self + +for k, v in re.RegexFlag.__members__.items(): + setattr(CompiledPattern, k, v) + +with os.scandir("tests") as d: + for f in d: + path = Path(f.path) + if path.suffix == ".py": + pattern = eval(path.read_text(), {"re": CompiledPattern}) + path.with_suffix(".re").write_text( + f"// 
{pattern.pattern!r}, flags={pattern.flags!r}\n" + f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" + ) diff --git a/src/engine.rs b/src/engine.rs index f6a6092f9b..a48b799e1b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -153,6 +153,18 @@ pub enum StrDrive<'a> { Str(&'a str), Bytes(&'a [u8]), } + +impl<'a> From<&'a str> for StrDrive<'a> { + fn from(s: &'a str) -> Self { + Self::Str(s) + } +} +impl<'a> From<&'a [u8]> for StrDrive<'a> { + fn from(b: &'a [u8]) -> Self { + Self::Bytes(b) + } +} + impl<'a> StrDrive<'a> { fn offset(&self, offset: usize, skip: usize) -> usize { match *self { diff --git a/src/lib.rs b/src/lib.rs index eae3be617d..4a3ed1b754 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,5 @@ pub mod constants; pub mod engine; -#[cfg(test)] -mod tests; pub const CODESIZE: usize = 4; diff --git a/src/tests.rs b/src/tests.rs deleted file mode 100644 index 95922e88a7..0000000000 --- a/src/tests.rs +++ /dev/null @@ -1,19 +0,0 @@ -use engine::{State, StrDrive}; - -use super::*; - -#[test] -fn test_2427() { - let str_drive = StrDrive::Str("x"); - // r'(? = vec![15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1]; - let mut state = State::new( - str_drive, - 0, - std::usize::MAX, - constants::SreFlag::UNICODE, - &code, - ); - state = state.pymatch(); - assert!(state.has_matched == Some(true)); -} diff --git a/tests/lookbehind.py b/tests/lookbehind.py new file mode 100644 index 0000000000..3da6425959 --- /dev/null +++ b/tests/lookbehind.py @@ -0,0 +1 @@ +re.compile(r'(?( + &self, + string: impl Into>, + range: std::ops::Range, + ) -> engine::State<'a> { + engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + } +} + +#[test] +fn test_2427() { + // r'(? 
Date: Wed, 3 Feb 2021 13:32:48 +0200 Subject: [PATCH 037/705] fix OpAssert positive lookbehind --- src/engine.rs | 28 ++++++++++++++++++++++++---- tests/positive_lookbehind.py | 1 + tests/positive_lookbehind.re | 2 ++ tests/tests.rs | 9 +++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 tests/positive_lookbehind.py create mode 100644 tests/positive_lookbehind.re diff --git a/src/engine.rs b/src/engine.rs index a48b799e1b..5e0e0f4208 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -490,14 +490,24 @@ impl OpcodeDispatcher { SreOpcode::ASSERT => twice( |drive| { let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position - drive.state.start; + let passed = drive.ctx().string_position; if passed < back { drive.ctx_mut().has_matched = Some(false); return None; } + let back_offset = drive + .state + .string + .back_offset(drive.ctx().string_offset, back); + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); - drive.state.context_stack.last_mut().unwrap().toplevel = false; + let child_ctx = drive.state.context_stack.last_mut().unwrap(); + child_ctx.toplevel = false; + child_ctx.string_position -= back; + child_ctx.string_offset = back_offset; + Some(()) }, |drive| { @@ -512,14 +522,24 @@ impl OpcodeDispatcher { SreOpcode::ASSERT_NOT => twice( |drive| { let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position - drive.state.start; + let passed = drive.ctx().string_position; if passed < back { drive.skip_code(drive.peek_code(1) as usize + 1); return None; } + let back_offset = drive + .state + .string + .back_offset(drive.ctx().string_offset, back); + drive.state.string_position = drive.ctx().string_position - back; + drive.push_new_context(3); - drive.state.context_stack.last_mut().unwrap().toplevel = false; + let child_ctx = drive.state.context_stack.last_mut().unwrap(); + child_ctx.toplevel = false; + child_ctx.string_position -= back; + 
child_ctx.string_offset = back_offset; + Some(()) }, |drive| { diff --git a/tests/positive_lookbehind.py b/tests/positive_lookbehind.py new file mode 100644 index 0000000000..2a0ab29253 --- /dev/null +++ b/tests/positive_lookbehind.py @@ -0,0 +1 @@ +re.compile(r'(?<=abc)def') \ No newline at end of file diff --git a/tests/positive_lookbehind.re b/tests/positive_lookbehind.re new file mode 100644 index 0000000000..68923b58ee --- /dev/null +++ b/tests/positive_lookbehind.re @@ -0,0 +1,2 @@ +// '(?<=abc)def', flags=re.UNICODE +Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1], flags: SreFlag::from_bits_truncate(32) } \ No newline at end of file diff --git a/tests/tests.rs b/tests/tests.rs index 4db110177d..8a18b5f333 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -24,3 +24,12 @@ fn test_2427() { state = state.pymatch(); assert!(state.has_matched == Some(true)); } + +#[test] +fn test_assert() { + // '(?<=abc)def', flags=re.UNICODE + let pattern = include!("positive_lookbehind.re"); + let mut state = pattern.state("abcdef", 0..usize::MAX); + state = state.search(); + assert!(state.has_matched == Some(true)); +} From 2a43d66e11a4a2245c12b24484b14eb01d7033a1 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Thu, 1 Apr 2021 21:38:14 -0500 Subject: [PATCH 038/705] Fix clippy lint --- src/constants.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/constants.rs b/src/constants.rs index f5ab92c531..f3962b339a 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -16,7 +16,7 @@ use bitflags::bitflags; pub const SRE_MAGIC: usize = 20171005; #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] -#[allow(non_camel_case_types)] +#[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreOpcode { FAILURE = 0, SUCCESS = 1, @@ -62,7 +62,7 @@ pub enum SreOpcode { } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] 
-#[allow(non_camel_case_types)] +#[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreAtCode { BEGINNING = 0, BEGINNING_LINE = 1, @@ -79,7 +79,7 @@ pub enum SreAtCode { } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] -#[allow(non_camel_case_types)] +#[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreCatCode { DIGIT = 0, NOT_DIGIT = 1, From ca1346ee031624b37e6e5d531a1c5dcaa5b284fc Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Thu, 1 Apr 2021 22:05:45 -0500 Subject: [PATCH 039/705] Have generate_tests.py generate Patterns inline in tests.rs --- generate_tests.py | 21 +++++++++++++++------ tests/lookbehind.py | 1 - tests/lookbehind.re | 2 -- tests/positive_lookbehind.py | 1 - tests/positive_lookbehind.re | 2 -- tests/tests.rs | 16 ++++++++++------ 6 files changed, 25 insertions(+), 18 deletions(-) delete mode 100644 tests/lookbehind.py delete mode 100644 tests/lookbehind.re delete mode 100644 tests/positive_lookbehind.py delete mode 100644 tests/positive_lookbehind.re diff --git a/generate_tests.py b/generate_tests.py index 49a24792be..7af1d2f0c2 100644 --- a/generate_tests.py +++ b/generate_tests.py @@ -26,12 +26,21 @@ def compile(cls, pattern, flags=0): for k, v in re.RegexFlag.__members__.items(): setattr(CompiledPattern, k, v) + +# matches `// pattern {varname} = re.compile(...)` +pattern_pattern = re.compile(r"^((\s*)\/\/\s*pattern\s+(\w+)\s+=\s+(.+?))$(?:.+?END GENERATED)?", re.M | re.S) +def replace_compiled(m): + line, indent, varname, pattern = m.groups() + pattern = eval(pattern, {"re": CompiledPattern}) + pattern = f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" + return f'''{line} +{indent}// START GENERATED by generate_tests.py +{indent}#[rustfmt::skip] let {varname} = {pattern}; +{indent}// END GENERATED''' + with os.scandir("tests") as d: for f in d: path = Path(f.path) - if path.suffix == ".py": - 
pattern = eval(path.read_text(), {"re": CompiledPattern}) - path.with_suffix(".re").write_text( - f"// {pattern.pattern!r}, flags={pattern.flags!r}\n" - f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" - ) + if path.suffix == ".rs": + replaced = pattern_pattern.sub(replace_compiled, path.read_text()) + path.write_text(replaced) diff --git a/tests/lookbehind.py b/tests/lookbehind.py deleted file mode 100644 index 3da6425959..0000000000 --- a/tests/lookbehind.py +++ /dev/null @@ -1 +0,0 @@ -re.compile(r'(? Date: Mon, 5 Apr 2021 11:10:32 -0500 Subject: [PATCH 040/705] Add more info to Cargo.toml --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 03db7aba4f..8e69ab5235 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,8 +2,11 @@ name = "sre-engine" version = "0.1.0" authors = ["Kangzhi Shi ", "RustPython Team"] +description = "A low-level implementation of Python's SRE regex engine" +repository = "https://github.com/RustPython/sre-engine" license = "MIT" edition = "2018" +keywords = ["regex"] [dependencies] num_enum = "0.5" From 9728dd8699a255686deaedbdd0f23f549e62009f Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 16 Apr 2021 09:35:11 +0200 Subject: [PATCH 041/705] fix test_string_boundaries --- src/engine.rs | 14 +++++++++++--- tests/tests.rs | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 5e0e0f4208..0de9f43844 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -292,6 +292,14 @@ trait MatchContextDrive { let this = !self.at_end() && word_checker(self.peek_char()); this != that } + fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { + if self.at_beginning() && self.at_end() { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char()); + let this = !self.at_end() && word_checker(self.peek_char()); + this == that + } fn back_peek_char(&self) -> u32 { 
self.state().string.back_peek(self.ctx().string_offset) } @@ -738,14 +746,14 @@ fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), SreAtCode::BOUNDARY => drive.at_boundary(is_word), - SreAtCode::NON_BOUNDARY => !drive.at_boundary(is_word), + SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), SreAtCode::END_STRING => drive.at_end(), SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), - SreAtCode::LOC_NON_BOUNDARY => !drive.at_boundary(is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => !drive.at_boundary(is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), } } diff --git a/tests/tests.rs b/tests/tests.rs index f4cd091f0d..690c72861b 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -37,3 +37,14 @@ fn test_assert() { state = state.search(); assert!(state.has_matched == Some(true)); } + +#[test] +fn test_string_boundaries() { + // pattern big_b = re.compile(r'\B') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = big_b.state("", 0..usize::MAX); + state = state.search(); + assert!(state.has_matched == None) +} From d2b48fdea2986e8513d63d19ea30859c5a48a66e Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Fri, 16 Apr 2021 10:40:42 -0500 Subject: [PATCH 042/705] Release 0.1.1 sre-engine@0.1.1 Generated by cargo-workspaces --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/Cargo.toml b/Cargo.toml index 8e69ab5235..cf830a6053 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.1.0" +version = "0.1.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From 73abbace85aebf063eb8c7ce4815cacd2449f522 Mon Sep 17 00:00:00 2001 From: Noah <33094578+coolreader18@users.noreply.github.com> Date: Fri, 16 Apr 2021 10:53:37 -0500 Subject: [PATCH 043/705] Add explicit include for Cargo files --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index cf830a6053..f0cd628f0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ repository = "https://github.com/RustPython/sre-engine" license = "MIT" edition = "2018" keywords = ["regex"] +include = ["LICENSE", "src/**/*.rs"] [dependencies] num_enum = "0.5" From df8453d387c0a4bc6b84131f40703b46000ba7ee Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 20 Apr 2021 10:19:27 +0200 Subject: [PATCH 044/705] fix zerowidth search --- Cargo.toml | 2 +- src/engine.rs | 99 +++++++++++++++++++++++++++++++++++--------------- tests/tests.rs | 18 +++++++-- 3 files changed, 85 insertions(+), 34 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f0cd628f0e..614243eeb1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.1.1" +version = "0.1.2" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs index 0de9f43844..bded52448d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -23,8 +23,9 @@ pub struct State<'a> { repeat_stack: Vec, pub string_position: usize, popped_context: Option, - pub has_matched: Option, + pub has_matched: bool, pub match_all: bool, + pub must_advance: bool, } impl<'a> 
State<'a> { @@ -50,8 +51,9 @@ impl<'a> State<'a> { marks: Vec::new(), string_position: start, popped_context: None, - has_matched: None, + has_matched: false, match_all: false, + must_advance: false, } } @@ -63,7 +65,7 @@ impl<'a> State<'a> { self.marks.clear(); self.string_position = self.start; self.popped_context = None; - self.has_matched = None; + self.has_matched = false; } fn set_mark(&mut self, mark_nr: usize, position: usize) { @@ -100,17 +102,7 @@ impl<'a> State<'a> { self.marks_stack.pop(); } - pub fn pymatch(mut self) -> Self { - let ctx = MatchContext { - string_position: self.start, - string_offset: self.string.offset(0, self.start), - code_position: 0, - has_matched: None, - toplevel: true, - }; - self.context_stack.push(ctx); - - let mut dispatcher = OpcodeDispatcher::new(); + fn _match(mut self, dispatcher: &mut OpcodeDispatcher) -> Self { let mut has_matched = None; loop { @@ -127,21 +119,58 @@ impl<'a> State<'a> { } } - self.has_matched = has_matched; + self.has_matched = has_matched == Some(true); self } + pub fn pymatch(mut self) -> Self { + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + toplevel: true, + }; + self.context_stack.push(ctx); + + let mut dispatcher = OpcodeDispatcher::new(); + + self._match(&mut dispatcher) + } + pub fn search(mut self) -> Self { // TODO: optimize by op info and skip prefix - while self.start <= self.end { - self.match_all = false; - self = self.pymatch(); - if self.has_matched == Some(true) { - return self; - } + if self.start > self.end { + return self; + } + + let mut dispatcher = OpcodeDispatcher::new(); + + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + toplevel: true, + }; + self.context_stack.push(ctx); + self = self._match(&mut dispatcher); + + self.must_advance = false; + while !self.has_matched && 
self.start < self.end { self.start += 1; self.reset(); + dispatcher.clear(); + let ctx = MatchContext { + string_position: self.start, + string_offset: self.string.offset(0, self.start), + code_position: 0, + has_matched: None, + toplevel: false, + }; + self.context_stack.push(ctx); + self = self._match(&mut dispatcher); } self @@ -310,6 +339,18 @@ trait MatchContextDrive { .string .back_offset(self.ctx().string_offset, skip_count); } + fn can_success(&self) -> bool { + if !self.ctx().toplevel { + return true; + } + if self.state().match_all && !self.at_end() { + return false; + } + if self.state().must_advance && self.ctx().string_position == self.state().start { + return false; + } + true + } } struct StackDrive<'a> { @@ -429,6 +470,9 @@ impl OpcodeDispatcher { executing_contexts: HashMap::new(), } } + fn clear(&mut self) { + self.executing_contexts.clear(); + } // Returns True if the current context matches, False if it doesn't and // None if matching is not finished, ie must be resumed after child // contexts have been matched. @@ -470,11 +514,9 @@ impl OpcodeDispatcher { drive.ctx_mut().has_matched = Some(false); }), SreOpcode::SUCCESS => once(|drive| { - if drive.ctx().toplevel && drive.state.match_all && !drive.at_end() { - drive.ctx_mut().has_matched = Some(false); - } else { + drive.ctx_mut().has_matched = Some(drive.can_success()); + if drive.ctx().has_matched == Some(true) { drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); } }), SreOpcode::ANY => once(|drive| { @@ -1152,9 +1194,7 @@ impl OpcodeExecutor for OpMinRepeatOne { }; let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 - && !(drive.ctx().toplevel && drive.state.match_all && !drive.at_end()) - { + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { // tail is empty. 
we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); @@ -1455,8 +1495,7 @@ impl OpcodeExecutor for OpRepeatOne { } let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.at_end() && !drive.ctx().toplevel - { + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { // tail is empty. we're finished drive.state.string_position = drive.ctx().string_position; drive.ctx_mut().has_matched = Some(true); diff --git a/tests/tests.rs b/tests/tests.rs index 690c72861b..d76ff3cfb5 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -24,7 +24,7 @@ fn test_2427() { // END GENERATED let mut state = lookbehind.state("x", 0..usize::MAX); state = state.pymatch(); - assert!(state.has_matched == Some(true)); + assert!(state.has_matched); } #[test] @@ -35,7 +35,7 @@ fn test_assert() { // END GENERATED let mut state = positive_lookbehind.state("abcdef", 0..usize::MAX); state = state.search(); - assert!(state.has_matched == Some(true)); + assert!(state.has_matched); } #[test] @@ -46,5 +46,17 @@ fn test_string_boundaries() { // END GENERATED let mut state = big_b.state("", 0..usize::MAX); state = state.search(); - assert!(state.has_matched == None) + assert!(!state.has_matched); +} + +#[test] +fn test_zerowidth() { + // pattern p = re.compile(r'\b|:+') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 7, 5, 6, 10, 16, 12, 10, 25, 6, 1, 4294967295, 17, 58, 1, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = p.state("a:", 0..usize::MAX); + state.must_advance = true; + state = state.search(); + assert!(state.string_position == 1); } From a3c3573d67f94d6119a8bb7126f385c38ba438e8 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 20 Apr 2021 17:36:32 +0200 Subject: [PATCH 045/705] optimize count --- src/engine.rs | 29 +++++++---------------------- 1 
file changed, 7 insertions(+), 22 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index bded52448d..5409baf66d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -916,7 +916,7 @@ fn charset(set: &[u32], ch: u32) -> bool { } /* General case */ -fn count(drive: &mut StackDrive, maxcount: usize) -> usize { +fn general_count(drive: &mut StackDrive, maxcount: usize) -> usize { let mut count = 0; let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); @@ -937,18 +937,11 @@ fn count(drive: &mut StackDrive, maxcount: usize) -> usize { count } -/* TODO: check literal cases should improve the perfermance - -fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { +fn count(stack_drive: &mut StackDrive, maxcount: usize) -> usize { let mut drive = WrapDrive::drive(*stack_drive.ctx(), stack_drive); let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); let end = drive.ctx().string_position + maxcount; - let opcode = match SreOpcode::try_from(drive.peek_code(1)) { - Ok(code) => code, - Err(_) => { - panic!("FIXME:COUNT1"); - } - }; + let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); match opcode { SreOpcode::ANY => { @@ -960,7 +953,6 @@ fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { drive.skip_char(maxcount); } SreOpcode::IN => { - // TODO: pattern[2 or 1..]? 
while !drive.ctx().string_position < end && charset(&drive.pattern()[2..], drive.peek_char()) { @@ -992,7 +984,7 @@ fn _count(stack_drive: &StackDrive, maxcount: usize) -> usize { general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32); } _ => { - todo!("repeated single character pattern?"); + return general_count(stack_drive, maxcount); } } @@ -1006,11 +998,6 @@ fn general_count_literal bool>(drive: &mut WrapDrive, end: } } -fn eq_loc_ignore(code: u32, ch: u32) -> bool { - code == ch || code == lower_locate(ch) || code == upper_locate(ch) -} -*/ - fn is_word(ch: u32) -> bool { ch == '_' as u32 || u8::try_from(ch) @@ -1028,7 +1015,7 @@ fn is_digit(ch: u32) -> bool { .unwrap_or(false) } fn is_loc_alnum(ch: u32) -> bool { - // TODO: check with cpython + // FIXME: Ignore the locales u8::try_from(ch) .map(|x| x.is_ascii_alphanumeric()) .unwrap_or(false) @@ -1045,13 +1032,11 @@ pub fn lower_ascii(ch: u32) -> u32 { .unwrap_or(ch) } fn lower_locate(ch: u32) -> u32 { - // TODO: check with cpython - // https://doc.rust-lang.org/std/primitive.char.html#method.to_lowercase + // FIXME: Ignore the locales lower_ascii(ch) } fn upper_locate(ch: u32) -> u32 { - // TODO: check with cpython - // https://doc.rust-lang.org/std/primitive.char.html#method.to_uppercase + // FIXME: Ignore the locales u8::try_from(ch) .map(|x| x.to_ascii_uppercase() as u32) .unwrap_or(ch) From 5bd6b672d089fa4dc8db1d5d3d8564f6a98dbacd Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 21 Apr 2021 11:09:10 +0200 Subject: [PATCH 046/705] optimize opcode that execute only once --- src/engine.rs | 183 ++++++++++++++++++++++++++++---------------------- 1 file changed, 102 insertions(+), 81 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 5409baf66d..2888b6930c 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -416,20 +416,6 @@ trait OpcodeExecutor { fn next(&mut self, drive: &mut StackDrive) -> Option<()>; } -struct OpOnce { - f: Option, -} -impl OpcodeExecutor for 
OpOnce { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - let f = self.f.take()?; - f(drive); - None - } -} -fn once(f: F) -> Box> { - Box::new(OpOnce { f: Some(f) }) -} - struct OpTwice { f1: Option, f2: Option, @@ -496,48 +482,58 @@ impl OpcodeDispatcher { // Dispatches a context on a given opcode. Returns True if the context // is done matching, False if it must be resumed when next encountered. fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { - let mut executor = match self.executing_contexts.remove_entry(&drive.id()) { - Some((_, executor)) => executor, - None => self.dispatch_table(opcode), - }; - if let Some(()) = executor.next(drive) { - self.executing_contexts.insert(drive.id(), executor); - false - } else { - true + let executor = self + .executing_contexts + .remove_entry(&drive.id()) + .map(|(_, x)| x) + .or_else(|| self.dispatch_table(opcode, drive)); + if let Some(mut executor) = executor { + if let Some(()) = executor.next(drive) { + self.executing_contexts.insert(drive.id(), executor); + return false; + } } + true } - fn dispatch_table(&mut self, opcode: SreOpcode) -> Box { + fn dispatch_table( + &mut self, + opcode: SreOpcode, + drive: &mut StackDrive, + ) -> Option> { match opcode { - SreOpcode::FAILURE => once(|drive| { + SreOpcode::FAILURE => { drive.ctx_mut().has_matched = Some(false); - }), - SreOpcode::SUCCESS => once(|drive| { + None + } + SreOpcode::SUCCESS => { drive.ctx_mut().has_matched = Some(drive.can_success()); if drive.ctx().has_matched == Some(true) { drive.state.string_position = drive.ctx().string_position; } - }), - SreOpcode::ANY => once(|drive| { + None + } + SreOpcode::ANY => { if drive.at_end() || drive.at_linebreak() { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(1); drive.skip_char(1); } - }), - SreOpcode::ANY_ALL => once(|drive| { + None + } + SreOpcode::ANY_ALL => { if drive.at_end() { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(1); 
drive.skip_char(1); } - }), + None + } /* assert subpattern */ /* */ - SreOpcode::ASSERT => twice( + SreOpcode::ASSERT => Some(twice( |drive| { let back = drive.peek_code(2) as usize; let passed = drive.ctx().string_position; @@ -568,8 +564,8 @@ impl OpcodeDispatcher { drive.ctx_mut().has_matched = Some(false); } }, - ), - SreOpcode::ASSERT_NOT => twice( + )), + SreOpcode::ASSERT_NOT => Some(twice( |drive| { let back = drive.peek_code(2) as usize; let passed = drive.ctx().string_position; @@ -600,17 +596,18 @@ impl OpcodeDispatcher { drive.skip_code(drive.peek_code(1) as usize + 1); } }, - ), - SreOpcode::AT => once(|drive| { + )), + SreOpcode::AT => { let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); if !at(drive, atcode) { drive.ctx_mut().has_matched = Some(false); } else { drive.skip_code(2); } - }), - SreOpcode::BRANCH => Box::new(OpBranch::default()), - SreOpcode::CATEGORY => once(|drive| { + None + } + SreOpcode::BRANCH => Some(Box::new(OpBranch::default())), + SreOpcode::CATEGORY => { let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); if drive.at_end() || !category(catcode, drive.peek_char()) { drive.ctx_mut().has_matched = Some(false); @@ -618,53 +615,68 @@ impl OpcodeDispatcher { drive.skip_code(2); drive.skip_char(1); } - }), - SreOpcode::IN => once(|drive| { + None + } + SreOpcode::IN => { general_op_in(drive, |set, c| charset(set, c)); - }), - SreOpcode::IN_IGNORE => once(|drive| { + None + } + SreOpcode::IN_IGNORE => { general_op_in(drive, |set, c| charset(set, lower_ascii(c))); - }), - SreOpcode::IN_UNI_IGNORE => once(|drive| { + None + } + SreOpcode::IN_UNI_IGNORE => { general_op_in(drive, |set, c| charset(set, lower_unicode(c))); - }), - SreOpcode::IN_LOC_IGNORE => once(|drive| { + None + } + SreOpcode::IN_LOC_IGNORE => { general_op_in(drive, |set, c| charset_loc_ignore(set, c)); - }), - SreOpcode::INFO | SreOpcode::JUMP => once(|drive| { + None + } + SreOpcode::INFO | SreOpcode::JUMP => { 
drive.skip_code(drive.peek_code(1) as usize + 1); - }), - SreOpcode::LITERAL => once(|drive| { + None + } + SreOpcode::LITERAL => { general_op_literal(drive, |code, c| code == c); - }), - SreOpcode::NOT_LITERAL => once(|drive| { + None + } + SreOpcode::NOT_LITERAL => { general_op_literal(drive, |code, c| code != c); - }), - SreOpcode::LITERAL_IGNORE => once(|drive| { + None + } + SreOpcode::LITERAL_IGNORE => { general_op_literal(drive, |code, c| code == lower_ascii(c)); - }), - SreOpcode::NOT_LITERAL_IGNORE => once(|drive| { + None + } + SreOpcode::NOT_LITERAL_IGNORE => { general_op_literal(drive, |code, c| code != lower_ascii(c)); - }), - SreOpcode::LITERAL_UNI_IGNORE => once(|drive| { + None + } + SreOpcode::LITERAL_UNI_IGNORE => { general_op_literal(drive, |code, c| code == lower_unicode(c)); - }), - SreOpcode::NOT_LITERAL_UNI_IGNORE => once(|drive| { + None + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { general_op_literal(drive, |code, c| code != lower_unicode(c)); - }), - SreOpcode::LITERAL_LOC_IGNORE => once(|drive| { + None + } + SreOpcode::LITERAL_LOC_IGNORE => { general_op_literal(drive, char_loc_ignore); - }), - SreOpcode::NOT_LITERAL_LOC_IGNORE => once(|drive| { + None + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); - }), - SreOpcode::MARK => once(|drive| { + None + } + SreOpcode::MARK => { drive .state .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); drive.skip_code(2); - }), - SreOpcode::REPEAT => twice( + None + } + SreOpcode::REPEAT => Some(twice( // create repeat context. 
all the hard work is done by the UNTIL // operator (MAX_UNTIL, MIN_UNTIL) // <1=min> <2=max> item tail @@ -687,20 +699,28 @@ impl OpcodeDispatcher { let child_ctx = drive.state.popped_context.unwrap(); drive.ctx_mut().has_matched = child_ctx.has_matched; }, - ), - SreOpcode::MAX_UNTIL => Box::new(OpMaxUntil::default()), - SreOpcode::MIN_UNTIL => Box::new(OpMinUntil::default()), - SreOpcode::REPEAT_ONE => Box::new(OpRepeatOne::default()), - SreOpcode::MIN_REPEAT_ONE => Box::new(OpMinRepeatOne::default()), - SreOpcode::GROUPREF => once(|drive| general_op_groupref(drive, |x| x)), - SreOpcode::GROUPREF_IGNORE => once(|drive| general_op_groupref(drive, lower_ascii)), + )), + SreOpcode::MAX_UNTIL => Some(Box::new(OpMaxUntil::default())), + SreOpcode::MIN_UNTIL => Some(Box::new(OpMinUntil::default())), + SreOpcode::REPEAT_ONE => Some(Box::new(OpRepeatOne::default())), + SreOpcode::MIN_REPEAT_ONE => Some(Box::new(OpMinRepeatOne::default())), + SreOpcode::GROUPREF => { + general_op_groupref(drive, |x| x); + None + } + SreOpcode::GROUPREF_IGNORE => { + general_op_groupref(drive, lower_ascii); + None + } SreOpcode::GROUPREF_LOC_IGNORE => { - once(|drive| general_op_groupref(drive, lower_locate)) + general_op_groupref(drive, lower_locate); + None } SreOpcode::GROUPREF_UNI_IGNORE => { - once(|drive| general_op_groupref(drive, lower_unicode)) + general_op_groupref(drive, lower_unicode); + None } - SreOpcode::GROUPREF_EXISTS => once(|drive| { + SreOpcode::GROUPREF_EXISTS => { let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); match (group_start, group_end) { (Some(start), Some(end)) if start <= end => { @@ -708,7 +728,8 @@ impl OpcodeDispatcher { } _ => drive.skip_code(drive.peek_code(2) as usize + 1), } - }), + None + } _ => { // TODO python expcetion? 
unreachable!("unexpected opcode") From 7324feef89dd692d909c98849e54969617728ecf Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 21 Apr 2021 11:13:58 +0200 Subject: [PATCH 047/705] optimize search cache the string offset --- src/engine.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 2888b6930c..2b85ea3514 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -147,9 +147,11 @@ impl<'a> State<'a> { let mut dispatcher = OpcodeDispatcher::new(); + let mut start_offset = self.string.offset(0, self.start); + let ctx = MatchContext { string_position: self.start, - string_offset: self.string.offset(0, self.start), + string_offset: start_offset, code_position: 0, has_matched: None, toplevel: true, @@ -160,11 +162,12 @@ impl<'a> State<'a> { self.must_advance = false; while !self.has_matched && self.start < self.end { self.start += 1; + start_offset = self.string.offset(start_offset, 1); self.reset(); dispatcher.clear(); let ctx = MatchContext { string_position: self.start, - string_offset: self.string.offset(0, self.start), + string_offset: start_offset, code_position: 0, has_matched: None, toplevel: false, From 86435b8a4b44d0b79109ace6acd66cbfcebaac66 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 22 Apr 2021 17:15:03 +0200 Subject: [PATCH 048/705] add benchmark --- benches/benches.rs | 112 +++++++++++++++++++++++++++++++++++++++++++++ generate_tests.py | 5 +- 2 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 benches/benches.rs diff --git a/benches/benches.rs b/benches/benches.rs new file mode 100644 index 0000000000..b86a592967 --- /dev/null +++ b/benches/benches.rs @@ -0,0 +1,112 @@ +#![feature(test)] + +extern crate test; +use test::Bencher; + +use sre_engine::constants::SreFlag; +use sre_engine::engine; +pub struct Pattern { + pub code: &'static [u32], + pub flags: SreFlag, +} + +impl Pattern { + pub fn state<'a>( + &self, + string: impl Into>, + range: std::ops::Range, + ) -> 
engine::State<'a> { + engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + } +} +#[bench] +fn benchmarks(b: &mut Bencher) { + // # test common prefix + // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 
110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('Python') #, 'Python'), # Simple text literal + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('.*Python') #, 'Python'), # Bad text literal + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('.*Python.*') #, 'Python'), # Worse text literal + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1], flags: 
SreFlag::from_bits_truncate(32) }; + // END GENERATED + // pattern pn = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + + let tests = [ + (p1, "Perl"), + (p2, "Perl"), + (p3, "Perl"), + (p4, "Perl"), + (p5, "PythonPython"), + (p6, "a5,b7,c9,"), + (p7, "a5,b7,c9,"), + (p8, "Python"), + (p9, "Python"), + (p10, "Python"), + (p11, "Python"), + ]; + + b.iter(move || { + for (p, s) in &tests { + let mut state = p.state(s.clone(), 0..usize::MAX); + state = state.search(); + assert!(state.has_matched); + state = p.state(s.clone(), 0..usize::MAX); + state = state.pymatch(); + assert!(state.has_matched); + state = p.state(s.clone(), 0..usize::MAX); + state.match_all = true; + state = state.pymatch(); + assert!(state.has_matched); + let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); + state = p.state(s2.as_str(), 0..usize::MAX); + state = state.search(); + assert!(state.has_matched); + state = p.state(s2.as_str(), 10000..usize::MAX); + state = state.pymatch(); + assert!(state.has_matched); + state = p.state(s2.as_str(), 10000..10000 + s.len()); + state = state.pymatch(); + assert!(state.has_matched); + state = p.state(s2.as_str(), 10000..10000 + s.len()); + state.match_all = true; + state = state.pymatch(); + assert!(state.has_matched); + } + }) +} diff --git a/generate_tests.py b/generate_tests.py index 7af1d2f0c2..b432720cd1 100644 --- a/generate_tests.py +++ b/generate_tests.py @@ -5,6 +5,7 @@ import sre_compile import sre_parse import json +from itertools import chain m = re.search(r"const SRE_MAGIC: usize = (\d+);", open("src/constants.rs").read()) sre_engine_magic = int(m.group(1)) @@ -38,8 +39,8 @@ def replace_compiled(m): {indent}#[rustfmt::skip] let 
{varname} = {pattern}; {indent}// END GENERATED''' -with os.scandir("tests") as d: - for f in d: +with os.scandir("tests") as t, os.scandir("benches") as b: + for f in chain(t, b): path = Path(f.path) if path.suffix == ".rs": replaced = pattern_pattern.sub(replace_compiled, path.read_text()) From 58981a41e99cbcb53002b559ea13c7131b597938 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 22 Apr 2021 19:27:24 +0200 Subject: [PATCH 049/705] optimize; replace hashmap with btreemap --- src/engine.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 2b85ea3514..3837974838 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2,7 +2,7 @@ use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use super::MAXREPEAT; -use std::collections::HashMap; +use std::collections::BTreeMap; use std::convert::TryFrom; const fn is_py_ascii_whitespace(b: u8) -> bool { @@ -204,7 +204,7 @@ impl<'a> StrDrive<'a> { .get(offset..) .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) .unwrap_or_else(|| s.len()), - StrDrive::Bytes(b) => std::cmp::min(offset + skip, b.len()), + StrDrive::Bytes(_) => offset + skip, } } @@ -294,8 +294,7 @@ trait MatchContextDrive { .state() .string .offset(self.ctx().string_offset, skip_count); - self.ctx_mut().string_position = - std::cmp::min(self.ctx().string_position + skip_count, self.state().end); + self.ctx_mut().string_position += skip_count; } fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; @@ -451,12 +450,12 @@ where } struct OpcodeDispatcher { - executing_contexts: HashMap>, + executing_contexts: BTreeMap>, } impl OpcodeDispatcher { fn new() -> Self { Self { - executing_contexts: HashMap::new(), + executing_contexts: BTreeMap::new(), } } fn clear(&mut self) { @@ -487,8 +486,7 @@ impl OpcodeDispatcher { fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { let executor = self .executing_contexts - 
.remove_entry(&drive.id()) - .map(|(_, x)| x) + .remove(&drive.id()) .or_else(|| self.dispatch_table(opcode, drive)); if let Some(mut executor) = executor { if let Some(()) = executor.next(drive) { From 74ebdaf4e8a50330ea6195333bcb47e086ff3b5e Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 11 Jul 2022 21:30:48 +0200 Subject: [PATCH 050/705] fix panic OpMinUntil return before restore repeat --- .vscode/launch.json | 21 +++++++++++++++++++++ src/engine.rs | 10 ++++++---- tests/tests.rs | 11 +++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000000..5ebfe34f05 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,21 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug Unit Test", + "cargo": { + "args": [ + "test", + "--no-run" + ], + "filter": { + "kind": "test" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + } + ] +} \ No newline at end of file diff --git a/src/engine.rs b/src/engine.rs index 3837974838..d4e036ff32 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -14,7 +14,7 @@ pub struct State<'a> { pub string: StrDrive<'a>, pub start: usize, pub end: usize, - flags: SreFlag, + _flags: SreFlag, pattern_codes: &'a [u32], pub marks: Vec>, pub lastindex: isize, @@ -42,7 +42,7 @@ impl<'a> State<'a> { string, start, end, - flags, + _flags: flags, pattern_codes, lastindex: -1, marks_stack: Vec::new(), @@ -1380,16 +1380,18 @@ impl OpcodeExecutor for OpMinUntil { None } 2 => { + // restore repeat before return + drive.state.repeat_stack.push(self.save_repeat.unwrap()); + let child_ctx = drive.state.popped_context.unwrap(); if child_ctx.has_matched == Some(true) { drive.ctx_mut().has_matched = Some(true); return None; } - drive.state.repeat_stack.push(self.save_repeat.unwrap()); drive.state.string_position = drive.ctx().string_position; drive.state.marks_pop(); - // match 
more unital tail matches + // match more until tail matches let RepeatContext { count: _, code_position, diff --git a/tests/tests.rs b/tests/tests.rs index d76ff3cfb5..b430947a9b 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -60,3 +60,14 @@ fn test_zerowidth() { state = state.search(); assert!(state.string_position == 1); } + +#[test] +fn test_repeat_context_panic() { + // pattern p = re.compile(r'(?:a*?(xx)??z)*') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = p.state("axxzaz", 0..usize::MAX); + state = state.pymatch(); + assert!(state.marks == vec![Some(1), Some(3)]); +} \ No newline at end of file From 919e1d7933b62bba0830b5b8dce63c1dfc758056 Mon Sep 17 00:00:00 2001 From: Steve Shi Date: Tue, 26 Jul 2022 20:38:03 +0200 Subject: [PATCH 051/705] Refactor and fix multiple max_until recusion (#10) * wip refactor engine * wip 2 refactor engine * wip 3 refactor engine * wip 3 refactor engine * wip 4 refactor engine * wip 5 refactor engine * refactor seperate Stacks * fix clippy * fix pymatch and search restore _stacks * fix toplevel * fix marks panic * fix double max_until repeat context * clearup * update version to 0.2.0 --- Cargo.toml | 2 +- src/engine.rs | 1700 ++++++++++++++++++++++++------------------------ src/lib.rs | 2 +- tests/tests.rs | 13 +- 4 files changed, 847 insertions(+), 870 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 614243eeb1..6ba3996947 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.1.2" +version = "0.2.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs 
b/src/engine.rs index d4e036ff32..81903ccfdd 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2,7 +2,6 @@ use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; use super::MAXREPEAT; -use std::collections::BTreeMap; use std::convert::TryFrom; const fn is_py_ascii_whitespace(b: u8) -> bool { @@ -20,7 +19,7 @@ pub struct State<'a> { pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec, - repeat_stack: Vec, + _stacks: Option>, pub string_position: usize, popped_context: Option, pub has_matched: bool, @@ -44,11 +43,11 @@ impl<'a> State<'a> { end, _flags: flags, pattern_codes, + marks: Vec::new(), lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - repeat_stack: Vec::new(), - marks: Vec::new(), + _stacks: Default::default(), string_position: start, popped_context: None, has_matched: false, @@ -59,10 +58,12 @@ impl<'a> State<'a> { pub fn reset(&mut self) { self.lastindex = -1; + self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); - self.repeat_stack.clear(); - self.marks.clear(); + if let Some(x) = self._stacks.as_mut() { + x.clear() + }; self.string_position = self.start; self.popped_context = None; self.has_matched = false; @@ -102,51 +103,71 @@ impl<'a> State<'a> { self.marks_stack.pop(); } - fn _match(mut self, dispatcher: &mut OpcodeDispatcher) -> Self { - let mut has_matched = None; + fn _match(mut self, stacks: &mut Stacks) -> Self { + while let Some(ctx) = self.context_stack.pop() { + let mut drive = StateContext { + state: self, + ctx, + next_ctx: None, + }; - loop { - if self.context_stack.is_empty() { - break; + if let Some(handler) = drive.ctx.handler { + handler(&mut drive, stacks); + } else if drive.remaining_codes() > 0 { + let code = drive.peek_code(0); + let code = SreOpcode::try_from(code).unwrap(); + dispatch(code, &mut drive, stacks); + } else { + drive.failure(); } - let ctx_id = self.context_stack.len() - 1; - let mut drive = StackDrive::drive(ctx_id, self); - has_matched = 
dispatcher.pymatch(&mut drive); - self = drive.take(); - if has_matched.is_some() { - self.popped_context = self.context_stack.pop(); + let StateContext { + mut state, + ctx, + next_ctx, + } = drive; + + if ctx.has_matched.is_some() { + state.popped_context = Some(ctx); + } else { + state.context_stack.push(ctx); + if let Some(next_ctx) = next_ctx { + state.context_stack.push(next_ctx); + } } + self = state } - - self.has_matched = has_matched == Some(true); + self.has_matched = self.popped_context.unwrap().has_matched == Some(true); self } pub fn pymatch(mut self) -> Self { + let mut stacks = self._stacks.take().unwrap_or_default(); + let ctx = MatchContext { string_position: self.start, string_offset: self.string.offset(0, self.start), code_position: 0, has_matched: None, toplevel: true, + handler: None, + repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - let mut dispatcher = OpcodeDispatcher::new(); - - self._match(&mut dispatcher) + self = self._match(&mut stacks); + self._stacks = Some(stacks); + self } pub fn search(mut self) -> Self { + let mut stacks = self._stacks.take().unwrap_or_default(); // TODO: optimize by op info and skip prefix if self.start > self.end { return self; } - let mut dispatcher = OpcodeDispatcher::new(); - let mut start_offset = self.string.offset(0, self.start); let ctx = MatchContext { @@ -155,31 +176,664 @@ impl<'a> State<'a> { code_position: 0, has_matched: None, toplevel: true, + handler: None, + repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut dispatcher); + self = self._match(&mut stacks); self.must_advance = false; while !self.has_matched && self.start < self.end { self.start += 1; start_offset = self.string.offset(start_offset, 1); self.reset(); - dispatcher.clear(); + stacks.clear(); + let ctx = MatchContext { string_position: self.start, string_offset: start_offset, code_position: 0, has_matched: None, toplevel: false, + handler: None, + repeat_ctx_id: usize::MAX, }; 
self.context_stack.push(ctx); - self = self._match(&mut dispatcher); + self = self._match(&mut stacks); } + self._stacks = Some(stacks); self } } +fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { + match opcode { + SreOpcode::FAILURE => { + drive.failure(); + } + SreOpcode::SUCCESS => { + drive.ctx.has_matched = Some(drive.can_success()); + if drive.ctx.has_matched == Some(true) { + drive.state.string_position = drive.ctx.string_position; + } + } + SreOpcode::ANY => { + if drive.at_end() || drive.at_linebreak() { + drive.failure(); + } else { + drive.skip_code(1); + drive.skip_char(1); + } + } + SreOpcode::ANY_ALL => { + if drive.at_end() { + drive.failure(); + } else { + drive.skip_code(1); + drive.skip_char(1); + } + } + SreOpcode::ASSERT => op_assert(drive), + SreOpcode::ASSERT_NOT => op_assert_not(drive), + SreOpcode::AT => { + let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); + if at(drive, atcode) { + drive.skip_code(2); + } else { + drive.failure(); + } + } + SreOpcode::BRANCH => op_branch(drive, stacks), + SreOpcode::CATEGORY => { + let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); + if drive.at_end() || !category(catcode, drive.peek_char()) { + drive.failure(); + } else { + drive.skip_code(2); + drive.skip_char(1); + } + } + SreOpcode::IN => general_op_in(drive, charset), + SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), + SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), + SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), + SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), + SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), + SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), + SreOpcode::NOT_LITERAL_IGNORE => { + general_op_literal(drive, |code, c| code != 
lower_ascii(c)) + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_op_literal(drive, |code, c| code == lower_unicode(c)) + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_op_literal(drive, |code, c| code != lower_unicode(c)) + } + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) + } + SreOpcode::MARK => { + drive + .state + .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); + drive.skip_code(2); + } + SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), + SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), + SreOpcode::REPEAT => op_repeat(drive, stacks), + SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), + SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), + SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), + SreOpcode::GROUPREF_EXISTS => { + let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); + match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => { + drive.skip_code(3); + } + _ => drive.skip_code_from(2), + } + } + _ => unreachable!("unexpected opcode"), + } +} + +/* assert subpattern */ +/* */ +fn op_assert(drive: &mut StateContext) { + let back = drive.peek_code(2) as usize; + + if drive.ctx.string_position < back { + return drive.failure(); + } + + let offset = drive + .state + .string + .back_offset(drive.ctx.string_offset, back); + let position = drive.ctx.string_position - back; + + drive.state.string_position = position; + + let next_ctx = drive.next_ctx(3, |drive, _| { + if drive.popped_ctx().has_matched == Some(true) { + drive.ctx.handler = None; + drive.skip_code_from(1); + } else { + 
drive.failure(); + } + }); + next_ctx.string_position = position; + next_ctx.string_offset = offset; + next_ctx.toplevel = false; +} + +/* assert not subpattern */ +/* */ +fn op_assert_not(drive: &mut StateContext) { + let back = drive.peek_code(2) as usize; + + if drive.ctx.string_position < back { + return drive.skip_code_from(1); + } + + let offset = drive + .state + .string + .back_offset(drive.ctx.string_offset, back); + let position = drive.ctx.string_position - back; + + drive.state.string_position = position; + + let next_ctx = drive.next_ctx(3, |drive, _| { + if drive.popped_ctx().has_matched == Some(true) { + drive.failure(); + } else { + drive.ctx.handler = None; + drive.skip_code_from(1); + } + }); + next_ctx.string_position = position; + next_ctx.string_offset = offset; + next_ctx.toplevel = false; +} + +#[derive(Debug)] +struct BranchContext { + branch_offset: usize, +} + +// alternation +// <0=skip> code ... +fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { + drive.state.marks_push(); + stacks.branch.push(BranchContext { branch_offset: 1 }); + create_context(drive, stacks); + + fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { + let branch_offset = stacks.branch_last().branch_offset; + let next_length = drive.peek_code(branch_offset) as usize; + if next_length == 0 { + drive.state.marks_pop_discard(); + stacks.branch.pop(); + return drive.failure(); + } + + drive.sync_string_position(); + + stacks.branch_last().branch_offset += next_length; + drive.next_ctx(branch_offset + 1, callback); + } + + fn callback(drive: &mut StateContext, stacks: &mut Stacks) { + if drive.popped_ctx().has_matched == Some(true) { + stacks.branch.pop(); + return drive.success(); + } + drive.state.marks_pop_keep(); + drive.ctx.handler = Some(create_context) + } +} + +#[derive(Debug, Copy, Clone)] +struct MinRepeatOneContext { + count: usize, + max_count: usize, +} + +/* <1=min> <2=max> item tail */ +fn op_min_repeat_one(drive: &mut StateContext, 
stacks: &mut Stacks) { + let min_count = drive.peek_code(2) as usize; + let max_count = drive.peek_code(3) as usize; + + if drive.remaining_chars() < min_count { + return drive.failure(); + } + + drive.sync_string_position(); + + let count = if min_count == 0 { + 0 + } else { + let count = count(drive, stacks, min_count); + if count < min_count { + return drive.failure(); + } + drive.skip_char(count); + count + }; + + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { + // tail is empty. we're finished + drive.sync_string_position(); + return drive.success(); + } + + drive.state.marks_push(); + stacks + .min_repeat_one + .push(MinRepeatOneContext { count, max_count }); + create_context(drive, stacks); + + fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { + let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); + + if max_count == MAXREPEAT || count <= max_count { + drive.sync_string_position(); + drive.next_ctx_from(1, callback); + } else { + drive.state.marks_pop_discard(); + stacks.min_repeat_one.pop(); + drive.failure(); + } + } + + fn callback(drive: &mut StateContext, stacks: &mut Stacks) { + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_repeat_one.pop(); + return drive.success(); + } + + drive.sync_string_position(); + + if crate::engine::count(drive, stacks, 1) == 0 { + drive.state.marks_pop_discard(); + stacks.min_repeat_one.pop(); + return drive.failure(); + } + + drive.skip_char(1); + stacks.min_repeat_one_last().count += 1; + drive.state.marks_pop_keep(); + create_context(drive, stacks); + } +} + +#[derive(Debug, Copy, Clone)] +struct RepeatOneContext { + count: usize, + min_count: usize, + following_literal: Option, +} + +/* match repeated sequence (maximizing regexp) */ + +/* this operator only works if the repeated item is +exactly one character wide, and we're not already +collecting backtracking points. 
for other cases, +use the MAX_REPEAT operator */ + +/* <1=min> <2=max> item tail */ +fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { + let min_count = drive.peek_code(2) as usize; + let max_count = drive.peek_code(3) as usize; + + if drive.remaining_chars() < min_count { + return drive.failure(); + } + + drive.sync_string_position(); + + let count = count(drive, stacks, max_count); + drive.skip_char(count); + if count < min_count { + return drive.failure(); + } + + let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { + // tail is empty. we're finished + drive.sync_string_position(); + return drive.success(); + } + + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + let following_literal = (next_code == SreOpcode::LITERAL as u32) + .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); + + drive.state.marks_push(); + stacks.repeat_one.push(RepeatOneContext { + count, + min_count, + following_literal, + }); + create_context(drive, stacks); + + fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { + let RepeatOneContext { + mut count, + min_count, + following_literal, + } = *stacks.repeat_one_last(); + + if let Some(c) = following_literal { + while drive.at_end() || drive.peek_char() != c { + if count <= min_count { + drive.state.marks_pop_discard(); + stacks.repeat_one.pop(); + return drive.failure(); + } + drive.back_skip_char(1); + count -= 1; + } + } + stacks.repeat_one_last().count = count; + + drive.sync_string_position(); + + // General case: backtracking + drive.next_ctx_from(1, callback); + } + + fn callback(drive: &mut StateContext, stacks: &mut Stacks) { + if drive.popped_ctx().has_matched == Some(true) { + stacks.repeat_one.pop(); + return drive.success(); + } + + let RepeatOneContext { + count, + min_count, + following_literal: _, + } = stacks.repeat_one_last(); + + if 
count <= min_count { + drive.state.marks_pop_discard(); + stacks.repeat_one.pop(); + return drive.failure(); + } + + drive.back_skip_char(1); + *count -= 1; + + drive.state.marks_pop_keep(); + create_context(drive, stacks); + } +} + +#[derive(Debug, Clone, Copy)] +struct RepeatContext { + count: isize, + min_count: usize, + max_count: usize, + code_position: usize, + last_position: usize, + prev_id: usize, +} + +/* create repeat context. all the hard work is done +by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ +/* <1=min> <2=max> item tail */ +fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { + let repeat_ctx = RepeatContext { + count: -1, + min_count: drive.peek_code(2) as usize, + max_count: drive.peek_code(3) as usize, + code_position: drive.ctx.code_position, + last_position: std::usize::MAX, + prev_id: drive.ctx.repeat_ctx_id, + }; + + stacks.repeat.push(repeat_ctx); + + drive.sync_string_position(); + + let next_ctx = drive.next_ctx_from(1, |drive, stacks| { + drive.ctx.has_matched = drive.popped_ctx().has_matched; + stacks.repeat.pop(); + }); + next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; +} + +#[derive(Debug, Clone, Copy)] +struct MinUntilContext { + count: isize, + save_repeat_ctx: Option, + save_last_position: usize, +} + +/* minimizing repeat */ +fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { + let repeat_ctx = stacks.repeat.last_mut().unwrap(); + + drive.sync_string_position(); + + let count = repeat_ctx.count + 1; + + stacks.min_until.push(MinUntilContext { + count, + save_repeat_ctx: None, + save_last_position: repeat_ctx.last_position, + }); + + if (count as usize) < repeat_ctx.min_count { + // not enough matches + repeat_ctx.count = count; + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_until.pop(); + return drive.success(); + } + + stacks.repeat_last().count = stacks.min_until_last().count - 1; + drive.sync_string_position(); + 
stacks.min_until.pop(); + drive.failure(); + }); + return; + } + + drive.state.marks_push(); + + // see if the tail matches + stacks.min_until_last().save_repeat_ctx = stacks.repeat.pop(); + + drive.next_ctx(1, |drive, stacks| { + let MinUntilContext { + count, + save_repeat_ctx, + save_last_position, + } = stacks.min_until_last(); + let count = *count; + + let mut repeat_ctx = save_repeat_ctx.take().unwrap(); + + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_until.pop(); + // restore repeat before return + stacks.repeat.push(repeat_ctx); + return drive.success(); + } + + drive.sync_string_position(); + + drive.state.marks_pop(); + + // match more until tail matches + + if count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT + || drive.state.string_position == repeat_ctx.last_position + { + stacks.min_until.pop(); + // restore repeat before return + stacks.repeat.push(repeat_ctx); + return drive.failure(); + } + + repeat_ctx.count = count; + /* zero-width match protection */ + *save_last_position = repeat_ctx.last_position; + repeat_ctx.last_position = drive.state.string_position; + + stacks.repeat.push(repeat_ctx); + + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + if drive.popped_ctx().has_matched == Some(true) { + stacks.min_until.pop(); + drive.success(); + } else { + stacks.repeat_last().count = stacks.min_until_last().count - 1; + drive.sync_string_position(); + stacks.min_until.pop(); + drive.failure(); + } + }); + }); +} + +#[derive(Debug, Clone, Copy)] +struct MaxUntilContext { + save_last_position: usize, +} + +/* maximizing repeat */ +fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { + // let repeat_ctx = stacks.repeat.last_mut().unwrap(); + let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + + drive.sync_string_position(); + + repeat_ctx.count += 1; + + // let count = repeat_ctx.count + 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough 
matches + // repeat_ctx.count = count; + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + if drive.popped_ctx().has_matched == Some(true) { + // stacks.max_until.pop(); + drive.success(); + } else { + // let count = stacks.max_until_last().count; + // stacks.repeat_last().count -= 1; + stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; + drive.sync_string_position(); + // stacks.max_until.pop(); + drive.failure(); + } + }); + return; + } + + stacks.max_until.push(MaxUntilContext { + save_last_position: repeat_ctx.last_position, + }); + + if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) + && drive.state.string_position != repeat_ctx.last_position + { + /* we may have enough matches, but if we can + match another item, do so */ + repeat_ctx.last_position = drive.state.string_position; + + drive.state.marks_push(); + + drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { + let save_last_position = stacks.max_until_last().save_last_position; + let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + repeat_ctx.last_position = save_last_position; + if drive.popped_ctx().has_matched == Some(true) { + drive.state.marks_pop_discard(); + stacks.max_until.pop(); + return drive.success(); + } + drive.state.marks_pop(); + repeat_ctx.count -= 1; + drive.sync_string_position(); + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = drive.next_ctx(1, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + }); + return; + } + + /* cannot match more repeated items here. 
make sure the + tail matches */ + let next_ctx = drive.next_ctx(1, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + + fn tail_callback(drive: &mut StateContext, stacks: &mut Stacks) { + stacks.max_until.pop(); + + if drive.popped_ctx().has_matched == Some(true) { + drive.success(); + } else { + drive.sync_string_position(); + drive.failure(); + } + } +} + +#[derive(Debug, Default)] +struct Stacks { + branch: Vec, + min_repeat_one: Vec, + repeat_one: Vec, + repeat: Vec, + min_until: Vec, + max_until: Vec, +} + +impl Stacks { + fn clear(&mut self) { + self.branch.clear(); + self.min_repeat_one.clear(); + self.repeat_one.clear(); + self.repeat.clear(); + self.min_until.clear(); + self.max_until.clear(); + } + + fn branch_last(&mut self) -> &mut BranchContext { + self.branch.last_mut().unwrap() + } + fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { + self.min_repeat_one.last_mut().unwrap() + } + fn repeat_one_last(&mut self) -> &mut RepeatOneContext { + self.repeat_one.last_mut().unwrap() + } + fn repeat_last(&mut self) -> &mut RepeatContext { + self.repeat.last_mut().unwrap() + } + fn min_until_last(&mut self) -> &mut MinUntilContext { + self.min_until.last_mut().unwrap() + } + fn max_until_last(&mut self) -> &mut MaxUntilContext { + self.max_until.last_mut().unwrap() + } +} + #[derive(Debug, Clone, Copy)] pub enum StrDrive<'a> { Str(&'a str), @@ -203,7 +857,7 @@ impl<'a> StrDrive<'a> { StrDrive::Str(s) => s .get(offset..) 
.and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) - .unwrap_or_else(|| s.len()), + .unwrap_or(s.len()), StrDrive::Bytes(_) => offset + skip, } } @@ -264,31 +918,63 @@ impl<'a> StrDrive<'a> { } } -#[derive(Debug, Clone, Copy)] +type OpcodeHandler = fn(&mut StateContext, &mut Stacks); + +#[derive(Clone, Copy)] struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, toplevel: bool, + handler: Option, + repeat_ctx_id: usize, } -trait MatchContextDrive { - fn ctx_mut(&mut self) -> &mut MatchContext; +impl std::fmt::Debug for MatchContext { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MatchContext") + .field("string_position", &self.string_position) + .field("string_offset", &self.string_offset) + .field("code_position", &self.code_position) + .field("has_matched", &self.has_matched) + .field("toplevel", &self.toplevel) + .field("handler", &self.handler.map(|x| x as usize)) + .finish() + } +} + +trait ContextDrive { fn ctx(&self) -> &MatchContext; + fn ctx_mut(&mut self) -> &mut MatchContext; fn state(&self) -> &State; - fn repeat_ctx(&self) -> &RepeatContext { - self.state().repeat_stack.last().unwrap() + + fn popped_ctx(&self) -> &MatchContext { + self.state().popped_context.as_ref().unwrap() } + fn pattern(&self) -> &[u32] { &self.state().pattern_codes[self.ctx().code_position..] 
} + fn peek_char(&self) -> u32 { self.state().string.peek(self.ctx().string_offset) } fn peek_code(&self, peek: usize) -> u32 { self.state().pattern_codes[self.ctx().code_position + peek] } + + fn back_peek_char(&self) -> u32 { + self.state().string.back_peek(self.ctx().string_offset) + } + fn back_skip_char(&mut self, skip_count: usize) { + self.ctx_mut().string_position -= skip_count; + self.ctx_mut().string_offset = self + .state() + .string + .back_offset(self.ctx().string_offset, skip_count); + } + fn skip_char(&mut self, skip_count: usize) { self.ctx_mut().string_offset = self .state() @@ -299,12 +985,17 @@ trait MatchContextDrive { fn skip_code(&mut self, skip_count: usize) { self.ctx_mut().code_position += skip_count; } + fn skip_code_from(&mut self, peek: usize) { + self.skip_code(self.peek_code(peek) as usize + 1); + } + fn remaining_chars(&self) -> usize { self.state().end - self.ctx().string_position } fn remaining_codes(&self) -> usize { self.state().pattern_codes.len() - self.ctx().code_position } + fn at_beginning(&self) -> bool { // self.ctx().string_position == self.state().start self.ctx().string_position == 0 @@ -331,16 +1022,7 @@ trait MatchContextDrive { let this = !self.at_end() && word_checker(self.peek_char()); this == that } - fn back_peek_char(&self) -> u32 { - self.state().string.back_peek(self.ctx().string_offset) - } - fn back_skip_char(&mut self, skip_count: usize) { - self.ctx_mut().string_position -= skip_count; - self.ctx_mut().string_offset = self - .state() - .string - .back_offset(self.ctx().string_offset, skip_count); - } + fn can_success(&self) -> bool { if !self.ctx().toplevel { return true; @@ -353,389 +1035,71 @@ trait MatchContextDrive { } true } -} -struct StackDrive<'a> { - state: State<'a>, - ctx_id: usize, -} -impl<'a> StackDrive<'a> { - fn id(&self) -> usize { - self.ctx_id - } - fn drive(ctx_id: usize, state: State<'a>) -> Self { - Self { state, ctx_id } - } - fn take(self) -> State<'a> { - self.state - } - fn 
push_new_context(&mut self, pattern_offset: usize) { - self.push_new_context_at(self.ctx().code_position + pattern_offset); - } - fn push_new_context_at(&mut self, code_position: usize) { - let mut child_ctx = MatchContext { ..*self.ctx() }; - child_ctx.code_position = code_position; - self.state.context_stack.push(child_ctx); - } - fn repeat_ctx_mut(&mut self) -> &mut RepeatContext { - self.state.repeat_stack.last_mut().unwrap() - } -} -impl MatchContextDrive for StackDrive<'_> { - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.state.context_stack[self.ctx_id] + fn success(&mut self) { + self.ctx_mut().has_matched = Some(true); } - fn ctx(&self) -> &MatchContext { - &self.state.context_stack[self.ctx_id] - } - fn state(&self) -> &State { - &self.state + + fn failure(&mut self) { + self.ctx_mut().has_matched = Some(false); } } -struct WrapDrive<'a> { - stack_drive: &'a StackDrive<'a>, +struct StateContext<'a> { + state: State<'a>, ctx: MatchContext, + next_ctx: Option, } -impl<'a> WrapDrive<'a> { - fn drive(ctx: MatchContext, stack_drive: &'a StackDrive<'a>) -> Self { - Self { stack_drive, ctx } + +impl ContextDrive for StateContext<'_> { + fn ctx(&self) -> &MatchContext { + &self.ctx } -} -impl MatchContextDrive for WrapDrive<'_> { fn ctx_mut(&mut self) -> &mut MatchContext { &mut self.ctx } - fn ctx(&self) -> &MatchContext { - &self.ctx - } fn state(&self) -> &State { - self.stack_drive.state() + &self.state } } -trait OpcodeExecutor { - fn next(&mut self, drive: &mut StackDrive) -> Option<()>; -} +impl StateContext<'_> { + fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { + self.next_ctx(self.peek_code(peek) as usize + 1, handler) + } + fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { + self.next_ctx_at(self.ctx.code_position + offset, handler) + } + fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { + self.next_ctx = Some(MatchContext 
{ + code_position, + has_matched: None, + handler: None, + ..self.ctx + }); + self.ctx.handler = Some(handler); + self.next_ctx.as_mut().unwrap() + } -struct OpTwice { - f1: Option, - f2: Option, -} -impl OpcodeExecutor for OpTwice -where - F1: FnOnce(&mut StackDrive) -> Option<()>, - F2: FnOnce(&mut StackDrive), -{ - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - if let Some(f1) = self.f1.take() { - f1(drive) - } else if let Some(f2) = self.f2.take() { - f2(drive); - None - } else { - unreachable!() - } + fn sync_string_position(&mut self) { + self.state.string_position = self.ctx.string_position; } } -fn twice(f1: F1, f2: F2) -> Box> -where - F1: FnOnce(&mut StackDrive) -> Option<()>, - F2: FnOnce(&mut StackDrive), -{ - Box::new(OpTwice { - f1: Some(f1), - f2: Some(f2), - }) -} -struct OpcodeDispatcher { - executing_contexts: BTreeMap>, +struct StateRefContext<'a> { + entity: &'a StateContext<'a>, + ctx: MatchContext, } -impl OpcodeDispatcher { - fn new() -> Self { - Self { - executing_contexts: BTreeMap::new(), - } - } - fn clear(&mut self) { - self.executing_contexts.clear(); - } - // Returns True if the current context matches, False if it doesn't and - // None if matching is not finished, ie must be resumed after child - // contexts have been matched. - fn pymatch(&mut self, drive: &mut StackDrive) -> Option { - while drive.remaining_codes() > 0 && drive.ctx().has_matched.is_none() { - let code = drive.peek_code(0); - let opcode = SreOpcode::try_from(code).unwrap(); - if !self.dispatch(opcode, drive) { - return None; - } - } - match drive.ctx().has_matched { - Some(matched) => Some(matched), - None => { - drive.ctx_mut().has_matched = Some(false); - Some(false) - } - } - } - // Dispatches a context on a given opcode. Returns True if the context - // is done matching, False if it must be resumed when next encountered. 
- fn dispatch(&mut self, opcode: SreOpcode, drive: &mut StackDrive) -> bool { - let executor = self - .executing_contexts - .remove(&drive.id()) - .or_else(|| self.dispatch_table(opcode, drive)); - if let Some(mut executor) = executor { - if let Some(()) = executor.next(drive) { - self.executing_contexts.insert(drive.id(), executor); - return false; - } - } - true +impl ContextDrive for StateRefContext<'_> { + fn ctx(&self) -> &MatchContext { + &self.ctx } - - fn dispatch_table( - &mut self, - opcode: SreOpcode, - drive: &mut StackDrive, - ) -> Option> { - match opcode { - SreOpcode::FAILURE => { - drive.ctx_mut().has_matched = Some(false); - None - } - SreOpcode::SUCCESS => { - drive.ctx_mut().has_matched = Some(drive.can_success()); - if drive.ctx().has_matched == Some(true) { - drive.state.string_position = drive.ctx().string_position; - } - None - } - SreOpcode::ANY => { - if drive.at_end() || drive.at_linebreak() { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - None - } - SreOpcode::ANY_ALL => { - if drive.at_end() { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - None - } - /* assert subpattern */ - /* */ - SreOpcode::ASSERT => Some(twice( - |drive| { - let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position; - if passed < back { - drive.ctx_mut().has_matched = Some(false); - return None; - } - let back_offset = drive - .state - .string - .back_offset(drive.ctx().string_offset, back); - - drive.state.string_position = drive.ctx().string_position - back; - - drive.push_new_context(3); - let child_ctx = drive.state.context_stack.last_mut().unwrap(); - child_ctx.toplevel = false; - child_ctx.string_position -= back; - child_ctx.string_offset = back_offset; - - Some(()) - }, - |drive| { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.skip_code(drive.peek_code(1) as 
usize + 1); - } else { - drive.ctx_mut().has_matched = Some(false); - } - }, - )), - SreOpcode::ASSERT_NOT => Some(twice( - |drive| { - let back = drive.peek_code(2) as usize; - let passed = drive.ctx().string_position; - if passed < back { - drive.skip_code(drive.peek_code(1) as usize + 1); - return None; - } - let back_offset = drive - .state - .string - .back_offset(drive.ctx().string_offset, back); - - drive.state.string_position = drive.ctx().string_position - back; - - drive.push_new_context(3); - let child_ctx = drive.state.context_stack.last_mut().unwrap(); - child_ctx.toplevel = false; - child_ctx.string_position -= back; - child_ctx.string_offset = back_offset; - - Some(()) - }, - |drive| { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(drive.peek_code(1) as usize + 1); - } - }, - )), - SreOpcode::AT => { - let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); - if !at(drive, atcode) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(2); - } - None - } - SreOpcode::BRANCH => Some(Box::new(OpBranch::default())), - SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); - if drive.at_end() || !category(catcode, drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); - } else { - drive.skip_code(2); - drive.skip_char(1); - } - None - } - SreOpcode::IN => { - general_op_in(drive, |set, c| charset(set, c)); - None - } - SreOpcode::IN_IGNORE => { - general_op_in(drive, |set, c| charset(set, lower_ascii(c))); - None - } - SreOpcode::IN_UNI_IGNORE => { - general_op_in(drive, |set, c| charset(set, lower_unicode(c))); - None - } - SreOpcode::IN_LOC_IGNORE => { - general_op_in(drive, |set, c| charset_loc_ignore(set, c)); - None - } - SreOpcode::INFO | SreOpcode::JUMP => { - drive.skip_code(drive.peek_code(1) as usize + 1); - None - } - SreOpcode::LITERAL => { - 
general_op_literal(drive, |code, c| code == c); - None - } - SreOpcode::NOT_LITERAL => { - general_op_literal(drive, |code, c| code != c); - None - } - SreOpcode::LITERAL_IGNORE => { - general_op_literal(drive, |code, c| code == lower_ascii(c)); - None - } - SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(drive, |code, c| code != lower_ascii(c)); - None - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code == lower_unicode(c)); - None - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code != lower_unicode(c)); - None - } - SreOpcode::LITERAL_LOC_IGNORE => { - general_op_literal(drive, char_loc_ignore); - None - } - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(drive, |code, c| !char_loc_ignore(code, c)); - None - } - SreOpcode::MARK => { - drive - .state - .set_mark(drive.peek_code(1) as usize, drive.ctx().string_position); - drive.skip_code(2); - None - } - SreOpcode::REPEAT => Some(twice( - // create repeat context. 
all the hard work is done by the UNTIL - // operator (MAX_UNTIL, MIN_UNTIL) - // <1=min> <2=max> item tail - |drive| { - let repeat = RepeatContext { - count: -1, - code_position: drive.ctx().code_position, - last_position: std::usize::MAX, - mincount: drive.peek_code(2) as usize, - maxcount: drive.peek_code(3) as usize, - }; - drive.state.repeat_stack.push(repeat); - drive.state.string_position = drive.ctx().string_position; - // execute UNTIL operator - drive.push_new_context(drive.peek_code(1) as usize + 1); - Some(()) - }, - |drive| { - drive.state.repeat_stack.pop(); - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - }, - )), - SreOpcode::MAX_UNTIL => Some(Box::new(OpMaxUntil::default())), - SreOpcode::MIN_UNTIL => Some(Box::new(OpMinUntil::default())), - SreOpcode::REPEAT_ONE => Some(Box::new(OpRepeatOne::default())), - SreOpcode::MIN_REPEAT_ONE => Some(Box::new(OpMinRepeatOne::default())), - SreOpcode::GROUPREF => { - general_op_groupref(drive, |x| x); - None - } - SreOpcode::GROUPREF_IGNORE => { - general_op_groupref(drive, lower_ascii); - None - } - SreOpcode::GROUPREF_LOC_IGNORE => { - general_op_groupref(drive, lower_locate); - None - } - SreOpcode::GROUPREF_UNI_IGNORE => { - general_op_groupref(drive, lower_unicode); - None - } - SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); - match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => { - drive.skip_code(3); - } - _ => drive.skip_code(drive.peek_code(2) as usize + 1), - } - None - } - _ => { - // TODO python expcetion? 
- unreachable!("unexpected opcode") - } - } + fn ctx_mut(&mut self) -> &mut MatchContext { + &mut self.ctx + } + fn state(&self) -> &State { + &self.entity.state } } @@ -752,60 +1116,63 @@ fn charset_loc_ignore(set: &[u32], c: u32) -> bool { up != lo && charset(set, up) } -fn general_op_groupref u32>(drive: &mut StackDrive, mut f: F) { +fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); let (group_start, group_end) = match (group_start, group_end) { (Some(start), Some(end)) if start <= end => (start, end), _ => { - drive.ctx_mut().has_matched = Some(false); - return; + return drive.failure(); } }; - let mut wdrive = WrapDrive::drive(*drive.ctx(), &drive); - let mut gdrive = WrapDrive::drive( - MatchContext { + + let mut wdrive = StateRefContext { + entity: drive, + ctx: drive.ctx, + }; + let mut gdrive = StateRefContext { + entity: drive, + ctx: MatchContext { string_position: group_start, // TODO: cache the offset string_offset: drive.state.string.offset(0, group_start), - ..*drive.ctx() + ..drive.ctx }, - &drive, - ); + }; + for _ in group_start..group_end { if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); - return; + return drive.failure(); } wdrive.skip_char(1); gdrive.skip_char(1); } - let position = wdrive.ctx().string_position; - let offset = wdrive.ctx().string_offset; + + let position = wdrive.ctx.string_position; + let offset = wdrive.ctx.string_offset; drive.skip_code(2); - drive.ctx_mut().string_position = position; - drive.ctx_mut().string_offset = offset; + drive.ctx.string_position = position; + drive.ctx.string_offset = offset; } -fn general_op_literal bool>(drive: &mut StackDrive, f: F) { +fn general_op_literal bool>(drive: &mut StateContext, f: F) { if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); + drive.failure(); } else { 
drive.skip_code(2); drive.skip_char(1); } } -fn general_op_in bool>(drive: &mut StackDrive, f: F) { - let skip = drive.peek_code(1) as usize; +fn general_op_in bool>(drive: &mut StateContext, f: F) { if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { - drive.ctx_mut().has_matched = Some(false); + drive.failure(); } else { - drive.skip_code(skip + 1); + drive.skip_code_from(1); drive.skip_char(1); } } -fn at(drive: &StackDrive, atcode: SreAtCode) -> bool { +fn at(drive: &StateContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), @@ -938,84 +1305,91 @@ fn charset(set: &[u32], ch: u32) -> bool { } /* General case */ -fn general_count(drive: &mut StackDrive, maxcount: usize) -> usize { +fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { let mut count = 0; - let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); + let max_count = std::cmp::min(max_count, drive.remaining_chars()); - let save_ctx = *drive.ctx(); + let save_ctx = drive.ctx; drive.skip_code(4); - let reset_position = drive.ctx().code_position; - - let mut dispatcher = OpcodeDispatcher::new(); - while count < maxcount { - drive.ctx_mut().code_position = reset_position; - dispatcher.dispatch(SreOpcode::try_from(drive.peek_code(0)).unwrap(), drive); - if drive.ctx().has_matched == Some(false) { + let reset_position = drive.ctx.code_position; + + while count < max_count { + drive.ctx.code_position = reset_position; + let code = drive.peek_code(0); + let code = SreOpcode::try_from(code).unwrap(); + dispatch(code, drive, stacks); + if drive.ctx.has_matched == Some(false) { break; } count += 1; } - *drive.ctx_mut() = save_ctx; + drive.ctx = save_ctx; count } -fn count(stack_drive: &mut StackDrive, maxcount: usize) -> usize { - let mut drive = WrapDrive::drive(*stack_drive.ctx(), 
stack_drive); - let maxcount = std::cmp::min(maxcount, drive.remaining_chars()); - let end = drive.ctx().string_position + maxcount; +fn count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { + let save_ctx = drive.ctx; + let max_count = std::cmp::min(max_count, drive.remaining_chars()); + let end = drive.ctx.string_position + max_count; let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); match opcode { SreOpcode::ANY => { - while !drive.ctx().string_position < end && !drive.at_linebreak() { + while !drive.ctx.string_position < end && !drive.at_linebreak() { drive.skip_char(1); } } SreOpcode::ANY_ALL => { - drive.skip_char(maxcount); + drive.skip_char(max_count); } SreOpcode::IN => { - while !drive.ctx().string_position < end + while !drive.ctx.string_position < end && charset(&drive.pattern()[2..], drive.peek_char()) { drive.skip_char(1); } } SreOpcode::LITERAL => { - general_count_literal(&mut drive, end, |code, c| code == c as u32); + general_count_literal(drive, end, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(&mut drive, end, |code, c| code != c as u32); + general_count_literal(drive, end, |code, c| code != c as u32); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code == lower_ascii(c) as u32); + general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code != lower_ascii(c) as u32); + general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, end, char_loc_ignore); + general_count_literal(drive, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(&mut drive, end, |code, c| !char_loc_ignore(code, c)); + general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - 
general_count_literal(&mut drive, end, |code, c| code == lower_unicode(c) as u32); + general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(&mut drive, end, |code, c| code != lower_unicode(c) as u32); + general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); } _ => { - return general_count(stack_drive, maxcount); + return general_count(drive, stacks, max_count); } } - drive.ctx().string_position - drive.state().string_position + let count = drive.ctx.string_position - drive.state.string_position; + drive.ctx = save_ctx; + count } -fn general_count_literal bool>(drive: &mut WrapDrive, end: usize, mut f: F) { +fn general_count_literal bool>( + drive: &mut StateContext, + end: usize, + mut f: F, +) { let ch = drive.peek_code(1); - while !drive.ctx().string_position < end && f(ch, drive.peek_char()) { + while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { drive.skip_char(1); } } @@ -1065,7 +1439,9 @@ fn upper_locate(ch: u32) -> u32 { } fn is_uni_digit(ch: u32) -> bool { // TODO: check with cpython - char::try_from(ch).map(|x| x.is_digit(10)).unwrap_or(false) + char::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) } fn is_uni_space(ch: u32) -> bool { // TODO: check with cpython @@ -1155,413 +1531,3 @@ fn utf8_back_peek_offset(bytes: &[u8], offset: usize) -> usize { } offset } - -#[derive(Debug, Copy, Clone)] -struct RepeatContext { - count: isize, - code_position: usize, - // zero-width match protection - last_position: usize, - mincount: usize, - maxcount: usize, -} - -#[derive(Default)] -struct OpMinRepeatOne { - jump_id: usize, - mincount: usize, - maxcount: usize, - count: usize, -} -impl OpcodeExecutor for OpMinRepeatOne { - /* <1=min> <2=max> item tail */ - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = 
drive.peek_code(3) as usize; - - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - - drive.state.string_position = drive.ctx().string_position; - - self.count = if self.mincount == 0 { - 0 - } else { - let count = count(drive, self.mincount); - if count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(count); - count - }; - - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } - - drive.state.marks_push(); - self.jump_id = 1; - self.next(drive) - } - 1 => { - if self.maxcount == MAXREPEAT || self.count <= self.maxcount { - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - return Some(()); - } - - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - None - } - 2 => { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - if count(drive, 1) == 0 { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.skip_char(1); - self.count += 1; - drive.state.marks_pop_keep(); - self.jump_id = 1; - self.next(drive) - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpMaxUntil { - jump_id: usize, - count: isize, - save_last_position: usize, -} -impl OpcodeExecutor for OpMaxUntil { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - let RepeatContext { - count, - code_position, - last_position, - mincount, - maxcount, - } = *drive.repeat_ctx(); - - drive.state.string_position = 
drive.ctx().string_position; - self.count = count + 1; - - if (self.count as usize) < mincount { - // not enough matches - drive.repeat_ctx_mut().count = self.count; - drive.push_new_context_at(code_position + 4); - self.jump_id = 1; - return Some(()); - } - - if ((self.count as usize) < maxcount || maxcount == MAXREPEAT) - && drive.state.string_position != last_position - { - // we may have enough matches, if we can match another item, do so - drive.repeat_ctx_mut().count = self.count; - drive.state.marks_push(); - self.save_last_position = last_position; - drive.repeat_ctx_mut().last_position = drive.state.string_position; - drive.push_new_context_at(code_position + 4); - self.jump_id = 2; - return Some(()); - } - - self.jump_id = 3; - self.next(drive) - } - 1 => { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - } - None - } - 2 => { - drive.repeat_ctx_mut().last_position = self.save_last_position; - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.marks_pop(); - drive.repeat_ctx_mut().count = self.count - 1; - drive.state.string_position = drive.ctx().string_position; - self.jump_id = 3; - self.next(drive) - } - 3 => { - // cannot match more repeated items here. 
make sure the tail matches - drive.push_new_context(1); - self.jump_id = 4; - Some(()) - } - 4 => { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.state.string_position = drive.ctx().string_position; - } - None - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpMinUntil { - jump_id: usize, - count: isize, - save_repeat: Option, - save_last_position: usize, -} -impl OpcodeExecutor for OpMinUntil { - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - let RepeatContext { - count, - code_position, - last_position: _, - mincount, - maxcount: _, - } = *drive.repeat_ctx(); - drive.state.string_position = drive.ctx().string_position; - self.count = count + 1; - - if (self.count as usize) < mincount { - // not enough matches - drive.repeat_ctx_mut().count = self.count; - drive.push_new_context_at(code_position + 4); - self.jump_id = 1; - return Some(()); - } - - // see if the tail matches - drive.state.marks_push(); - self.save_repeat = drive.state.repeat_stack.pop(); - drive.push_new_context(1); - self.jump_id = 2; - Some(()) - } - 1 => { - let child_ctx = drive.state.popped_context.unwrap(); - drive.ctx_mut().has_matched = child_ctx.has_matched; - if drive.ctx().has_matched != Some(true) { - drive.repeat_ctx_mut().count = self.count - 1; - drive.repeat_ctx_mut().last_position = self.save_last_position; - drive.state.string_position = drive.ctx().string_position; - } - None - } - 2 => { - // restore repeat before return - drive.state.repeat_stack.push(self.save_repeat.unwrap()); - - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.string_position = drive.ctx().string_position; - drive.state.marks_pop(); - - // match more until tail matches - let RepeatContext { - count: _, - 
code_position, - last_position, - mincount: _, - maxcount, - } = *drive.repeat_ctx(); - - if self.count as usize >= maxcount && maxcount != MAXREPEAT - || drive.state.string_position == last_position - { - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.repeat_ctx_mut().count = self.count; - - /* zero-width match protection */ - self.save_last_position = last_position; - drive.repeat_ctx_mut().last_position = drive.state.string_position; - - drive.push_new_context_at(code_position + 4); - self.jump_id = 1; - Some(()) - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpBranch { - jump_id: usize, - branch_offset: usize, -} -impl OpcodeExecutor for OpBranch { - // alternation - // <0=skip> code ... - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - drive.state.marks_push(); - // jump out the head - self.branch_offset = 1; - self.jump_id = 1; - self.next(drive) - } - 1 => { - let next_branch_length = drive.peek_code(self.branch_offset) as usize; - if next_branch_length == 0 { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(self.branch_offset + 1); - self.branch_offset += next_branch_length; - self.jump_id = 2; - Some(()) - } - 2 => { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - drive.state.marks_pop_keep(); - self.jump_id = 1; - Some(()) - } - _ => unreachable!(), - } - } -} - -#[derive(Default)] -struct OpRepeatOne { - jump_id: usize, - mincount: usize, - maxcount: usize, - count: usize, - following_literal: Option, -} -impl OpcodeExecutor for OpRepeatOne { - /* match repeated sequence (maximizing regexp) */ - - /* this operator only works if the repeated item is - exactly one character wide, and we're not already - collecting 
backtracking points. for other cases, - use the MAX_REPEAT operator */ - - /* <1=min> <2=max> item tail */ - fn next(&mut self, drive: &mut StackDrive) -> Option<()> { - match self.jump_id { - 0 => { - self.mincount = drive.peek_code(2) as usize; - self.maxcount = drive.peek_code(3) as usize; - - if drive.remaining_chars() < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - - drive.state.string_position = drive.ctx().string_position; - - self.count = count(drive, self.maxcount); - drive.skip_char(self.count); - if self.count < self.mincount { - drive.ctx_mut().has_matched = Some(false); - return None; - } - - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.state.string_position = drive.ctx().string_position; - drive.ctx_mut().has_matched = Some(true); - return None; - } - - drive.state.marks_push(); - - // Special case: Tail starts with a literal. Skip positions where - // the rest of the pattern cannot possibly match. 
- if next_code == SreOpcode::LITERAL as u32 { - self.following_literal = Some(drive.peek_code(drive.peek_code(1) as usize + 2)) - } - - self.jump_id = 1; - self.next(drive) - } - 1 => { - if let Some(c) = self.following_literal { - while drive.at_end() || drive.peek_char() != c { - if self.count <= self.mincount { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - return None; - } - drive.back_skip_char(1); - self.count -= 1; - } - } - - // General case: backtracking - drive.state.string_position = drive.ctx().string_position; - drive.push_new_context(drive.peek_code(1) as usize + 1); - self.jump_id = 2; - Some(()) - } - 2 => { - let child_ctx = drive.state.popped_context.unwrap(); - if child_ctx.has_matched == Some(true) { - drive.ctx_mut().has_matched = Some(true); - return None; - } - if self.count <= self.mincount { - drive.state.marks_pop_discard(); - drive.ctx_mut().has_matched = Some(false); - return None; - } - - drive.back_skip_char(1); - self.count -= 1; - - drive.state.marks_pop_keep(); - - self.jump_id = 1; - self.next(drive) - } - _ => unreachable!(), - } - } -} diff --git a/src/lib.rs b/src/lib.rs index 4a3ed1b754..c23e807501 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,7 @@ pub mod engine; pub const CODESIZE: usize = 4; #[cfg(target_pointer_width = "32")] -pub const MAXREPEAT: usize = usize::MAX; +pub const MAXREPEAT: usize = usize::MAX - 1; #[cfg(target_pointer_width = "64")] pub const MAXREPEAT: usize = u32::MAX as usize; diff --git a/tests/tests.rs b/tests/tests.rs index b430947a9b..e8ae487029 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -70,4 +70,15 @@ fn test_repeat_context_panic() { let mut state = p.state("axxzaz", 0..usize::MAX); state = state.pymatch(); assert!(state.marks == vec![Some(1), Some(3)]); -} \ No newline at end of file +} + +#[test] +fn test_double_max_until() { + // pattern p = re.compile(r'((1)?)*') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: 
&[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + // END GENERATED + let mut state = p.state("1111", 0..usize::MAX); + state = state.pymatch(); + assert!(state.string_position == 4); +} From 4007f8276550efb6aa1ada429a3e89c042eae86c Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 27 Jul 2022 21:33:20 +0200 Subject: [PATCH 052/705] optimize max_until and min_until --- src/engine.rs | 96 +++++++++++++++------------------------------------ 1 file changed, 27 insertions(+), 69 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 81903ccfdd..223aa3425c 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -420,7 +420,7 @@ fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { let count = if min_count == 0 { 0 } else { - let count = count(drive, stacks, min_count); + let count = _count(drive, stacks, min_count); if count < min_count { return drive.failure(); } @@ -462,7 +462,7 @@ fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { drive.sync_string_position(); - if crate::engine::count(drive, stacks, 1) == 0 { + if _count(drive, stacks, 1) == 0 { drive.state.marks_pop_discard(); stacks.min_repeat_one.pop(); return drive.failure(); @@ -500,7 +500,7 @@ fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { drive.sync_string_position(); - let count = count(drive, stacks, max_count); + let count = _count(drive, stacks, max_count); drive.skip_char(count); if count < min_count { return drive.failure(); @@ -614,9 +614,7 @@ fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { #[derive(Debug, Clone, Copy)] struct MinUntilContext { - count: isize, - save_repeat_ctx: Option, - save_last_position: usize, + save_repeat_ctx_id: usize, } /* minimizing repeat */ @@ -625,50 +623,35 @@ fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { drive.sync_string_position(); - let count = repeat_ctx.count + 1; - - 
stacks.min_until.push(MinUntilContext { - count, - save_repeat_ctx: None, - save_last_position: repeat_ctx.last_position, - }); + repeat_ctx.count += 1; - if (count as usize) < repeat_ctx.min_count { + if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - repeat_ctx.count = count; drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { if drive.popped_ctx().has_matched == Some(true) { - stacks.min_until.pop(); - return drive.success(); + drive.success(); + } else { + stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; + drive.sync_string_position(); + drive.failure(); } - - stacks.repeat_last().count = stacks.min_until_last().count - 1; - drive.sync_string_position(); - stacks.min_until.pop(); - drive.failure(); }); return; } drive.state.marks_push(); - // see if the tail matches - stacks.min_until_last().save_repeat_ctx = stacks.repeat.pop(); + stacks.min_until.push(MinUntilContext { + save_repeat_ctx_id: drive.ctx.repeat_ctx_id, + }); - drive.next_ctx(1, |drive, stacks| { - let MinUntilContext { - count, - save_repeat_ctx, - save_last_position, - } = stacks.min_until_last(); - let count = *count; + // see if the tail matches + let next_ctx = drive.next_ctx(1, |drive, stacks| { + drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; - let mut repeat_ctx = save_repeat_ctx.take().unwrap(); + let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; if drive.popped_ctx().has_matched == Some(true) { - stacks.min_until.pop(); - // restore repeat before return - stacks.repeat.push(repeat_ctx); return drive.success(); } @@ -678,34 +661,27 @@ fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { // match more until tail matches - if count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT + if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT || drive.state.string_position == repeat_ctx.last_position { - stacks.min_until.pop(); - // restore 
repeat before return - stacks.repeat.push(repeat_ctx); + repeat_ctx.count -= 1; return drive.failure(); } - repeat_ctx.count = count; /* zero-width match protection */ - *save_last_position = repeat_ctx.last_position; repeat_ctx.last_position = drive.state.string_position; - stacks.repeat.push(repeat_ctx); - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { if drive.popped_ctx().has_matched == Some(true) { - stacks.min_until.pop(); drive.success(); } else { - stacks.repeat_last().count = stacks.min_until_last().count - 1; + stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; drive.sync_string_position(); - stacks.min_until.pop(); drive.failure(); } }); }); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; } #[derive(Debug, Clone, Copy)] @@ -715,28 +691,20 @@ struct MaxUntilContext { /* maximizing repeat */ fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { - // let repeat_ctx = stacks.repeat.last_mut().unwrap(); let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; drive.sync_string_position(); repeat_ctx.count += 1; - // let count = repeat_ctx.count + 1; - if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - // repeat_ctx.count = count; drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { if drive.popped_ctx().has_matched == Some(true) { - // stacks.max_until.pop(); drive.success(); } else { - // let count = stacks.max_until_last().count; - // stacks.repeat_last().count -= 1; stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; drive.sync_string_position(); - // stacks.max_until.pop(); drive.failure(); } }); @@ -757,14 +725,15 @@ fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { drive.state.marks_push(); drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - let save_last_position = stacks.max_until_last().save_last_position; + let save_last_position = stacks.max_until.pop().unwrap().save_last_position; let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; 
repeat_ctx.last_position = save_last_position; + if drive.popped_ctx().has_matched == Some(true) { drive.state.marks_pop_discard(); - stacks.max_until.pop(); return drive.success(); } + drive.state.marks_pop(); repeat_ctx.count -= 1; drive.sync_string_position(); @@ -782,9 +751,7 @@ fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { let next_ctx = drive.next_ctx(1, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - fn tail_callback(drive: &mut StateContext, stacks: &mut Stacks) { - stacks.max_until.pop(); - + fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { if drive.popped_ctx().has_matched == Some(true) { drive.success(); } else { @@ -823,15 +790,6 @@ impl Stacks { fn repeat_one_last(&mut self) -> &mut RepeatOneContext { self.repeat_one.last_mut().unwrap() } - fn repeat_last(&mut self) -> &mut RepeatContext { - self.repeat.last_mut().unwrap() - } - fn min_until_last(&mut self) -> &mut MinUntilContext { - self.min_until.last_mut().unwrap() - } - fn max_until_last(&mut self) -> &mut MaxUntilContext { - self.max_until.last_mut().unwrap() - } } #[derive(Debug, Clone, Copy)] @@ -1327,7 +1285,7 @@ fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize count } -fn count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { +fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { let save_ctx = drive.ctx; let max_count = std::cmp::min(max_count, drive.remaining_chars()); let end = drive.ctx.string_position + max_count; From bf57f289bff1ec5633316a924e82fbb5f5ed6eb0 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 27 Jul 2022 21:33:43 +0200 Subject: [PATCH 053/705] update version to 0.2.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6ba3996947..00123d92c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.2.0" +version = "0.2.1" 
authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From 9058f287881af7fc0f004759184a0ff5d0811967 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 28 Jul 2022 22:46:28 +0200 Subject: [PATCH 054/705] refactor trait StrDrive instead enum --- src/engine.rs | 2049 +++++++++++++++++++++++++------------------------ 1 file changed, 1055 insertions(+), 994 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 223aa3425c..8865eb6a39 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -9,8 +9,8 @@ const fn is_py_ascii_whitespace(b: u8) -> bool { } #[derive(Debug)] -pub struct State<'a> { - pub string: StrDrive<'a>, +pub struct State<'a, S: StrDrive> { + pub string: S, pub start: usize, pub end: usize, _flags: SreFlag, @@ -18,18 +18,25 @@ pub struct State<'a> { pub marks: Vec>, pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, - context_stack: Vec, - _stacks: Option>, + context_stack: Vec>, + // branch_stack: Vec, + // min_repeat_one_stack: Vec, + // repeat_one_stack: Vec, + // repeat_stack: Vec, + // min_until_stack: Vec, + // max_until_stack: Vec, + // _stacks: Option>, pub string_position: usize, - popped_context: Option, + popped_context: Option>, + next_context: Option>, pub has_matched: bool, pub match_all: bool, pub must_advance: bool, } -impl<'a> State<'a> { +impl<'a, S: StrDrive> State<'a, S> { pub fn new( - string: StrDrive<'a>, + string: S, start: usize, end: usize, flags: SreFlag, @@ -47,9 +54,16 @@ impl<'a> State<'a> { lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - _stacks: Default::default(), + // branch_stack: Vec::new(), + // min_repeat_one_stack: Vec::new(), + // repeat_one_stack: Vec::new(), + // repeat_stack: Vec::new(), + // min_until_stack: Vec::new(), + // max_until_stack: Vec::new(), + // _stacks: Default::default(), string_position: start, popped_context: None, + next_context: None, has_matched: 
false, match_all: false, must_advance: false, @@ -61,11 +75,15 @@ impl<'a> State<'a> { self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); - if let Some(x) = self._stacks.as_mut() { - x.clear() - }; + // self.branch_stack.clear(); + // self.min_repeat_one_stack.clear(); + // self.repeat_one_stack.clear(); + // self.repeat_stack.clear(); + // self.min_until_stack.clear(); + // self.max_until_stack.clear(); self.string_position = self.start; self.popped_context = None; + self.next_context = None; self.has_matched = false; } @@ -103,47 +121,46 @@ impl<'a> State<'a> { self.marks_stack.pop(); } - fn _match(mut self, stacks: &mut Stacks) -> Self { - while let Some(ctx) = self.context_stack.pop() { - let mut drive = StateContext { - state: self, - ctx, - next_ctx: None, - }; + fn _match(&mut self) { + while let Some(mut ctx) = self.context_stack.pop() { + // let mut drive = StateContext { + // state: self, + // ctx, + // next_ctx: None, + // }; + // let mut state = self; - if let Some(handler) = drive.ctx.handler { - handler(&mut drive, stacks); - } else if drive.remaining_codes() > 0 { - let code = drive.peek_code(0); + if let Some(handler) = ctx.handler { + handler(self, &mut ctx); + } else if ctx.remaining_codes(self) > 0 { + let code = ctx.peek_code(self, 0); let code = SreOpcode::try_from(code).unwrap(); - dispatch(code, &mut drive, stacks); + self.dispatch(code, &mut ctx); } else { - drive.failure(); + ctx.failure(); } - let StateContext { - mut state, - ctx, - next_ctx, - } = drive; + // let StateContext { + // mut state, + // ctx, + // next_ctx, + // } = drive; if ctx.has_matched.is_some() { - state.popped_context = Some(ctx); + self.popped_context = Some(ctx); } else { - state.context_stack.push(ctx); - if let Some(next_ctx) = next_ctx { - state.context_stack.push(next_ctx); + self.context_stack.push(ctx); + if let Some(next_ctx) = self.next_context.take() { + self.context_stack.push(next_ctx); } } - self = state + // self = state } - 
self.has_matched = self.popped_context.unwrap().has_matched == Some(true); - self + self.has_matched = self.popped_context.take().unwrap().has_matched == Some(true); + // self } pub fn pymatch(mut self) -> Self { - let mut stacks = self._stacks.take().unwrap_or_default(); - let ctx = MatchContext { string_position: self.start, string_offset: self.string.offset(0, self.start), @@ -155,13 +172,11 @@ impl<'a> State<'a> { }; self.context_stack.push(ctx); - self = self._match(&mut stacks); - self._stacks = Some(stacks); + self._match(); self } pub fn search(mut self) -> Self { - let mut stacks = self._stacks.take().unwrap_or_default(); // TODO: optimize by op info and skip prefix if self.start > self.end { @@ -180,14 +195,13 @@ impl<'a> State<'a> { repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut stacks); + self._match(); self.must_advance = false; while !self.has_matched && self.start < self.end { self.start += 1; start_offset = self.string.offset(start_offset, 1); self.reset(); - stacks.clear(); let ctx = MatchContext { string_position: self.start, @@ -199,697 +213,730 @@ impl<'a> State<'a> { repeat_ctx_id: usize::MAX, }; self.context_stack.push(ctx); - self = self._match(&mut stacks); + self._match(); } - self._stacks = Some(stacks); self } -} -fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { - match opcode { - SreOpcode::FAILURE => { - drive.failure(); - } - SreOpcode::SUCCESS => { - drive.ctx.has_matched = Some(drive.can_success()); - if drive.ctx.has_matched == Some(true) { - drive.state.string_position = drive.ctx.string_position; - } - } - SreOpcode::ANY => { - if drive.at_end() || drive.at_linebreak() { - drive.failure(); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - } - SreOpcode::ANY_ALL => { - if drive.at_end() { - drive.failure(); - } else { - drive.skip_code(1); - drive.skip_char(1); - } - } - SreOpcode::ASSERT => op_assert(drive), - SreOpcode::ASSERT_NOT => 
op_assert_not(drive), - SreOpcode::AT => { - let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); - if at(drive, atcode) { - drive.skip_code(2); - } else { - drive.failure(); - } - } - SreOpcode::BRANCH => op_branch(drive, stacks), - SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); - if drive.at_end() || !category(catcode, drive.peek_char()) { - drive.failure(); - } else { - drive.skip_code(2); - drive.skip_char(1); - } - } - SreOpcode::IN => general_op_in(drive, charset), - SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), - SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), - SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), - SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), - SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), - SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), - SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), - SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(drive, |code, c| code != lower_ascii(c)) - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code == lower_unicode(c)) - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(drive, |code, c| code != lower_unicode(c)) - } - SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) - } - SreOpcode::MARK => { - drive - .state - .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); - drive.skip_code(2); - } - SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), - SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), - SreOpcode::REPEAT => op_repeat(drive, stacks), - SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), - SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, 
stacks), - SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), - SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), - SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), - SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), - SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); - match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => { - drive.skip_code(3); - } - _ => drive.skip_code_from(2), + fn dispatch(&mut self, opcode: SreOpcode, ctx: &mut MatchContext<'a, S>) { + match opcode { + SreOpcode::FAILURE => { + ctx.has_matched = Some(false); } + SreOpcode::SUCCESS => todo!(), + SreOpcode::ANY => todo!(), + SreOpcode::ANY_ALL => todo!(), + SreOpcode::ASSERT => todo!(), + SreOpcode::ASSERT_NOT => todo!(), + SreOpcode::AT => todo!(), + SreOpcode::BRANCH => todo!(), + SreOpcode::CALL => todo!(), + SreOpcode::CATEGORY => todo!(), + SreOpcode::CHARSET => todo!(), + SreOpcode::BIGCHARSET => todo!(), + SreOpcode::GROUPREF => todo!(), + SreOpcode::GROUPREF_EXISTS => todo!(), + SreOpcode::IN => todo!(), + SreOpcode::INFO => todo!(), + SreOpcode::JUMP => todo!(), + SreOpcode::LITERAL => todo!(), + SreOpcode::MARK => todo!(), + SreOpcode::MAX_UNTIL => todo!(), + SreOpcode::MIN_UNTIL => todo!(), + SreOpcode::NOT_LITERAL => todo!(), + SreOpcode::NEGATE => todo!(), + SreOpcode::RANGE => todo!(), + SreOpcode::REPEAT => todo!(), + SreOpcode::REPEAT_ONE => todo!(), + SreOpcode::SUBPATTERN => todo!(), + SreOpcode::MIN_REPEAT_ONE => todo!(), + SreOpcode::GROUPREF_IGNORE => todo!(), + SreOpcode::IN_IGNORE => todo!(), + SreOpcode::LITERAL_IGNORE => todo!(), + SreOpcode::NOT_LITERAL_IGNORE => todo!(), + SreOpcode::GROUPREF_LOC_IGNORE => todo!(), + SreOpcode::IN_LOC_IGNORE => todo!(), + SreOpcode::LITERAL_LOC_IGNORE => todo!(), + SreOpcode::NOT_LITERAL_LOC_IGNORE => todo!(), + SreOpcode::GROUPREF_UNI_IGNORE => todo!(), + 
SreOpcode::IN_UNI_IGNORE => todo!(), + SreOpcode::LITERAL_UNI_IGNORE => todo!(), + SreOpcode::NOT_LITERAL_UNI_IGNORE => todo!(), + SreOpcode::RANGE_UNI_IGNORE => todo!(), } - _ => unreachable!("unexpected opcode"), - } -} - -/* assert subpattern */ -/* */ -fn op_assert(drive: &mut StateContext) { - let back = drive.peek_code(2) as usize; - - if drive.ctx.string_position < back { - return drive.failure(); - } - - let offset = drive - .state - .string - .back_offset(drive.ctx.string_offset, back); - let position = drive.ctx.string_position - back; - - drive.state.string_position = position; - - let next_ctx = drive.next_ctx(3, |drive, _| { - if drive.popped_ctx().has_matched == Some(true) { - drive.ctx.handler = None; - drive.skip_code_from(1); - } else { - drive.failure(); - } - }); - next_ctx.string_position = position; - next_ctx.string_offset = offset; - next_ctx.toplevel = false; -} - -/* assert not subpattern */ -/* */ -fn op_assert_not(drive: &mut StateContext) { - let back = drive.peek_code(2) as usize; - - if drive.ctx.string_position < back { - return drive.skip_code_from(1); - } - - let offset = drive - .state - .string - .back_offset(drive.ctx.string_offset, back); - let position = drive.ctx.string_position - back; - - drive.state.string_position = position; - - let next_ctx = drive.next_ctx(3, |drive, _| { - if drive.popped_ctx().has_matched == Some(true) { - drive.failure(); - } else { - drive.ctx.handler = None; - drive.skip_code_from(1); - } - }); - next_ctx.string_position = position; - next_ctx.string_offset = offset; - next_ctx.toplevel = false; -} - -#[derive(Debug)] -struct BranchContext { - branch_offset: usize, -} - -// alternation -// <0=skip> code ... 
-fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { - drive.state.marks_push(); - stacks.branch.push(BranchContext { branch_offset: 1 }); - create_context(drive, stacks); - - fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { - let branch_offset = stacks.branch_last().branch_offset; - let next_length = drive.peek_code(branch_offset) as usize; - if next_length == 0 { - drive.state.marks_pop_discard(); - stacks.branch.pop(); - return drive.failure(); - } - - drive.sync_string_position(); - - stacks.branch_last().branch_offset += next_length; - drive.next_ctx(branch_offset + 1, callback); - } - - fn callback(drive: &mut StateContext, stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - stacks.branch.pop(); - return drive.success(); - } - drive.state.marks_pop_keep(); - drive.ctx.handler = Some(create_context) - } -} - -#[derive(Debug, Copy, Clone)] -struct MinRepeatOneContext { - count: usize, - max_count: usize, -} - -/* <1=min> <2=max> item tail */ -fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { - let min_count = drive.peek_code(2) as usize; - let max_count = drive.peek_code(3) as usize; - - if drive.remaining_chars() < min_count { - return drive.failure(); - } - - drive.sync_string_position(); - - let count = if min_count == 0 { - 0 - } else { - let count = _count(drive, stacks, min_count); - if count < min_count { - return drive.failure(); - } - drive.skip_char(count); - count - }; - - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. 
we're finished - drive.sync_string_position(); - return drive.success(); - } - - drive.state.marks_push(); - stacks - .min_repeat_one - .push(MinRepeatOneContext { count, max_count }); - create_context(drive, stacks); - - fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { - let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); - - if max_count == MAXREPEAT || count <= max_count { - drive.sync_string_position(); - drive.next_ctx_from(1, callback); - } else { - drive.state.marks_pop_discard(); - stacks.min_repeat_one.pop(); - drive.failure(); - } - } - - fn callback(drive: &mut StateContext, stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - stacks.min_repeat_one.pop(); - return drive.success(); - } - - drive.sync_string_position(); - - if _count(drive, stacks, 1) == 0 { - drive.state.marks_pop_discard(); - stacks.min_repeat_one.pop(); - return drive.failure(); - } - - drive.skip_char(1); - stacks.min_repeat_one_last().count += 1; - drive.state.marks_pop_keep(); - create_context(drive, stacks); } } -#[derive(Debug, Copy, Clone)] -struct RepeatOneContext { - count: usize, - min_count: usize, - following_literal: Option, +// fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { +// match opcode { +// SreOpcode::FAILURE => { +// drive.failure(); +// } +// SreOpcode::SUCCESS => { +// drive.ctx.has_matched = Some(drive.can_success()); +// if drive.ctx.has_matched == Some(true) { +// drive.state.string_position = drive.ctx.string_position; +// } +// } +// SreOpcode::ANY => { +// if drive.at_end() || drive.at_linebreak() { +// drive.failure(); +// } else { +// drive.skip_code(1); +// drive.skip_char(1); +// } +// } +// SreOpcode::ANY_ALL => { +// if drive.at_end() { +// drive.failure(); +// } else { +// drive.skip_code(1); +// drive.skip_char(1); +// } +// } +// SreOpcode::ASSERT => op_assert(drive), +// SreOpcode::ASSERT_NOT => op_assert_not(drive), +// SreOpcode::AT => { +// let 
atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); +// if at(drive, atcode) { +// drive.skip_code(2); +// } else { +// drive.failure(); +// } +// } +// SreOpcode::BRANCH => op_branch(drive, stacks), +// SreOpcode::CATEGORY => { +// let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); +// if drive.at_end() || !category(catcode, drive.peek_char()) { +// drive.failure(); +// } else { +// drive.skip_code(2); +// drive.skip_char(1); +// } +// } +// SreOpcode::IN => general_op_in(drive, charset), +// SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), +// SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), +// SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), +// SreOpcode::INFO | SreOpcode::JUMP => drive.skip_code_from(1), +// SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), +// SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), +// SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), +// SreOpcode::NOT_LITERAL_IGNORE => { +// general_op_literal(drive, |code, c| code != lower_ascii(c)) +// } +// SreOpcode::LITERAL_UNI_IGNORE => { +// general_op_literal(drive, |code, c| code == lower_unicode(c)) +// } +// SreOpcode::NOT_LITERAL_UNI_IGNORE => { +// general_op_literal(drive, |code, c| code != lower_unicode(c)) +// } +// SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), +// SreOpcode::NOT_LITERAL_LOC_IGNORE => { +// general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) +// } +// SreOpcode::MARK => { +// drive +// .state +// .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); +// drive.skip_code(2); +// } +// SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), +// SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), +// SreOpcode::REPEAT => op_repeat(drive, stacks), +// SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), +// 
SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), +// SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), +// SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), +// SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), +// SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), +// SreOpcode::GROUPREF_EXISTS => { +// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); +// match (group_start, group_end) { +// (Some(start), Some(end)) if start <= end => { +// drive.skip_code(3); +// } +// _ => drive.skip_code_from(2), +// } +// } +// _ => unreachable!("unexpected opcode"), +// } +// } + +// /* assert subpattern */ +// /* */ +// fn op_assert(drive: &mut StateContext) { +// let back = drive.peek_code(2) as usize; + +// if drive.ctx.string_position < back { +// return drive.failure(); +// } + +// let offset = drive +// .state +// .string +// .back_offset(drive.ctx.string_offset, back); +// let position = drive.ctx.string_position - back; + +// drive.state.string_position = position; + +// let next_ctx = drive.next_ctx(3, |drive, _| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.ctx.handler = None; +// drive.skip_code_from(1); +// } else { +// drive.failure(); +// } +// }); +// next_ctx.string_position = position; +// next_ctx.string_offset = offset; +// next_ctx.toplevel = false; +// } + +// /* assert not subpattern */ +// /* */ +// fn op_assert_not(drive: &mut StateContext) { +// let back = drive.peek_code(2) as usize; + +// if drive.ctx.string_position < back { +// return drive.skip_code_from(1); +// } + +// let offset = drive +// .state +// .string +// .back_offset(drive.ctx.string_offset, back); +// let position = drive.ctx.string_position - back; + +// drive.state.string_position = position; + +// let next_ctx = drive.next_ctx(3, |drive, _| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.failure(); +// } else { +// 
drive.ctx.handler = None; +// drive.skip_code_from(1); +// } +// }); +// next_ctx.string_position = position; +// next_ctx.string_offset = offset; +// next_ctx.toplevel = false; +// } + +// #[derive(Debug)] +// struct BranchContext { +// branch_offset: usize, +// } + +// // alternation +// // <0=skip> code ... +// fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { +// drive.state.marks_push(); +// stacks.branch.push(BranchContext { branch_offset: 1 }); +// create_context(drive, stacks); + +// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { +// let branch_offset = stacks.branch_last().branch_offset; +// let next_length = drive.peek_code(branch_offset) as usize; +// if next_length == 0 { +// drive.state.marks_pop_discard(); +// stacks.branch.pop(); +// return drive.failure(); +// } + +// drive.sync_string_position(); + +// stacks.branch_last().branch_offset += next_length; +// drive.next_ctx(branch_offset + 1, callback); +// } + +// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// stacks.branch.pop(); +// return drive.success(); +// } +// drive.state.marks_pop_keep(); +// drive.ctx.handler = Some(create_context) +// } +// } + +// #[derive(Debug, Copy, Clone)] +// struct MinRepeatOneContext { +// count: usize, +// max_count: usize, +// } + +// /* <1=min> <2=max> item tail */ +// fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { +// let min_count = drive.peek_code(2) as usize; +// let max_count = drive.peek_code(3) as usize; + +// if drive.remaining_chars() < min_count { +// return drive.failure(); +// } + +// drive.sync_string_position(); + +// let count = if min_count == 0 { +// 0 +// } else { +// let count = _count(drive, stacks, min_count); +// if count < min_count { +// return drive.failure(); +// } +// drive.skip_char(count); +// count +// }; + +// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); +// if next_code == 
SreOpcode::SUCCESS as u32 && drive.can_success() { +// // tail is empty. we're finished +// drive.sync_string_position(); +// return drive.success(); +// } + +// drive.state.marks_push(); +// stacks +// .min_repeat_one +// .push(MinRepeatOneContext { count, max_count }); +// create_context(drive, stacks); + +// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { +// let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); + +// if max_count == MAXREPEAT || count <= max_count { +// drive.sync_string_position(); +// drive.next_ctx_from(1, callback); +// } else { +// drive.state.marks_pop_discard(); +// stacks.min_repeat_one.pop(); +// drive.failure(); +// } +// } + +// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// stacks.min_repeat_one.pop(); +// return drive.success(); +// } + +// drive.sync_string_position(); + +// if _count(drive, stacks, 1) == 0 { +// drive.state.marks_pop_discard(); +// stacks.min_repeat_one.pop(); +// return drive.failure(); +// } + +// drive.skip_char(1); +// stacks.min_repeat_one_last().count += 1; +// drive.state.marks_pop_keep(); +// create_context(drive, stacks); +// } +// } + +// #[derive(Debug, Copy, Clone)] +// struct RepeatOneContext { +// count: usize, +// min_count: usize, +// following_literal: Option, +// } + +// /* match repeated sequence (maximizing regexp) */ + +// /* this operator only works if the repeated item is +// exactly one character wide, and we're not already +// collecting backtracking points. 
for other cases, +// use the MAX_REPEAT operator */ + +// /* <1=min> <2=max> item tail */ +// fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { +// let min_count = drive.peek_code(2) as usize; +// let max_count = drive.peek_code(3) as usize; + +// if drive.remaining_chars() < min_count { +// return drive.failure(); +// } + +// drive.sync_string_position(); + +// let count = _count(drive, stacks, max_count); +// drive.skip_char(count); +// if count < min_count { +// return drive.failure(); +// } + +// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); +// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { +// // tail is empty. we're finished +// drive.sync_string_position(); +// return drive.success(); +// } + +// // Special case: Tail starts with a literal. Skip positions where +// // the rest of the pattern cannot possibly match. +// let following_literal = (next_code == SreOpcode::LITERAL as u32) +// .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); + +// drive.state.marks_push(); +// stacks.repeat_one.push(RepeatOneContext { +// count, +// min_count, +// following_literal, +// }); +// create_context(drive, stacks); + +// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { +// let RepeatOneContext { +// mut count, +// min_count, +// following_literal, +// } = *stacks.repeat_one_last(); + +// if let Some(c) = following_literal { +// while drive.at_end() || drive.peek_char() != c { +// if count <= min_count { +// drive.state.marks_pop_discard(); +// stacks.repeat_one.pop(); +// return drive.failure(); +// } +// drive.back_skip_char(1); +// count -= 1; +// } +// } +// stacks.repeat_one_last().count = count; + +// drive.sync_string_position(); + +// // General case: backtracking +// drive.next_ctx_from(1, callback); +// } + +// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// stacks.repeat_one.pop(); +// return drive.success(); 
+// } + +// let RepeatOneContext { +// count, +// min_count, +// following_literal: _, +// } = stacks.repeat_one_last(); + +// if count <= min_count { +// drive.state.marks_pop_discard(); +// stacks.repeat_one.pop(); +// return drive.failure(); +// } + +// drive.back_skip_char(1); +// *count -= 1; + +// drive.state.marks_pop_keep(); +// create_context(drive, stacks); +// } +// } + +// #[derive(Debug, Clone, Copy)] +// struct RepeatContext { +// count: isize, +// min_count: usize, +// max_count: usize, +// code_position: usize, +// last_position: usize, +// prev_id: usize, +// } + +// /* create repeat context. all the hard work is done +// by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ +// /* <1=min> <2=max> item tail */ +// fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { +// let repeat_ctx = RepeatContext { +// count: -1, +// min_count: drive.peek_code(2) as usize, +// max_count: drive.peek_code(3) as usize, +// code_position: drive.ctx.code_position, +// last_position: std::usize::MAX, +// prev_id: drive.ctx.repeat_ctx_id, +// }; + +// stacks.repeat.push(repeat_ctx); + +// drive.sync_string_position(); + +// let next_ctx = drive.next_ctx_from(1, |drive, stacks| { +// drive.ctx.has_matched = drive.popped_ctx().has_matched; +// stacks.repeat.pop(); +// }); +// next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; +// } + +// #[derive(Debug, Clone, Copy)] +// struct MinUntilContext { +// save_repeat_ctx_id: usize, +// } + +// /* minimizing repeat */ +// fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { +// let repeat_ctx = stacks.repeat.last_mut().unwrap(); + +// drive.sync_string_position(); + +// repeat_ctx.count += 1; + +// if (repeat_ctx.count as usize) < repeat_ctx.min_count { +// // not enough matches +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; +// 
drive.sync_string_position(); +// drive.failure(); +// } +// }); +// return; +// } + +// drive.state.marks_push(); + +// stacks.min_until.push(MinUntilContext { +// save_repeat_ctx_id: drive.ctx.repeat_ctx_id, +// }); + +// // see if the tail matches +// let next_ctx = drive.next_ctx(1, |drive, stacks| { +// drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; + +// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + +// if drive.popped_ctx().has_matched == Some(true) { +// return drive.success(); +// } + +// drive.sync_string_position(); + +// drive.state.marks_pop(); + +// // match more until tail matches + +// if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT +// || drive.state.string_position == repeat_ctx.last_position +// { +// repeat_ctx.count -= 1; +// return drive.failure(); +// } + +// /* zero-width match protection */ +// repeat_ctx.last_position = drive.state.string_position; + +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; +// drive.sync_string_position(); +// drive.failure(); +// } +// }); +// }); +// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; +// } + +// #[derive(Debug, Clone, Copy)] +// struct MaxUntilContext { +// save_last_position: usize, +// } + +// /* maximizing repeat */ +// fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { +// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; + +// drive.sync_string_position(); + +// repeat_ctx.count += 1; + +// if (repeat_ctx.count as usize) < repeat_ctx.min_count { +// // not enough matches +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; +// drive.sync_string_position(); +// 
drive.failure(); +// } +// }); +// return; +// } + +// stacks.max_until.push(MaxUntilContext { +// save_last_position: repeat_ctx.last_position, +// }); + +// if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) +// && drive.state.string_position != repeat_ctx.last_position +// { +// /* we may have enough matches, but if we can +// match another item, do so */ +// repeat_ctx.last_position = drive.state.string_position; + +// drive.state.marks_push(); + +// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { +// let save_last_position = stacks.max_until.pop().unwrap().save_last_position; +// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; +// repeat_ctx.last_position = save_last_position; + +// if drive.popped_ctx().has_matched == Some(true) { +// drive.state.marks_pop_discard(); +// return drive.success(); +// } + +// drive.state.marks_pop(); +// repeat_ctx.count -= 1; +// drive.sync_string_position(); + +// /* cannot match more repeated items here. make sure the +// tail matches */ +// let next_ctx = drive.next_ctx(1, tail_callback); +// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; +// }); +// return; +// } + +// /* cannot match more repeated items here. 
make sure the +// tail matches */ +// let next_ctx = drive.next_ctx(1, tail_callback); +// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + +// fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { +// if drive.popped_ctx().has_matched == Some(true) { +// drive.success(); +// } else { +// drive.sync_string_position(); +// drive.failure(); +// } +// } +// } + +// #[derive(Debug, Default)] +// struct Stacks { +// } + +// impl Stacks { +// fn clear(&mut self) { +// self.branch.clear(); +// self.min_repeat_one.clear(); +// self.repeat_one.clear(); +// self.repeat.clear(); +// self.min_until.clear(); +// self.max_until.clear(); +// } + +// fn branch_last(&mut self) -> &mut BranchContext { +// self.branch.last_mut().unwrap() +// } +// fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { +// self.min_repeat_one.last_mut().unwrap() +// } +// fn repeat_one_last(&mut self) -> &mut RepeatOneContext { +// self.repeat_one.last_mut().unwrap() +// } +// } + +pub trait StrDrive { + fn offset(&self, offset: usize, skip: usize) -> usize; + fn count(&self) -> usize; + fn peek(&self, offset: usize) -> u32; + fn back_peek(&self, offset: usize) -> u32; + fn back_offset(&self, offset: usize, skip: usize) -> usize; } -/* match repeated sequence (maximizing regexp) */ - -/* this operator only works if the repeated item is -exactly one character wide, and we're not already -collecting backtracking points. 
for other cases, -use the MAX_REPEAT operator */ - -/* <1=min> <2=max> item tail */ -fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { - let min_count = drive.peek_code(2) as usize; - let max_count = drive.peek_code(3) as usize; - - if drive.remaining_chars() < min_count { - return drive.failure(); - } - - drive.sync_string_position(); - let count = _count(drive, stacks, max_count); - drive.skip_char(count); - if count < min_count { - return drive.failure(); +impl<'a> StrDrive for &'a str { + fn offset(&self, offset: usize, skip: usize) -> usize { + self.get(offset..) + .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) + .unwrap_or(self.len()) } - let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { - // tail is empty. we're finished - drive.sync_string_position(); - return drive.success(); + fn count(&self) -> usize { + self.chars().count() } - // Special case: Tail starts with a literal. Skip positions where - // the rest of the pattern cannot possibly match. 
- let following_literal = (next_code == SreOpcode::LITERAL as u32) - .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); - - drive.state.marks_push(); - stacks.repeat_one.push(RepeatOneContext { - count, - min_count, - following_literal, - }); - create_context(drive, stacks); - - fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { - let RepeatOneContext { - mut count, - min_count, - following_literal, - } = *stacks.repeat_one_last(); - - if let Some(c) = following_literal { - while drive.at_end() || drive.peek_char() != c { - if count <= min_count { - drive.state.marks_pop_discard(); - stacks.repeat_one.pop(); - return drive.failure(); - } - drive.back_skip_char(1); - count -= 1; - } - } - stacks.repeat_one_last().count = count; - - drive.sync_string_position(); - - // General case: backtracking - drive.next_ctx_from(1, callback); + fn peek(&self, offset: usize) -> u32 { + unsafe { self.get_unchecked(offset..) } + .chars() + .next() + .unwrap() as u32 } - fn callback(drive: &mut StateContext, stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - stacks.repeat_one.pop(); - return drive.success(); - } - - let RepeatOneContext { - count, - min_count, - following_literal: _, - } = stacks.repeat_one_last(); - - if count <= min_count { - drive.state.marks_pop_discard(); - stacks.repeat_one.pop(); - return drive.failure(); + fn back_peek(&self, offset: usize) -> u32 { + let bytes = self.as_bytes(); + let back_offset = utf8_back_peek_offset(bytes, offset); + match offset - back_offset { + 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), + 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), + 3 => u32::from_be_bytes([0, bytes[offset - 3], bytes[offset - 2], bytes[offset - 1]]), + 4 => u32::from_be_bytes([ + bytes[offset - 4], + bytes[offset - 3], + bytes[offset - 2], + bytes[offset - 1], + ]), + _ => unreachable!(), } - - drive.back_skip_char(1); - *count -= 1; - - drive.state.marks_pop_keep(); - 
create_context(drive, stacks); } -} - -#[derive(Debug, Clone, Copy)] -struct RepeatContext { - count: isize, - min_count: usize, - max_count: usize, - code_position: usize, - last_position: usize, - prev_id: usize, -} - -/* create repeat context. all the hard work is done -by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ -/* <1=min> <2=max> item tail */ -fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { - let repeat_ctx = RepeatContext { - count: -1, - min_count: drive.peek_code(2) as usize, - max_count: drive.peek_code(3) as usize, - code_position: drive.ctx.code_position, - last_position: std::usize::MAX, - prev_id: drive.ctx.repeat_ctx_id, - }; - - stacks.repeat.push(repeat_ctx); - - drive.sync_string_position(); - - let next_ctx = drive.next_ctx_from(1, |drive, stacks| { - drive.ctx.has_matched = drive.popped_ctx().has_matched; - stacks.repeat.pop(); - }); - next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; -} - -#[derive(Debug, Clone, Copy)] -struct MinUntilContext { - save_repeat_ctx_id: usize, -} - -/* minimizing repeat */ -fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { - let repeat_ctx = stacks.repeat.last_mut().unwrap(); - - drive.sync_string_position(); - - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; - drive.sync_string_position(); - drive.failure(); - } - }); - return; - } - - drive.state.marks_push(); - - stacks.min_until.push(MinUntilContext { - save_repeat_ctx_id: drive.ctx.repeat_ctx_id, - }); - - // see if the tail matches - let next_ctx = drive.next_ctx(1, |drive, stacks| { - drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; - - let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - - if drive.popped_ctx().has_matched == 
Some(true) { - return drive.success(); - } - - drive.sync_string_position(); - - drive.state.marks_pop(); - - // match more until tail matches - - if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT - || drive.state.string_position == repeat_ctx.last_position - { - repeat_ctx.count -= 1; - return drive.failure(); - } - - /* zero-width match protection */ - repeat_ctx.last_position = drive.state.string_position; - - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; - drive.sync_string_position(); - drive.failure(); - } - }); - }); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; -} - -#[derive(Debug, Clone, Copy)] -struct MaxUntilContext { - save_last_position: usize, -} - -/* maximizing repeat */ -fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { - let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - - drive.sync_string_position(); - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; - drive.sync_string_position(); - drive.failure(); - } - }); - return; - } - - stacks.max_until.push(MaxUntilContext { - save_last_position: repeat_ctx.last_position, - }); - - if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) - && drive.state.string_position != repeat_ctx.last_position - { - /* we may have enough matches, but if we can - match another item, do so */ - repeat_ctx.last_position = drive.state.string_position; - - drive.state.marks_push(); - - drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { - let save_last_position = 
stacks.max_until.pop().unwrap().save_last_position; - let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - repeat_ctx.last_position = save_last_position; - - if drive.popped_ctx().has_matched == Some(true) { - drive.state.marks_pop_discard(); - return drive.success(); - } - - drive.state.marks_pop(); - repeat_ctx.count -= 1; - drive.sync_string_position(); - - /* cannot match more repeated items here. make sure the - tail matches */ - let next_ctx = drive.next_ctx(1, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - }); - return; - } - - /* cannot match more repeated items here. make sure the - tail matches */ - let next_ctx = drive.next_ctx(1, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - - fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { - if drive.popped_ctx().has_matched == Some(true) { - drive.success(); - } else { - drive.sync_string_position(); - drive.failure(); + fn back_offset(&self, offset: usize, skip: usize) -> usize { + let bytes = self.as_bytes(); + let mut back_offset = offset; + for _ in 0..skip { + back_offset = utf8_back_peek_offset(bytes, back_offset); } + back_offset } } -#[derive(Debug, Default)] -struct Stacks { - branch: Vec, - min_repeat_one: Vec, - repeat_one: Vec, - repeat: Vec, - min_until: Vec, - max_until: Vec, -} - -impl Stacks { - fn clear(&mut self) { - self.branch.clear(); - self.min_repeat_one.clear(); - self.repeat_one.clear(); - self.repeat.clear(); - self.min_until.clear(); - self.max_until.clear(); - } - - fn branch_last(&mut self) -> &mut BranchContext { - self.branch.last_mut().unwrap() - } - fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { - self.min_repeat_one.last_mut().unwrap() - } - fn repeat_one_last(&mut self) -> &mut RepeatOneContext { - self.repeat_one.last_mut().unwrap() - } -} - -#[derive(Debug, Clone, Copy)] -pub enum StrDrive<'a> { - Str(&'a str), - Bytes(&'a [u8]), -} - -impl<'a> From<&'a str> for StrDrive<'a> { - fn from(s: &'a str) 
-> Self { - Self::Str(s) - } -} -impl<'a> From<&'a [u8]> for StrDrive<'a> { - fn from(b: &'a [u8]) -> Self { - Self::Bytes(b) - } -} - -impl<'a> StrDrive<'a> { +impl<'a> StrDrive for &'a [u8] { fn offset(&self, offset: usize, skip: usize) -> usize { - match *self { - StrDrive::Str(s) => s - .get(offset..) - .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) - .unwrap_or(s.len()), - StrDrive::Bytes(_) => offset + skip, - } + offset + skip } - pub fn count(&self) -> usize { - match *self { - StrDrive::Str(s) => s.chars().count(), - StrDrive::Bytes(b) => b.len(), - } + fn count(&self) -> usize { + self.len() } fn peek(&self, offset: usize) -> u32 { - match *self { - StrDrive::Str(s) => unsafe { s.get_unchecked(offset..) }.chars().next().unwrap() as u32, - StrDrive::Bytes(b) => b[offset] as u32, - } + self[offset] as u32 } fn back_peek(&self, offset: usize) -> u32 { - match *self { - StrDrive::Str(s) => { - let bytes = s.as_bytes(); - let back_offset = utf8_back_peek_offset(bytes, offset); - match offset - back_offset { - 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), - 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), - 3 => u32::from_be_bytes([ - 0, - bytes[offset - 3], - bytes[offset - 2], - bytes[offset - 1], - ]), - 4 => u32::from_be_bytes([ - bytes[offset - 4], - bytes[offset - 3], - bytes[offset - 2], - bytes[offset - 1], - ]), - _ => unreachable!(), - } - } - StrDrive::Bytes(b) => b[offset - 1] as u32, - } + self[offset - 1] as u32 } fn back_offset(&self, offset: usize, skip: usize) -> usize { - match *self { - StrDrive::Str(s) => { - let bytes = s.as_bytes(); - let mut back_offset = offset; - for _ in 0..skip { - back_offset = utf8_back_peek_offset(bytes, back_offset); - } - back_offset - } - StrDrive::Bytes(_) => offset - skip, - } + offset - skip } } -type OpcodeHandler = fn(&mut StateContext, &mut Stacks); +// type OpcodeHandler = for<'a>fn(&mut StateContext<'a, S>, &mut Stacks); #[derive(Clone, Copy)] -struct 
MatchContext { +struct MatchContext<'a, S: StrDrive> { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, toplevel: bool, - handler: Option, + handler: Option, &mut Self)>, repeat_ctx_id: usize, } -impl std::fmt::Debug for MatchContext { +impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MatchContext") .field("string_position", &self.string_position) @@ -902,164 +949,178 @@ impl std::fmt::Debug for MatchContext { } } -trait ContextDrive { - fn ctx(&self) -> &MatchContext; - fn ctx_mut(&mut self) -> &mut MatchContext; - fn state(&self) -> &State; - - fn popped_ctx(&self) -> &MatchContext { - self.state().popped_context.as_ref().unwrap() +impl<'a, S: StrDrive> MatchContext<'a, S> { + fn remaining_codes(&self, state: &State<'a, S>) -> usize { + state.pattern_codes.len() - self.code_position } - - fn pattern(&self) -> &[u32] { - &self.state().pattern_codes[self.ctx().code_position..] 
- } - - fn peek_char(&self) -> u32 { - self.state().string.peek(self.ctx().string_offset) - } - fn peek_code(&self, peek: usize) -> u32 { - self.state().pattern_codes[self.ctx().code_position + peek] - } - - fn back_peek_char(&self) -> u32 { - self.state().string.back_peek(self.ctx().string_offset) - } - fn back_skip_char(&mut self, skip_count: usize) { - self.ctx_mut().string_position -= skip_count; - self.ctx_mut().string_offset = self - .state() - .string - .back_offset(self.ctx().string_offset, skip_count); - } - - fn skip_char(&mut self, skip_count: usize) { - self.ctx_mut().string_offset = self - .state() - .string - .offset(self.ctx().string_offset, skip_count); - self.ctx_mut().string_position += skip_count; - } - fn skip_code(&mut self, skip_count: usize) { - self.ctx_mut().code_position += skip_count; - } - fn skip_code_from(&mut self, peek: usize) { - self.skip_code(self.peek_code(peek) as usize + 1); - } - - fn remaining_chars(&self) -> usize { - self.state().end - self.ctx().string_position - } - fn remaining_codes(&self) -> usize { - self.state().pattern_codes.len() - self.ctx().code_position - } - - fn at_beginning(&self) -> bool { - // self.ctx().string_position == self.state().start - self.ctx().string_position == 0 - } - fn at_end(&self) -> bool { - self.ctx().string_position == self.state().end - } - fn at_linebreak(&self) -> bool { - !self.at_end() && is_linebreak(self.peek_char()) - } - fn at_boundary bool>(&self, mut word_checker: F) -> bool { - if self.at_beginning() && self.at_end() { - return false; - } - let that = !self.at_beginning() && word_checker(self.back_peek_char()); - let this = !self.at_end() && word_checker(self.peek_char()); - this != that - } - fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { - if self.at_beginning() && self.at_end() { - return false; - } - let that = !self.at_beginning() && word_checker(self.back_peek_char()); - let this = !self.at_end() && word_checker(self.peek_char()); - this == that - } - - 
fn can_success(&self) -> bool { - if !self.ctx().toplevel { - return true; - } - if self.state().match_all && !self.at_end() { - return false; - } - if self.state().must_advance && self.ctx().string_position == self.state().start { - return false; - } - true - } - - fn success(&mut self) { - self.ctx_mut().has_matched = Some(true); + + fn peek_code(&self, state: &State<'a, S>, peek: usize) -> u32 { + state.pattern_codes[self.code_position + peek] } fn failure(&mut self) { - self.ctx_mut().has_matched = Some(false); + self.has_matched = Some(false); } } -struct StateContext<'a> { - state: State<'a>, - ctx: MatchContext, - next_ctx: Option, -} - -impl ContextDrive for StateContext<'_> { - fn ctx(&self) -> &MatchContext { - &self.ctx - } - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.ctx - } - fn state(&self) -> &State { - &self.state - } -} - -impl StateContext<'_> { - fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { - self.next_ctx(self.peek_code(peek) as usize + 1, handler) - } - fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { - self.next_ctx_at(self.ctx.code_position + offset, handler) - } - fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { - self.next_ctx = Some(MatchContext { - code_position, - has_matched: None, - handler: None, - ..self.ctx - }); - self.ctx.handler = Some(handler); - self.next_ctx.as_mut().unwrap() - } - - fn sync_string_position(&mut self) { - self.state.string_position = self.ctx.string_position; - } -} - -struct StateRefContext<'a> { - entity: &'a StateContext<'a>, - ctx: MatchContext, -} - -impl ContextDrive for StateRefContext<'_> { - fn ctx(&self) -> &MatchContext { - &self.ctx - } - fn ctx_mut(&mut self) -> &mut MatchContext { - &mut self.ctx - } - fn state(&self) -> &State { - &self.entity.state - } -} +// trait ContextDrive<'a, T: StrDrive<'a>> { +// fn ctx(&self) -> &MatchContext; +// fn ctx_mut(&mut 
self) -> &mut MatchContext; +// fn state(&self) -> &State<'a, T>; + +// fn popped_ctx(&self) -> &MatchContext { +// self.state().popped_context.as_ref().unwrap() +// } + +// fn pattern(&self) -> &[u32] { +// &self.state().pattern_codes[self.ctx().code_position..] +// } + +// fn peek_char(&self) -> u32 { +// self.state().string.peek(self.ctx().string_offset) +// } +// fn peek_code(&self, peek: usize) -> u32 { +// self.state().pattern_codes[self.ctx().code_position + peek] +// } + +// fn back_peek_char(&self) -> u32 { +// self.state().string.back_peek(self.ctx().string_offset) +// } +// fn back_skip_char(&mut self, skip_count: usize) { +// self.ctx_mut().string_position -= skip_count; +// self.ctx_mut().string_offset = self +// .state() +// .string +// .back_offset(self.ctx().string_offset, skip_count); +// } + +// fn skip_char(&mut self, skip_count: usize) { +// self.ctx_mut().string_offset = self +// .state() +// .string +// .offset(self.ctx().string_offset, skip_count); +// self.ctx_mut().string_position += skip_count; +// } +// fn skip_code(&mut self, skip_count: usize) { +// self.ctx_mut().code_position += skip_count; +// } +// fn skip_code_from(&mut self, peek: usize) { +// self.skip_code(self.peek_code(peek) as usize + 1); +// } + +// fn remaining_chars(&self) -> usize { +// self.state().end - self.ctx().string_position +// } +// fn remaining_codes(&self) -> usize { +// self.state().pattern_codes.len() - self.ctx().code_position +// } + +// fn at_beginning(&self) -> bool { +// // self.ctx().string_position == self.state().start +// self.ctx().string_position == 0 +// } +// fn at_end(&self) -> bool { +// self.ctx().string_position == self.state().end +// } +// fn at_linebreak(&self) -> bool { +// !self.at_end() && is_linebreak(self.peek_char()) +// } +// fn at_boundary bool>(&self, mut word_checker: F) -> bool { +// if self.at_beginning() && self.at_end() { +// return false; +// } +// let that = !self.at_beginning() && word_checker(self.back_peek_char()); +// 
let this = !self.at_end() && word_checker(self.peek_char()); +// this != that +// } +// fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { +// if self.at_beginning() && self.at_end() { +// return false; +// } +// let that = !self.at_beginning() && word_checker(self.back_peek_char()); +// let this = !self.at_end() && word_checker(self.peek_char()); +// this == that +// } + +// fn can_success(&self) -> bool { +// if !self.ctx().toplevel { +// return true; +// } +// if self.state().match_all && !self.at_end() { +// return false; +// } +// if self.state().must_advance && self.ctx().string_position == self.state().start { +// return false; +// } +// true +// } + +// fn success(&mut self) { +// self.ctx_mut().has_matched = Some(true); +// } + +// fn failure(&mut self) { +// self.ctx_mut().has_matched = Some(false); +// } +// } + +// struct StateContext<'a, S: StrDrive<'a>> { +// state: State<'a, S>, +// ctx: MatchContext, +// next_ctx: Option, +// } + +// impl<'a, S: StrDrive<'a>> ContextDrive<'a, S> for StateContext<'a, S> { +// fn ctx(&self) -> &MatchContext { +// &self.ctx +// } +// fn ctx_mut(&mut self) -> &mut MatchContext { +// &mut self.ctx +// } +// fn state(&self) -> &State<'a, S> { +// &self.state +// } +// } + +// impl StateContext<'_> { +// fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut MatchContext { +// self.next_ctx(self.peek_code(peek) as usize + 1, handler) +// } +// fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { +// self.next_ctx_at(self.ctx.code_position + offset, handler) +// } +// fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { +// self.next_ctx = Some(MatchContext { +// code_position, +// has_matched: None, +// handler: None, +// ..self.ctx +// }); +// self.ctx.handler = Some(handler); +// self.next_ctx.as_mut().unwrap() +// } + +// fn sync_string_position(&mut self) { +// self.state.string_position = self.ctx.string_position; 
+// } +// } + +// struct StateRefContext<'a> { +// entity: &'a StateContext<'a>, +// ctx: MatchContext, +// } + +// impl ContextDrive for StateRefContext<'_> { +// fn ctx(&self) -> &MatchContext { +// &self.ctx +// } +// fn ctx_mut(&mut self) -> &mut MatchContext { +// &mut self.ctx +// } +// fn state(&self) -> &State { +// &self.entity.state +// } +// } fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) @@ -1074,77 +1135,77 @@ fn charset_loc_ignore(set: &[u32], c: u32) -> bool { up != lo && charset(set, up) } -fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { - let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); - let (group_start, group_end) = match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => (start, end), - _ => { - return drive.failure(); - } - }; - - let mut wdrive = StateRefContext { - entity: drive, - ctx: drive.ctx, - }; - let mut gdrive = StateRefContext { - entity: drive, - ctx: MatchContext { - string_position: group_start, - // TODO: cache the offset - string_offset: drive.state.string.offset(0, group_start), - ..drive.ctx - }, - }; - - for _ in group_start..group_end { - if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { - return drive.failure(); - } - wdrive.skip_char(1); - gdrive.skip_char(1); - } - - let position = wdrive.ctx.string_position; - let offset = wdrive.ctx.string_offset; - drive.skip_code(2); - drive.ctx.string_position = position; - drive.ctx.string_offset = offset; -} - -fn general_op_literal bool>(drive: &mut StateContext, f: F) { - if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { - drive.failure(); - } else { - drive.skip_code(2); - drive.skip_char(1); - } -} - -fn general_op_in bool>(drive: &mut StateContext, f: F) { - if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { - drive.failure(); - } else { - drive.skip_code_from(1); - drive.skip_char(1); 
- } -} - -fn at(drive: &StateContext, atcode: SreAtCode) -> bool { - match atcode { - SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), - SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), - SreAtCode::BOUNDARY => drive.at_boundary(is_word), - SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), - SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), - SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), - SreAtCode::END_STRING => drive.at_end(), - SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), - SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), - SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), - } -} +// fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { +// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); +// let (group_start, group_end) = match (group_start, group_end) { +// (Some(start), Some(end)) if start <= end => (start, end), +// _ => { +// return drive.failure(); +// } +// }; + +// let mut wdrive = StateRefContext { +// entity: drive, +// ctx: drive.ctx, +// }; +// let mut gdrive = StateRefContext { +// entity: drive, +// ctx: MatchContext { +// string_position: group_start, +// // TODO: cache the offset +// string_offset: drive.state.string.offset(0, group_start), +// ..drive.ctx +// }, +// }; + +// for _ in group_start..group_end { +// if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { +// return drive.failure(); +// } +// wdrive.skip_char(1); +// gdrive.skip_char(1); +// } + +// let position = wdrive.ctx.string_position; +// let offset = wdrive.ctx.string_offset; +// drive.skip_code(2); +// drive.ctx.string_position = position; +// drive.ctx.string_offset = offset; +// } + +// fn general_op_literal bool>(drive: &mut StateContext, f: F) { +// if drive.at_end() 
|| !f(drive.peek_code(1), drive.peek_char()) { +// drive.failure(); +// } else { +// drive.skip_code(2); +// drive.skip_char(1); +// } +// } + +// fn general_op_in bool>(drive: &mut StateContext, f: F) { +// if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { +// drive.failure(); +// } else { +// drive.skip_code_from(1); +// drive.skip_char(1); +// } +// } + +// fn at(drive: &StateContext, atcode: SreAtCode) -> bool { +// match atcode { +// SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), +// SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), +// SreAtCode::BOUNDARY => drive.at_boundary(is_word), +// SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), +// SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), +// SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), +// SreAtCode::END_STRING => drive.at_end(), +// SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), +// SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), +// SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), +// SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), +// } +// } fn category(catcode: SreCatCode, c: u32) -> bool { match catcode { @@ -1262,95 +1323,95 @@ fn charset(set: &[u32], ch: u32) -> bool { false } -/* General case */ -fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { - let mut count = 0; - let max_count = std::cmp::min(max_count, drive.remaining_chars()); - - let save_ctx = drive.ctx; - drive.skip_code(4); - let reset_position = drive.ctx.code_position; - - while count < max_count { - drive.ctx.code_position = reset_position; - let code = drive.peek_code(0); - let code = SreOpcode::try_from(code).unwrap(); - dispatch(code, drive, stacks); - if drive.ctx.has_matched == Some(false) { - break; - } - count += 1; - } - drive.ctx = save_ctx; - count -} - -fn 
_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { - let save_ctx = drive.ctx; - let max_count = std::cmp::min(max_count, drive.remaining_chars()); - let end = drive.ctx.string_position + max_count; - let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); - - match opcode { - SreOpcode::ANY => { - while !drive.ctx.string_position < end && !drive.at_linebreak() { - drive.skip_char(1); - } - } - SreOpcode::ANY_ALL => { - drive.skip_char(max_count); - } - SreOpcode::IN => { - while !drive.ctx.string_position < end - && charset(&drive.pattern()[2..], drive.peek_char()) - { - drive.skip_char(1); - } - } - SreOpcode::LITERAL => { - general_count_literal(drive, end, |code, c| code == c as u32); - } - SreOpcode::NOT_LITERAL => { - general_count_literal(drive, end, |code, c| code != c as u32); - } - SreOpcode::LITERAL_IGNORE => { - general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); - } - SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); - } - SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(drive, end, char_loc_ignore); - } - SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); - } - _ => { - return general_count(drive, stacks, max_count); - } - } - - let count = drive.ctx.string_position - drive.state.string_position; - drive.ctx = save_ctx; - count -} - -fn general_count_literal bool>( - drive: &mut StateContext, - end: usize, - mut f: F, -) { - let ch = drive.peek_code(1); - while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { - drive.skip_char(1); - } -} +// /* General case */ +// fn general_count(drive: &mut 
StateContext, stacks: &mut Stacks, max_count: usize) -> usize { +// let mut count = 0; +// let max_count = std::cmp::min(max_count, drive.remaining_chars()); + +// let save_ctx = drive.ctx; +// drive.skip_code(4); +// let reset_position = drive.ctx.code_position; + +// while count < max_count { +// drive.ctx.code_position = reset_position; +// let code = drive.peek_code(0); +// let code = SreOpcode::try_from(code).unwrap(); +// dispatch(code, drive, stacks); +// if drive.ctx.has_matched == Some(false) { +// break; +// } +// count += 1; +// } +// drive.ctx = save_ctx; +// count +// } + +// fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { +// let save_ctx = drive.ctx; +// let max_count = std::cmp::min(max_count, drive.remaining_chars()); +// let end = drive.ctx.string_position + max_count; +// let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); + +// match opcode { +// SreOpcode::ANY => { +// while !drive.ctx.string_position < end && !drive.at_linebreak() { +// drive.skip_char(1); +// } +// } +// SreOpcode::ANY_ALL => { +// drive.skip_char(max_count); +// } +// SreOpcode::IN => { +// while !drive.ctx.string_position < end +// && charset(&drive.pattern()[2..], drive.peek_char()) +// { +// drive.skip_char(1); +// } +// } +// SreOpcode::LITERAL => { +// general_count_literal(drive, end, |code, c| code == c as u32); +// } +// SreOpcode::NOT_LITERAL => { +// general_count_literal(drive, end, |code, c| code != c as u32); +// } +// SreOpcode::LITERAL_IGNORE => { +// general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); +// } +// SreOpcode::NOT_LITERAL_IGNORE => { +// general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); +// } +// SreOpcode::LITERAL_LOC_IGNORE => { +// general_count_literal(drive, end, char_loc_ignore); +// } +// SreOpcode::NOT_LITERAL_LOC_IGNORE => { +// general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); +// } +// SreOpcode::LITERAL_UNI_IGNORE => 
{ +// general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); +// } +// SreOpcode::NOT_LITERAL_UNI_IGNORE => { +// general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); +// } +// _ => { +// return general_count(drive, stacks, max_count); +// } +// } + +// let count = drive.ctx.string_position - drive.state.string_position; +// drive.ctx = save_ctx; +// count +// } + +// fn general_count_literal bool>( +// drive: &mut StateContext, +// end: usize, +// mut f: F, +// ) { +// let ch = drive.peek_code(1); +// while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { +// drive.skip_char(1); +// } +// } fn is_word(ch: u32) -> bool { ch == '_' as u32 From 34bde45a2c0906a3a3e03dca79a4426b4cf57655 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 1 Aug 2022 22:19:49 +0200 Subject: [PATCH 055/705] pass compile --- benches/benches.rs | 22 +- src/engine.rs | 1802 ++++++++++++++++++++------------------------ tests/tests.rs | 20 +- 3 files changed, 821 insertions(+), 1023 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index b86a592967..d000ceb62e 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -11,12 +11,12 @@ pub struct Pattern { } impl Pattern { - pub fn state<'a>( + pub fn state<'a, S: engine::StrDrive>( &self, - string: impl Into>, + string: S, range: std::ops::Range, - ) -> engine::State<'a> { - engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + ) -> engine::State<'a, S> { + engine::State::new(string, range.start, range.end, self.flags, self.code) } } #[bench] @@ -84,28 +84,28 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { let mut state = p.state(s.clone(), 0..usize::MAX); - state = state.search(); + state.search(); assert!(state.has_matched); state = p.state(s.clone(), 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); state = p.state(s.clone(), 0..usize::MAX); state.match_all = true; 
- state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); state = p.state(s2.as_str(), 0..usize::MAX); - state = state.search(); + state.search(); assert!(state.has_matched); state = p.state(s2.as_str(), 10000..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); state = p.state(s2.as_str(), 10000..10000 + s.len()); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); state = p.state(s2.as_str(), 10000..10000 + s.len()); state.match_all = true; - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); } }) diff --git a/src/engine.rs b/src/engine.rs index 8865eb6a39..b0717e1671 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -19,21 +19,45 @@ pub struct State<'a, S: StrDrive> { pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, context_stack: Vec>, - // branch_stack: Vec, - // min_repeat_one_stack: Vec, - // repeat_one_stack: Vec, - // repeat_stack: Vec, - // min_until_stack: Vec, - // max_until_stack: Vec, - // _stacks: Option>, + repeat_stack: Vec, pub string_position: usize, - popped_context: Option>, next_context: Option>, + popped_has_matched: bool, pub has_matched: bool, pub match_all: bool, pub must_advance: bool, } +macro_rules! next_ctx { + (offset $offset:expr, $state:expr, $ctx:expr, $handler:expr) => { + next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) + }; + (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { + next_ctx!(position $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) + }; + (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => { + {$state.next_context.insert(MatchContext { + code_position: $position, + has_matched: None, + handler: Some($handler), + ..*$ctx + })} + }; +} + +macro_rules! 
mark { + (push, $state:expr) => { + $state + .marks_stack + .push(($state.marks.clone(), $state.lastindex)) + }; + (pop, $state:expr) => { + let (marks, lastindex) = $state.marks_stack.pop().unwrap(); + $state.marks = marks; + $state.lastindex = lastindex; + }; +} + impl<'a, S: StrDrive> State<'a, S> { pub fn new( string: S, @@ -54,16 +78,10 @@ impl<'a, S: StrDrive> State<'a, S> { lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), - // branch_stack: Vec::new(), - // min_repeat_one_stack: Vec::new(), - // repeat_one_stack: Vec::new(), - // repeat_stack: Vec::new(), - // min_until_stack: Vec::new(), - // max_until_stack: Vec::new(), - // _stacks: Default::default(), + repeat_stack: Vec::new(), string_position: start, - popped_context: None, next_context: None, + popped_has_matched: false, has_matched: false, match_all: false, must_advance: false, @@ -75,15 +93,10 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); - // self.branch_stack.clear(); - // self.min_repeat_one_stack.clear(); - // self.repeat_one_stack.clear(); - // self.repeat_stack.clear(); - // self.min_until_stack.clear(); - // self.max_until_stack.clear(); + self.repeat_stack.clear(); self.string_position = self.start; - self.popped_context = None; self.next_context = None; + self.popped_has_matched = false; self.has_matched = false; } @@ -104,14 +117,14 @@ impl<'a, S: StrDrive> State<'a, S> { (None, None) } } - fn marks_push(&mut self) { - self.marks_stack.push((self.marks.clone(), self.lastindex)); - } - fn marks_pop(&mut self) { - let (marks, lastindex) = self.marks_stack.pop().unwrap(); - self.marks = marks; - self.lastindex = lastindex; - } + // fn marks_push(&mut self) { + // self.marks_stack.push((self.marks.clone(), self.lastindex)); + // } + // fn marks_pop(&mut self) { + // let (marks, lastindex) = self.marks_stack.pop().unwrap(); + // self.marks = marks; + // self.lastindex = lastindex; + // } fn marks_pop_keep(&mut 
self) { let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); self.marks = marks; @@ -121,46 +134,31 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks_stack.pop(); } - fn _match(&mut self) { + fn _match(&mut self) { while let Some(mut ctx) = self.context_stack.pop() { - // let mut drive = StateContext { - // state: self, - // ctx, - // next_ctx: None, - // }; - // let mut state = self; - - if let Some(handler) = ctx.handler { + if let Some(handler) = ctx.handler.take() { handler(self, &mut ctx); } else if ctx.remaining_codes(self) > 0 { let code = ctx.peek_code(self, 0); let code = SreOpcode::try_from(code).unwrap(); - self.dispatch(code, &mut ctx); + dispatch(self, &mut ctx, code); } else { ctx.failure(); } - // let StateContext { - // mut state, - // ctx, - // next_ctx, - // } = drive; - - if ctx.has_matched.is_some() { - self.popped_context = Some(ctx); + if let Some(has_matched) = ctx.has_matched { + self.popped_has_matched = has_matched; } else { self.context_stack.push(ctx); if let Some(next_ctx) = self.next_context.take() { self.context_stack.push(next_ctx); } } - // self = state } - self.has_matched = self.popped_context.take().unwrap().has_matched == Some(true); - // self + self.has_matched = self.popped_has_matched; } - pub fn pymatch(mut self) -> Self { + pub fn pymatch(&mut self) { let ctx = MatchContext { string_position: self.start, string_offset: self.string.offset(0, self.start), @@ -169,18 +167,18 @@ impl<'a, S: StrDrive> State<'a, S> { toplevel: true, handler: None, repeat_ctx_id: usize::MAX, + count: -1, }; self.context_stack.push(ctx); self._match(); - self } - pub fn search(mut self) -> Self { + pub fn search(&mut self) { // TODO: optimize by op info and skip prefix if self.start > self.end { - return self; + return; } let mut start_offset = self.string.offset(0, self.start); @@ -193,6 +191,7 @@ impl<'a, S: StrDrive> State<'a, S> { toplevel: true, handler: None, repeat_ctx_id: usize::MAX, + count: -1, }; 
self.context_stack.push(ctx); self._match(); @@ -211,643 +210,503 @@ impl<'a, S: StrDrive> State<'a, S> { toplevel: false, handler: None, repeat_ctx_id: usize::MAX, + count: -1, }; self.context_stack.push(ctx); self._match(); } + } +} + +fn dispatch<'a, S: StrDrive>( + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + opcode: SreOpcode, +) { + match opcode { + SreOpcode::FAILURE => { + ctx.failure(); + } + SreOpcode::SUCCESS => { + if ctx.can_success(state) { + state.string_position = ctx.string_position; + ctx.success(); + } else { + ctx.failure(); + } + } + SreOpcode::ANY => { + if ctx.at_end(state) || ctx.at_linebreak(state) { + ctx.failure(); + } else { + ctx.skip_code(1); + ctx.skip_char(state, 1); + } + } + SreOpcode::ANY_ALL => { + if ctx.at_end(state) { + ctx.failure(); + } else { + ctx.skip_code(1); + ctx.skip_char(state, 1); + } + } + /* assert subpattern */ + /* */ + SreOpcode::ASSERT => op_assert(state, ctx), + SreOpcode::ASSERT_NOT => op_assert_not(state, ctx), + SreOpcode::AT => { + let atcode = SreAtCode::try_from(ctx.peek_code(state, 1)).unwrap(); + if at(state, ctx, atcode) { + ctx.skip_code(2); + } else { + ctx.failure(); + } + } + SreOpcode::BRANCH => op_branch(state, ctx), + SreOpcode::CATEGORY => { + let catcode = SreCatCode::try_from(ctx.peek_code(state, 1)).unwrap(); + if ctx.at_end(state) || !category(catcode, ctx.peek_char(state)) { + ctx.failure(); + } else { + ctx.skip_code(2); + ctx.skip_char(state, 1); + } + } + SreOpcode::IN => general_op_in(state, ctx, charset), + SreOpcode::IN_IGNORE => general_op_in(state, ctx, |set, c| charset(set, lower_ascii(c))), + SreOpcode::IN_UNI_IGNORE => { + general_op_in(state, ctx, |set, c| charset(set, lower_unicode(c))) + } + SreOpcode::IN_LOC_IGNORE => general_op_in(state, ctx, charset_loc_ignore), + SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(state, 1), + SreOpcode::LITERAL => general_op_literal(state, ctx, |code, c| code == c), + SreOpcode::NOT_LITERAL => 
general_op_literal(state, ctx, |code, c| code != c), + SreOpcode::LITERAL_IGNORE => { + general_op_literal(state, ctx, |code, c| code == lower_ascii(c)) + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_op_literal(state, ctx, |code, c| code != lower_ascii(c)) + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_op_literal(state, ctx, |code, c| code == lower_unicode(c)) + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_op_literal(state, ctx, |code, c| code != lower_unicode(c)) + } + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(state, ctx, char_loc_ignore), + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_op_literal(state, ctx, |code, c| !char_loc_ignore(code, c)) + } + SreOpcode::MARK => { + state.set_mark(ctx.peek_code(state, 1) as usize, ctx.string_position); + ctx.skip_code(2); + } + SreOpcode::MAX_UNTIL => op_max_until(state, ctx), + SreOpcode::MIN_UNTIL => op_min_until(state, ctx), + SreOpcode::REPEAT => op_repeat(state, ctx), + SreOpcode::REPEAT_ONE => op_repeat_one(state, ctx), + SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(state, ctx), + SreOpcode::GROUPREF => general_op_groupref(state, ctx, |x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref(state, ctx, lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(state, ctx, lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(state, ctx, lower_unicode), + SreOpcode::GROUPREF_EXISTS => { + let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => { + ctx.skip_code(3); + } + _ => ctx.skip_code_from(state, 2), + } + } + _ => unreachable!("unexpected opcode"), + } +} - self +/* assert subpattern */ +/* */ +fn op_assert<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let back = ctx.peek_code(state, 2) as usize; + if ctx.string_position < back { + return ctx.failure(); } - fn dispatch(&mut self, opcode: SreOpcode, ctx: &mut 
MatchContext<'a, S>) { - match opcode { - SreOpcode::FAILURE => { - ctx.has_matched = Some(false); + // let next_ctx = state.next_ctx(ctx, 3, |state, ctx| { + let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.skip_code_from(state, 1); + } else { + ctx.failure(); + } + }); + next_ctx.back_skip_char(&state.string, back); + state.string_position = next_ctx.string_position; + next_ctx.toplevel = false; +} + +/* assert not subpattern */ +/* */ +fn op_assert_not<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let back = ctx.peek_code(state, 2) as usize; + + if ctx.string_position < back { + return ctx.skip_code_from(state, 1); + } + + let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.failure(); + } else { + ctx.skip_code_from(state, 1); + } + }); + next_ctx.back_skip_char(&state.string, back); + state.string_position = next_ctx.string_position; + next_ctx.toplevel = false; +} + +// alternation +// <0=skip> code ... 
+fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + // state.marks_push(); + mark!(push, state); + + ctx.count = 1; + create_context(state, ctx); + + fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let branch_offset = ctx.count as usize; + let next_length = ctx.peek_code(state, branch_offset) as isize; + if next_length == 0 { + state.marks_pop_discard(); + return ctx.failure(); + } + + state.string_position = ctx.string_position; + + ctx.count += next_length; + next_ctx!(offset branch_offset + 1, state, ctx, callback); + } + + fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + return ctx.success(); + } + state.marks_pop_keep(); + create_context(state, ctx); + } +} + +/* <1=min> <2=max> item tail */ +fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let min_count = ctx.peek_code(state, 2) as usize; + // let max_count = ctx.peek_code(state, 3) as usize; + + if ctx.remaining_chars(state) < min_count { + return ctx.failure(); + } + + state.string_position = ctx.string_position; + + ctx.count = if min_count == 0 { + 0 + } else { + let count = _count(state, ctx, min_count); + if count < min_count { + return ctx.failure(); + } + ctx.skip_char(state, count); + count as isize + }; + + let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + // tail is empty. 
we're finished + state.string_position = ctx.string_position; + return ctx.success(); + } + + mark!(push, state); + create_context(state, ctx); + + fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let max_count = ctx.peek_code(state, 3) as usize; + + if max_count == MAXREPEAT || ctx.count as usize <= max_count { + state.string_position = ctx.string_position; + next_ctx!(from 1, state, ctx, callback); + } else { + state.marks_pop_discard(); + ctx.failure(); + } + } + + fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + return ctx.success(); + } + + state.string_position = ctx.string_position; + + if _count(state, ctx, 1) == 0 { + state.marks_pop_discard(); + return ctx.failure(); + } + + ctx.skip_char(state, 1); + ctx.count += 1; + state.marks_pop_keep(); + create_context(state, ctx); + } +} + +/* match repeated sequence (maximizing regexp) */ +/* this operator only works if the repeated item is +exactly one character wide, and we're not already +collecting backtracking points. for other cases, +use the MAX_REPEAT operator */ +/* <1=min> <2=max> item tail */ +fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let min_count = ctx.peek_code(state, 2) as usize; + let max_count = ctx.peek_code(state, 3) as usize; + + if ctx.remaining_chars(state) < min_count { + return ctx.failure(); + } + + state.string_position = ctx.string_position; + + let count = _count(state, ctx, max_count); + ctx.skip_char(state, count); + if count < min_count { + return ctx.failure(); + } + + let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + // tail is empty. 
we're finished + state.string_position = ctx.string_position; + return ctx.success(); + } + + mark!(push, state); + ctx.count = count as isize; + create_context(state, ctx); + + fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let min_count = ctx.peek_code(state, 2) as isize; + let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + if next_code == SreOpcode::LITERAL as u32 { + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. + let c = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 2); + while ctx.at_end(state) || ctx.peek_char(state) != c { + if ctx.count <= min_count { + state.marks_pop_discard(); + return ctx.failure(); + } + ctx.back_skip_char(&state.string, 1); + ctx.count -= 1; + } + } + + state.string_position = ctx.string_position; + + // General case: backtracking + next_ctx!(from 1, state, ctx, callback); + } + + fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + return ctx.success(); + } + + let min_count = ctx.peek_code(state, 2) as isize; + + if ctx.count <= min_count { + state.marks_pop_discard(); + return ctx.failure(); + } + + ctx.back_skip_char(&state.string, 1); + ctx.count -= 1; + + state.marks_pop_keep(); + create_context(state, ctx); + } +} + +#[derive(Debug, Clone, Copy)] +struct RepeatContext { + count: isize, + min_count: usize, + max_count: usize, + code_position: usize, + last_position: usize, + prev_id: usize, +} + +/* create repeat context. 
all the hard work is done +by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ +/* <1=min> <2=max> item tail */ +fn op_repeat<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let repeat_ctx = RepeatContext { + count: -1, + min_count: ctx.peek_code(state, 2) as usize, + max_count: ctx.peek_code(state, 3) as usize, + code_position: ctx.code_position, + last_position: std::usize::MAX, + prev_id: ctx.repeat_ctx_id, + }; + + state.repeat_stack.push(repeat_ctx); + + state.string_position = ctx.string_position; + + let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { + ctx.has_matched = Some(state.popped_has_matched); + state.repeat_stack.pop(); + }); + next_ctx.repeat_ctx_id = state.repeat_stack.len() - 1; +} + +/* minimizing repeat */ +fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let repeat_ctx = state.repeat_stack.last_mut().unwrap(); + + state.string_position = ctx.string_position; + + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.success(); + } else { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + ctx.failure(); + } + }); + return; + } + + mark!(push, state); + + ctx.count = ctx.repeat_ctx_id as isize; + + // see if the tail matches + let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { + if state.popped_has_matched { + return ctx.success(); + } + + ctx.repeat_ctx_id = ctx.count as usize; + + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + + state.string_position = ctx.string_position; + + mark!(pop, state); + + // match more until tail matches + + if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT + || state.string_position == repeat_ctx.last_position + { + repeat_ctx.count -= 1; + return 
ctx.failure(); + } + + /* zero-width match protection */ + repeat_ctx.last_position = state.string_position; + + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.success(); + } else { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + ctx.failure(); + } + }); + }); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; +} + +/* maximizing repeat */ +fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + + state.string_position = ctx.string_position; + + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + if state.popped_has_matched { + ctx.success(); + } else { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + ctx.failure(); + } + }); + return; + } + + if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) + && state.string_position != repeat_ctx.last_position + { + /* we may have enough matches, but if we can + match another item, do so */ + mark!(push, state); + + ctx.count = repeat_ctx.last_position as isize; + repeat_ctx.last_position = state.string_position; + + next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + let save_last_position = ctx.count as usize; + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + repeat_ctx.last_position = save_last_position; + + if state.popped_has_matched { + state.marks_pop_discard(); + return ctx.success(); } - SreOpcode::SUCCESS => todo!(), - SreOpcode::ANY => todo!(), - SreOpcode::ANY_ALL => todo!(), - SreOpcode::ASSERT => todo!(), - SreOpcode::ASSERT_NOT => todo!(), - SreOpcode::AT => todo!(), - SreOpcode::BRANCH => todo!(), - SreOpcode::CALL => todo!(), - 
SreOpcode::CATEGORY => todo!(), - SreOpcode::CHARSET => todo!(), - SreOpcode::BIGCHARSET => todo!(), - SreOpcode::GROUPREF => todo!(), - SreOpcode::GROUPREF_EXISTS => todo!(), - SreOpcode::IN => todo!(), - SreOpcode::INFO => todo!(), - SreOpcode::JUMP => todo!(), - SreOpcode::LITERAL => todo!(), - SreOpcode::MARK => todo!(), - SreOpcode::MAX_UNTIL => todo!(), - SreOpcode::MIN_UNTIL => todo!(), - SreOpcode::NOT_LITERAL => todo!(), - SreOpcode::NEGATE => todo!(), - SreOpcode::RANGE => todo!(), - SreOpcode::REPEAT => todo!(), - SreOpcode::REPEAT_ONE => todo!(), - SreOpcode::SUBPATTERN => todo!(), - SreOpcode::MIN_REPEAT_ONE => todo!(), - SreOpcode::GROUPREF_IGNORE => todo!(), - SreOpcode::IN_IGNORE => todo!(), - SreOpcode::LITERAL_IGNORE => todo!(), - SreOpcode::NOT_LITERAL_IGNORE => todo!(), - SreOpcode::GROUPREF_LOC_IGNORE => todo!(), - SreOpcode::IN_LOC_IGNORE => todo!(), - SreOpcode::LITERAL_LOC_IGNORE => todo!(), - SreOpcode::NOT_LITERAL_LOC_IGNORE => todo!(), - SreOpcode::GROUPREF_UNI_IGNORE => todo!(), - SreOpcode::IN_UNI_IGNORE => todo!(), - SreOpcode::LITERAL_UNI_IGNORE => todo!(), - SreOpcode::NOT_LITERAL_UNI_IGNORE => todo!(), - SreOpcode::RANGE_UNI_IGNORE => todo!(), + + mark!(pop, state); + repeat_ctx.count -= 1; + + state.string_position = ctx.string_position; + + /* cannot match more repeated items here. make sure the + tail matches */ + let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + }); + return; + } + + /* cannot match more repeated items here. 
make sure the + tail matches */ + let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + + fn tail_callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + if state.popped_has_matched { + ctx.success(); + } else { + state.string_position = ctx.string_position; + ctx.failure(); } } } -// fn dispatch(opcode: SreOpcode, drive: &mut StateContext, stacks: &mut Stacks) { -// match opcode { -// SreOpcode::FAILURE => { -// drive.failure(); -// } -// SreOpcode::SUCCESS => { -// drive.ctx.has_matched = Some(drive.can_success()); -// if drive.ctx.has_matched == Some(true) { -// drive.state.string_position = drive.ctx.string_position; -// } -// } -// SreOpcode::ANY => { -// if drive.at_end() || drive.at_linebreak() { -// drive.failure(); -// } else { -// drive.skip_code(1); -// drive.skip_char(1); -// } -// } -// SreOpcode::ANY_ALL => { -// if drive.at_end() { -// drive.failure(); -// } else { -// drive.skip_code(1); -// drive.skip_char(1); -// } -// } -// SreOpcode::ASSERT => op_assert(drive), -// SreOpcode::ASSERT_NOT => op_assert_not(drive), -// SreOpcode::AT => { -// let atcode = SreAtCode::try_from(drive.peek_code(1)).unwrap(); -// if at(drive, atcode) { -// drive.skip_code(2); -// } else { -// drive.failure(); -// } -// } -// SreOpcode::BRANCH => op_branch(drive, stacks), -// SreOpcode::CATEGORY => { -// let catcode = SreCatCode::try_from(drive.peek_code(1)).unwrap(); -// if drive.at_end() || !category(catcode, drive.peek_char()) { -// drive.failure(); -// } else { -// drive.skip_code(2); -// drive.skip_char(1); -// } -// } -// SreOpcode::IN => general_op_in(drive, charset), -// SreOpcode::IN_IGNORE => general_op_in(drive, |set, c| charset(set, lower_ascii(c))), -// SreOpcode::IN_UNI_IGNORE => general_op_in(drive, |set, c| charset(set, lower_unicode(c))), -// SreOpcode::IN_LOC_IGNORE => general_op_in(drive, charset_loc_ignore), -// SreOpcode::INFO | SreOpcode::JUMP => 
drive.skip_code_from(1), -// SreOpcode::LITERAL => general_op_literal(drive, |code, c| code == c), -// SreOpcode::NOT_LITERAL => general_op_literal(drive, |code, c| code != c), -// SreOpcode::LITERAL_IGNORE => general_op_literal(drive, |code, c| code == lower_ascii(c)), -// SreOpcode::NOT_LITERAL_IGNORE => { -// general_op_literal(drive, |code, c| code != lower_ascii(c)) -// } -// SreOpcode::LITERAL_UNI_IGNORE => { -// general_op_literal(drive, |code, c| code == lower_unicode(c)) -// } -// SreOpcode::NOT_LITERAL_UNI_IGNORE => { -// general_op_literal(drive, |code, c| code != lower_unicode(c)) -// } -// SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(drive, char_loc_ignore), -// SreOpcode::NOT_LITERAL_LOC_IGNORE => { -// general_op_literal(drive, |code, c| !char_loc_ignore(code, c)) -// } -// SreOpcode::MARK => { -// drive -// .state -// .set_mark(drive.peek_code(1) as usize, drive.ctx.string_position); -// drive.skip_code(2); -// } -// SreOpcode::MAX_UNTIL => op_max_until(drive, stacks), -// SreOpcode::MIN_UNTIL => op_min_until(drive, stacks), -// SreOpcode::REPEAT => op_repeat(drive, stacks), -// SreOpcode::REPEAT_ONE => op_repeat_one(drive, stacks), -// SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(drive, stacks), -// SreOpcode::GROUPREF => general_op_groupref(drive, |x| x), -// SreOpcode::GROUPREF_IGNORE => general_op_groupref(drive, lower_ascii), -// SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(drive, lower_locate), -// SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(drive, lower_unicode), -// SreOpcode::GROUPREF_EXISTS => { -// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); -// match (group_start, group_end) { -// (Some(start), Some(end)) if start <= end => { -// drive.skip_code(3); -// } -// _ => drive.skip_code_from(2), -// } -// } -// _ => unreachable!("unexpected opcode"), -// } -// } - -// /* assert subpattern */ -// /* */ -// fn op_assert(drive: &mut StateContext) { -// let back = drive.peek_code(2) 
as usize; - -// if drive.ctx.string_position < back { -// return drive.failure(); -// } - -// let offset = drive -// .state -// .string -// .back_offset(drive.ctx.string_offset, back); -// let position = drive.ctx.string_position - back; - -// drive.state.string_position = position; - -// let next_ctx = drive.next_ctx(3, |drive, _| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.ctx.handler = None; -// drive.skip_code_from(1); -// } else { -// drive.failure(); -// } -// }); -// next_ctx.string_position = position; -// next_ctx.string_offset = offset; -// next_ctx.toplevel = false; -// } - -// /* assert not subpattern */ -// /* */ -// fn op_assert_not(drive: &mut StateContext) { -// let back = drive.peek_code(2) as usize; - -// if drive.ctx.string_position < back { -// return drive.skip_code_from(1); -// } - -// let offset = drive -// .state -// .string -// .back_offset(drive.ctx.string_offset, back); -// let position = drive.ctx.string_position - back; - -// drive.state.string_position = position; - -// let next_ctx = drive.next_ctx(3, |drive, _| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.failure(); -// } else { -// drive.ctx.handler = None; -// drive.skip_code_from(1); -// } -// }); -// next_ctx.string_position = position; -// next_ctx.string_offset = offset; -// next_ctx.toplevel = false; -// } - -// #[derive(Debug)] -// struct BranchContext { -// branch_offset: usize, -// } - -// // alternation -// // <0=skip> code ... 
-// fn op_branch(drive: &mut StateContext, stacks: &mut Stacks) { -// drive.state.marks_push(); -// stacks.branch.push(BranchContext { branch_offset: 1 }); -// create_context(drive, stacks); - -// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { -// let branch_offset = stacks.branch_last().branch_offset; -// let next_length = drive.peek_code(branch_offset) as usize; -// if next_length == 0 { -// drive.state.marks_pop_discard(); -// stacks.branch.pop(); -// return drive.failure(); -// } - -// drive.sync_string_position(); - -// stacks.branch_last().branch_offset += next_length; -// drive.next_ctx(branch_offset + 1, callback); -// } - -// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// stacks.branch.pop(); -// return drive.success(); -// } -// drive.state.marks_pop_keep(); -// drive.ctx.handler = Some(create_context) -// } -// } - -// #[derive(Debug, Copy, Clone)] -// struct MinRepeatOneContext { -// count: usize, -// max_count: usize, -// } - -// /* <1=min> <2=max> item tail */ -// fn op_min_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { -// let min_count = drive.peek_code(2) as usize; -// let max_count = drive.peek_code(3) as usize; - -// if drive.remaining_chars() < min_count { -// return drive.failure(); -// } - -// drive.sync_string_position(); - -// let count = if min_count == 0 { -// 0 -// } else { -// let count = _count(drive, stacks, min_count); -// if count < min_count { -// return drive.failure(); -// } -// drive.skip_char(count); -// count -// }; - -// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); -// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { -// // tail is empty. 
we're finished -// drive.sync_string_position(); -// return drive.success(); -// } - -// drive.state.marks_push(); -// stacks -// .min_repeat_one -// .push(MinRepeatOneContext { count, max_count }); -// create_context(drive, stacks); - -// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { -// let MinRepeatOneContext { count, max_count } = *stacks.min_repeat_one_last(); - -// if max_count == MAXREPEAT || count <= max_count { -// drive.sync_string_position(); -// drive.next_ctx_from(1, callback); -// } else { -// drive.state.marks_pop_discard(); -// stacks.min_repeat_one.pop(); -// drive.failure(); -// } -// } - -// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// stacks.min_repeat_one.pop(); -// return drive.success(); -// } - -// drive.sync_string_position(); - -// if _count(drive, stacks, 1) == 0 { -// drive.state.marks_pop_discard(); -// stacks.min_repeat_one.pop(); -// return drive.failure(); -// } - -// drive.skip_char(1); -// stacks.min_repeat_one_last().count += 1; -// drive.state.marks_pop_keep(); -// create_context(drive, stacks); -// } -// } - -// #[derive(Debug, Copy, Clone)] -// struct RepeatOneContext { -// count: usize, -// min_count: usize, -// following_literal: Option, -// } - -// /* match repeated sequence (maximizing regexp) */ - -// /* this operator only works if the repeated item is -// exactly one character wide, and we're not already -// collecting backtracking points. 
for other cases, -// use the MAX_REPEAT operator */ - -// /* <1=min> <2=max> item tail */ -// fn op_repeat_one(drive: &mut StateContext, stacks: &mut Stacks) { -// let min_count = drive.peek_code(2) as usize; -// let max_count = drive.peek_code(3) as usize; - -// if drive.remaining_chars() < min_count { -// return drive.failure(); -// } - -// drive.sync_string_position(); - -// let count = _count(drive, stacks, max_count); -// drive.skip_char(count); -// if count < min_count { -// return drive.failure(); -// } - -// let next_code = drive.peek_code(drive.peek_code(1) as usize + 1); -// if next_code == SreOpcode::SUCCESS as u32 && drive.can_success() { -// // tail is empty. we're finished -// drive.sync_string_position(); -// return drive.success(); -// } - -// // Special case: Tail starts with a literal. Skip positions where -// // the rest of the pattern cannot possibly match. -// let following_literal = (next_code == SreOpcode::LITERAL as u32) -// .then(|| drive.peek_code(drive.peek_code(1) as usize + 2)); - -// drive.state.marks_push(); -// stacks.repeat_one.push(RepeatOneContext { -// count, -// min_count, -// following_literal, -// }); -// create_context(drive, stacks); - -// fn create_context(drive: &mut StateContext, stacks: &mut Stacks) { -// let RepeatOneContext { -// mut count, -// min_count, -// following_literal, -// } = *stacks.repeat_one_last(); - -// if let Some(c) = following_literal { -// while drive.at_end() || drive.peek_char() != c { -// if count <= min_count { -// drive.state.marks_pop_discard(); -// stacks.repeat_one.pop(); -// return drive.failure(); -// } -// drive.back_skip_char(1); -// count -= 1; -// } -// } -// stacks.repeat_one_last().count = count; - -// drive.sync_string_position(); - -// // General case: backtracking -// drive.next_ctx_from(1, callback); -// } - -// fn callback(drive: &mut StateContext, stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// stacks.repeat_one.pop(); -// return drive.success(); 
-// } - -// let RepeatOneContext { -// count, -// min_count, -// following_literal: _, -// } = stacks.repeat_one_last(); - -// if count <= min_count { -// drive.state.marks_pop_discard(); -// stacks.repeat_one.pop(); -// return drive.failure(); -// } - -// drive.back_skip_char(1); -// *count -= 1; - -// drive.state.marks_pop_keep(); -// create_context(drive, stacks); -// } -// } - -// #[derive(Debug, Clone, Copy)] -// struct RepeatContext { -// count: isize, -// min_count: usize, -// max_count: usize, -// code_position: usize, -// last_position: usize, -// prev_id: usize, -// } - -// /* create repeat context. all the hard work is done -// by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ -// /* <1=min> <2=max> item tail */ -// fn op_repeat(drive: &mut StateContext, stacks: &mut Stacks) { -// let repeat_ctx = RepeatContext { -// count: -1, -// min_count: drive.peek_code(2) as usize, -// max_count: drive.peek_code(3) as usize, -// code_position: drive.ctx.code_position, -// last_position: std::usize::MAX, -// prev_id: drive.ctx.repeat_ctx_id, -// }; - -// stacks.repeat.push(repeat_ctx); - -// drive.sync_string_position(); - -// let next_ctx = drive.next_ctx_from(1, |drive, stacks| { -// drive.ctx.has_matched = drive.popped_ctx().has_matched; -// stacks.repeat.pop(); -// }); -// next_ctx.repeat_ctx_id = stacks.repeat.len() - 1; -// } - -// #[derive(Debug, Clone, Copy)] -// struct MinUntilContext { -// save_repeat_ctx_id: usize, -// } - -// /* minimizing repeat */ -// fn op_min_until(drive: &mut StateContext, stacks: &mut Stacks) { -// let repeat_ctx = stacks.repeat.last_mut().unwrap(); - -// drive.sync_string_position(); - -// repeat_ctx.count += 1; - -// if (repeat_ctx.count as usize) < repeat_ctx.min_count { -// // not enough matches -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; -// 
drive.sync_string_position(); -// drive.failure(); -// } -// }); -// return; -// } - -// drive.state.marks_push(); - -// stacks.min_until.push(MinUntilContext { -// save_repeat_ctx_id: drive.ctx.repeat_ctx_id, -// }); - -// // see if the tail matches -// let next_ctx = drive.next_ctx(1, |drive, stacks| { -// drive.ctx.repeat_ctx_id = stacks.min_until.pop().unwrap().save_repeat_ctx_id; - -// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - -// if drive.popped_ctx().has_matched == Some(true) { -// return drive.success(); -// } - -// drive.sync_string_position(); - -// drive.state.marks_pop(); - -// // match more until tail matches - -// if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT -// || drive.state.string_position == repeat_ctx.last_position -// { -// repeat_ctx.count -= 1; -// return drive.failure(); -// } - -// /* zero-width match protection */ -// repeat_ctx.last_position = drive.state.string_position; - -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; -// drive.sync_string_position(); -// drive.failure(); -// } -// }); -// }); -// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; -// } - -// #[derive(Debug, Clone, Copy)] -// struct MaxUntilContext { -// save_last_position: usize, -// } - -// /* maximizing repeat */ -// fn op_max_until(drive: &mut StateContext, stacks: &mut Stacks) { -// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; - -// drive.sync_string_position(); - -// repeat_ctx.count += 1; - -// if (repeat_ctx.count as usize) < repeat_ctx.min_count { -// // not enough matches -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// stacks.repeat[drive.ctx.repeat_ctx_id].count -= 1; -// drive.sync_string_position(); -// 
drive.failure(); -// } -// }); -// return; -// } - -// stacks.max_until.push(MaxUntilContext { -// save_last_position: repeat_ctx.last_position, -// }); - -// if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) -// && drive.state.string_position != repeat_ctx.last_position -// { -// /* we may have enough matches, but if we can -// match another item, do so */ -// repeat_ctx.last_position = drive.state.string_position; - -// drive.state.marks_push(); - -// drive.next_ctx_at(repeat_ctx.code_position + 4, |drive, stacks| { -// let save_last_position = stacks.max_until.pop().unwrap().save_last_position; -// let repeat_ctx = &mut stacks.repeat[drive.ctx.repeat_ctx_id]; -// repeat_ctx.last_position = save_last_position; - -// if drive.popped_ctx().has_matched == Some(true) { -// drive.state.marks_pop_discard(); -// return drive.success(); -// } - -// drive.state.marks_pop(); -// repeat_ctx.count -= 1; -// drive.sync_string_position(); - -// /* cannot match more repeated items here. make sure the -// tail matches */ -// let next_ctx = drive.next_ctx(1, tail_callback); -// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; -// }); -// return; -// } - -// /* cannot match more repeated items here. 
make sure the -// tail matches */ -// let next_ctx = drive.next_ctx(1, tail_callback); -// next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - -// fn tail_callback(drive: &mut StateContext, _stacks: &mut Stacks) { -// if drive.popped_ctx().has_matched == Some(true) { -// drive.success(); -// } else { -// drive.sync_string_position(); -// drive.failure(); -// } -// } -// } - -// #[derive(Debug, Default)] -// struct Stacks { -// } - -// impl Stacks { -// fn clear(&mut self) { -// self.branch.clear(); -// self.min_repeat_one.clear(); -// self.repeat_one.clear(); -// self.repeat.clear(); -// self.min_until.clear(); -// self.max_until.clear(); -// } - -// fn branch_last(&mut self) -> &mut BranchContext { -// self.branch.last_mut().unwrap() -// } -// fn min_repeat_one_last(&mut self) -> &mut MinRepeatOneContext { -// self.min_repeat_one.last_mut().unwrap() -// } -// fn repeat_one_last(&mut self) -> &mut RepeatOneContext { -// self.repeat_one.last_mut().unwrap() -// } -// } - -pub trait StrDrive { +pub trait StrDrive: Copy { fn offset(&self, offset: usize, skip: usize) -> usize; fn count(&self) -> usize; fn peek(&self, offset: usize) -> u32; @@ -855,7 +714,6 @@ pub trait StrDrive { fn back_offset(&self, offset: usize, skip: usize) -> usize; } - impl<'a> StrDrive for &'a str { fn offset(&self, offset: usize, skip: usize) -> usize { self.get(offset..) @@ -934,6 +792,7 @@ struct MatchContext<'a, S: StrDrive> { toplevel: bool, handler: Option, &mut Self)>, repeat_ctx_id: usize, + count: isize, } impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { @@ -945,182 +804,188 @@ impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { .field("has_matched", &self.has_matched) .field("toplevel", &self.toplevel) .field("handler", &self.handler.map(|x| x as usize)) + .field("count", &self.count) .finish() } } impl<'a, S: StrDrive> MatchContext<'a, S> { + fn pattern(&self, state: &State<'a, S>) -> &[u32] { + &state.pattern_codes[self.code_position..] 
+ } + fn remaining_codes(&self, state: &State<'a, S>) -> usize { state.pattern_codes.len() - self.code_position } - + + fn remaining_chars(&self, state: &State<'a, S>) -> usize { + state.end - self.string_position + } + + fn peek_char(&self, state: &State<'a, S>) -> u32 { + state.string.peek(self.string_offset) + } + + fn skip_char(&mut self, state: &State<'a, S>, skip: usize) { + self.string_position += skip; + self.string_offset = state.string.offset(self.string_offset, skip); + } + + fn back_peek_char(&self, state: &State<'a, S>) -> u32 { + state.string.back_peek(self.string_offset) + } + + fn back_skip_char(&mut self, string: &S, skip: usize) { + self.string_position -= skip; + self.string_offset = string.back_offset(self.string_offset, skip); + } + fn peek_code(&self, state: &State<'a, S>, peek: usize) -> u32 { state.pattern_codes[self.code_position + peek] } + fn skip_code(&mut self, skip: usize) { + self.code_position += skip; + } + + fn skip_code_from(&mut self, state: &State<'a, S>, peek: usize) { + self.skip_code(self.peek_code(state, peek) as usize + 1); + } + + fn at_beginning(&self) -> bool { + // self.ctx().string_position == self.state().start + self.string_position == 0 + } + + fn at_end(&self, state: &State<'a, S>) -> bool { + self.string_position == state.end + } + + fn at_linebreak(&self, state: &State<'a, S>) -> bool { + !self.at_end(state) && is_linebreak(self.peek_char(state)) + } + + fn at_boundary bool>( + &self, + state: &State<'a, S>, + mut word_checker: F, + ) -> bool { + if self.at_beginning() && self.at_end(state) { + return false; + } + let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); + let this = !self.at_end(state) && word_checker(self.peek_char(state)); + this != that + } + + fn at_non_boundary bool>( + &self, + state: &State<'a, S>, + mut word_checker: F, + ) -> bool { + if self.at_beginning() && self.at_end(state) { + return false; + } + let that = !self.at_beginning() && 
word_checker(self.back_peek_char(state)); + let this = !self.at_end(state) && word_checker(self.peek_char(state)); + this == that + } + + fn can_success(&self, state: &State<'a, S>) -> bool { + if !self.toplevel { + return true; + } + if state.match_all && !self.at_end(state) { + return false; + } + if state.must_advance && self.string_position == state.start { + return false; + } + true + } + + fn success(&mut self) { + self.has_matched = Some(true); + } + fn failure(&mut self) { self.has_matched = Some(false); } } -// trait ContextDrive<'a, T: StrDrive<'a>> { -// fn ctx(&self) -> &MatchContext; -// fn ctx_mut(&mut self) -> &mut MatchContext; -// fn state(&self) -> &State<'a, T>; - -// fn popped_ctx(&self) -> &MatchContext { -// self.state().popped_context.as_ref().unwrap() -// } - -// fn pattern(&self) -> &[u32] { -// &self.state().pattern_codes[self.ctx().code_position..] -// } - -// fn peek_char(&self) -> u32 { -// self.state().string.peek(self.ctx().string_offset) -// } -// fn peek_code(&self, peek: usize) -> u32 { -// self.state().pattern_codes[self.ctx().code_position + peek] -// } - -// fn back_peek_char(&self) -> u32 { -// self.state().string.back_peek(self.ctx().string_offset) -// } -// fn back_skip_char(&mut self, skip_count: usize) { -// self.ctx_mut().string_position -= skip_count; -// self.ctx_mut().string_offset = self -// .state() -// .string -// .back_offset(self.ctx().string_offset, skip_count); -// } - -// fn skip_char(&mut self, skip_count: usize) { -// self.ctx_mut().string_offset = self -// .state() -// .string -// .offset(self.ctx().string_offset, skip_count); -// self.ctx_mut().string_position += skip_count; -// } -// fn skip_code(&mut self, skip_count: usize) { -// self.ctx_mut().code_position += skip_count; -// } -// fn skip_code_from(&mut self, peek: usize) { -// self.skip_code(self.peek_code(peek) as usize + 1); -// } - -// fn remaining_chars(&self) -> usize { -// self.state().end - self.ctx().string_position -// } -// fn 
remaining_codes(&self) -> usize { -// self.state().pattern_codes.len() - self.ctx().code_position -// } - -// fn at_beginning(&self) -> bool { -// // self.ctx().string_position == self.state().start -// self.ctx().string_position == 0 -// } -// fn at_end(&self) -> bool { -// self.ctx().string_position == self.state().end -// } -// fn at_linebreak(&self) -> bool { -// !self.at_end() && is_linebreak(self.peek_char()) -// } -// fn at_boundary bool>(&self, mut word_checker: F) -> bool { -// if self.at_beginning() && self.at_end() { -// return false; -// } -// let that = !self.at_beginning() && word_checker(self.back_peek_char()); -// let this = !self.at_end() && word_checker(self.peek_char()); -// this != that -// } -// fn at_non_boundary bool>(&self, mut word_checker: F) -> bool { -// if self.at_beginning() && self.at_end() { -// return false; -// } -// let that = !self.at_beginning() && word_checker(self.back_peek_char()); -// let this = !self.at_end() && word_checker(self.peek_char()); -// this == that -// } - -// fn can_success(&self) -> bool { -// if !self.ctx().toplevel { -// return true; -// } -// if self.state().match_all && !self.at_end() { -// return false; -// } -// if self.state().must_advance && self.ctx().string_position == self.state().start { -// return false; -// } -// true -// } - -// fn success(&mut self) { -// self.ctx_mut().has_matched = Some(true); -// } - -// fn failure(&mut self) { -// self.ctx_mut().has_matched = Some(false); -// } -// } - -// struct StateContext<'a, S: StrDrive<'a>> { -// state: State<'a, S>, -// ctx: MatchContext, -// next_ctx: Option, -// } - -// impl<'a, S: StrDrive<'a>> ContextDrive<'a, S> for StateContext<'a, S> { -// fn ctx(&self) -> &MatchContext { -// &self.ctx -// } -// fn ctx_mut(&mut self) -> &mut MatchContext { -// &mut self.ctx -// } -// fn state(&self) -> &State<'a, S> { -// &self.state -// } -// } - -// impl StateContext<'_> { -// fn next_ctx_from(&mut self, peek: usize, handler: OpcodeHandler) -> &mut 
MatchContext { -// self.next_ctx(self.peek_code(peek) as usize + 1, handler) -// } -// fn next_ctx(&mut self, offset: usize, handler: OpcodeHandler) -> &mut MatchContext { -// self.next_ctx_at(self.ctx.code_position + offset, handler) -// } -// fn next_ctx_at(&mut self, code_position: usize, handler: OpcodeHandler) -> &mut MatchContext { -// self.next_ctx = Some(MatchContext { -// code_position, -// has_matched: None, -// handler: None, -// ..self.ctx -// }); -// self.ctx.handler = Some(handler); -// self.next_ctx.as_mut().unwrap() -// } - -// fn sync_string_position(&mut self) { -// self.state.string_position = self.ctx.string_position; -// } -// } - -// struct StateRefContext<'a> { -// entity: &'a StateContext<'a>, -// ctx: MatchContext, -// } - -// impl ContextDrive for StateRefContext<'_> { -// fn ctx(&self) -> &MatchContext { -// &self.ctx -// } -// fn ctx_mut(&mut self) -> &mut MatchContext { -// &mut self.ctx -// } -// fn state(&self) -> &State { -// &self.entity.state -// } -// } +fn at<'a, S: StrDrive>(state: &State<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { + match atcode { + SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), + SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(state)), + SreAtCode::BOUNDARY => ctx.at_boundary(state, is_word), + SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(state, is_word), + SreAtCode::END => { + (ctx.remaining_chars(state) == 1 && ctx.at_linebreak(state)) || ctx.at_end(state) + } + SreAtCode::END_LINE => ctx.at_linebreak(state) || ctx.at_end(state), + SreAtCode::END_STRING => ctx.at_end(state), + SreAtCode::LOC_BOUNDARY => ctx.at_boundary(state, is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(state, is_loc_word), + SreAtCode::UNI_BOUNDARY => ctx.at_boundary(state, is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(state, is_uni_word), + } +} + +fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> 
bool>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + f: F, +) { + if ctx.at_end(state) || !f(ctx.peek_code(state, 1), ctx.peek_char(state)) { + ctx.failure(); + } else { + ctx.skip_code(2); + ctx.skip_char(state, 1); + } +} + +fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + f: F, +) { + if ctx.at_end(state) || !f(&ctx.pattern(state)[2..], ctx.peek_char(state)) { + ctx.failure(); + } else { + ctx.skip_code_from(state, 1); + ctx.skip_char(state, 1); + } +} + +fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + mut f: F, +) { + let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + let (group_start, group_end) = match (group_start, group_end) { + (Some(start), Some(end)) if start <= end => (start, end), + _ => { + return ctx.failure(); + } + }; + + let mut gctx = MatchContext { + string_position: group_start, + string_offset: state.string.offset(0, group_start), + ..*ctx + }; + + for _ in group_start..group_end { + if ctx.at_end(state) || f(ctx.peek_char(state)) != f(gctx.peek_char(state)) { + return ctx.failure(); + } + ctx.skip_char(state, 1); + gctx.skip_char(state, 1); + } + + ctx.skip_code(2); +} fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) @@ -1135,78 +1000,6 @@ fn charset_loc_ignore(set: &[u32], c: u32) -> bool { up != lo && charset(set, up) } -// fn general_op_groupref u32>(drive: &mut StateContext, mut f: F) { -// let (group_start, group_end) = drive.state.get_marks(drive.peek_code(1) as usize); -// let (group_start, group_end) = match (group_start, group_end) { -// (Some(start), Some(end)) if start <= end => (start, end), -// _ => { -// return drive.failure(); -// } -// }; - -// let mut wdrive = StateRefContext { -// entity: drive, -// ctx: drive.ctx, -// }; -// let mut gdrive = StateRefContext { -// 
entity: drive, -// ctx: MatchContext { -// string_position: group_start, -// // TODO: cache the offset -// string_offset: drive.state.string.offset(0, group_start), -// ..drive.ctx -// }, -// }; - -// for _ in group_start..group_end { -// if wdrive.at_end() || f(wdrive.peek_char()) != f(gdrive.peek_char()) { -// return drive.failure(); -// } -// wdrive.skip_char(1); -// gdrive.skip_char(1); -// } - -// let position = wdrive.ctx.string_position; -// let offset = wdrive.ctx.string_offset; -// drive.skip_code(2); -// drive.ctx.string_position = position; -// drive.ctx.string_offset = offset; -// } - -// fn general_op_literal bool>(drive: &mut StateContext, f: F) { -// if drive.at_end() || !f(drive.peek_code(1), drive.peek_char()) { -// drive.failure(); -// } else { -// drive.skip_code(2); -// drive.skip_char(1); -// } -// } - -// fn general_op_in bool>(drive: &mut StateContext, f: F) { -// if drive.at_end() || !f(&drive.pattern()[2..], drive.peek_char()) { -// drive.failure(); -// } else { -// drive.skip_code_from(1); -// drive.skip_char(1); -// } -// } - -// fn at(drive: &StateContext, atcode: SreAtCode) -> bool { -// match atcode { -// SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => drive.at_beginning(), -// SreAtCode::BEGINNING_LINE => drive.at_beginning() || is_linebreak(drive.back_peek_char()), -// SreAtCode::BOUNDARY => drive.at_boundary(is_word), -// SreAtCode::NON_BOUNDARY => drive.at_non_boundary(is_word), -// SreAtCode::END => (drive.remaining_chars() == 1 && drive.at_linebreak()) || drive.at_end(), -// SreAtCode::END_LINE => drive.at_linebreak() || drive.at_end(), -// SreAtCode::END_STRING => drive.at_end(), -// SreAtCode::LOC_BOUNDARY => drive.at_boundary(is_loc_word), -// SreAtCode::LOC_NON_BOUNDARY => drive.at_non_boundary(is_loc_word), -// SreAtCode::UNI_BOUNDARY => drive.at_boundary(is_uni_word), -// SreAtCode::UNI_NON_BOUNDARY => drive.at_non_boundary(is_uni_word), -// } -// } - fn category(catcode: SreCatCode, c: u32) -> bool { match catcode { 
SreCatCode::DIGIT => is_digit(c), @@ -1323,95 +1116,100 @@ fn charset(set: &[u32], ch: u32) -> bool { false } -// /* General case */ -// fn general_count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { -// let mut count = 0; -// let max_count = std::cmp::min(max_count, drive.remaining_chars()); - -// let save_ctx = drive.ctx; -// drive.skip_code(4); -// let reset_position = drive.ctx.code_position; - -// while count < max_count { -// drive.ctx.code_position = reset_position; -// let code = drive.peek_code(0); -// let code = SreOpcode::try_from(code).unwrap(); -// dispatch(code, drive, stacks); -// if drive.ctx.has_matched == Some(false) { -// break; -// } -// count += 1; -// } -// drive.ctx = save_ctx; -// count -// } - -// fn _count(drive: &mut StateContext, stacks: &mut Stacks, max_count: usize) -> usize { -// let save_ctx = drive.ctx; -// let max_count = std::cmp::min(max_count, drive.remaining_chars()); -// let end = drive.ctx.string_position + max_count; -// let opcode = SreOpcode::try_from(drive.peek_code(0)).unwrap(); - -// match opcode { -// SreOpcode::ANY => { -// while !drive.ctx.string_position < end && !drive.at_linebreak() { -// drive.skip_char(1); -// } -// } -// SreOpcode::ANY_ALL => { -// drive.skip_char(max_count); -// } -// SreOpcode::IN => { -// while !drive.ctx.string_position < end -// && charset(&drive.pattern()[2..], drive.peek_char()) -// { -// drive.skip_char(1); -// } -// } -// SreOpcode::LITERAL => { -// general_count_literal(drive, end, |code, c| code == c as u32); -// } -// SreOpcode::NOT_LITERAL => { -// general_count_literal(drive, end, |code, c| code != c as u32); -// } -// SreOpcode::LITERAL_IGNORE => { -// general_count_literal(drive, end, |code, c| code == lower_ascii(c) as u32); -// } -// SreOpcode::NOT_LITERAL_IGNORE => { -// general_count_literal(drive, end, |code, c| code != lower_ascii(c) as u32); -// } -// SreOpcode::LITERAL_LOC_IGNORE => { -// general_count_literal(drive, end, char_loc_ignore); 
-// } -// SreOpcode::NOT_LITERAL_LOC_IGNORE => { -// general_count_literal(drive, end, |code, c| !char_loc_ignore(code, c)); -// } -// SreOpcode::LITERAL_UNI_IGNORE => { -// general_count_literal(drive, end, |code, c| code == lower_unicode(c) as u32); -// } -// SreOpcode::NOT_LITERAL_UNI_IGNORE => { -// general_count_literal(drive, end, |code, c| code != lower_unicode(c) as u32); -// } -// _ => { -// return general_count(drive, stacks, max_count); -// } -// } - -// let count = drive.ctx.string_position - drive.state.string_position; -// drive.ctx = save_ctx; -// count -// } - -// fn general_count_literal bool>( -// drive: &mut StateContext, -// end: usize, -// mut f: F, -// ) { -// let ch = drive.peek_code(1); -// while !drive.ctx.string_position < end && f(ch, drive.peek_char()) { -// drive.skip_char(1); -// } -// } +fn _count<'a, S: StrDrive>( + state: &mut State<'a, S>, + ctx: &MatchContext<'a, S>, + max_count: usize, +) -> usize { + let mut ctx = *ctx; + let max_count = std::cmp::min(max_count, ctx.remaining_chars(state)); + let end = ctx.string_position + max_count; + let opcode = SreOpcode::try_from(ctx.peek_code(state, 0)).unwrap(); + + match opcode { + SreOpcode::ANY => { + while !ctx.string_position < end && !ctx.at_linebreak(state) { + ctx.skip_char(state, 1); + } + } + SreOpcode::ANY_ALL => { + ctx.skip_char(state, max_count); + } + SreOpcode::IN => { + while !ctx.string_position < end + && charset(&ctx.pattern(state)[2..], ctx.peek_char(state)) + { + ctx.skip_char(state, 1); + } + } + SreOpcode::LITERAL => { + general_count_literal(state, &mut ctx, end, |code, c| code == c as u32); + } + SreOpcode::NOT_LITERAL => { + general_count_literal(state, &mut ctx, end, |code, c| code != c as u32); + } + SreOpcode::LITERAL_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code == lower_ascii(c) as u32 + }); + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code != lower_ascii(c) as u32 + }); 
+ } + SreOpcode::LITERAL_LOC_IGNORE => { + general_count_literal(state, &mut ctx, end, char_loc_ignore); + } + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code == lower_unicode(c) as u32 + }); + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_count_literal(state, &mut ctx, end, |code, c| { + code != lower_unicode(c) as u32 + }); + } + _ => { + /* General case */ + let mut count = 0; + + ctx.skip_code(4); + let reset_position = ctx.code_position; + + while count < max_count { + ctx.code_position = reset_position; + let code = ctx.peek_code(state, 0); + let code = SreOpcode::try_from(code).unwrap(); + dispatch(state, &mut ctx, code); + if ctx.has_matched == Some(false) { + break; + } + count += 1; + } + return count; + } + } + + // TODO: return offset + ctx.string_position - state.string_position +} + +fn general_count_literal<'a, S: StrDrive, F: FnMut(u32, u32) -> bool>( + state: &State<'a, S>, + ctx: &mut MatchContext<'a, S>, + end: usize, + mut f: F, +) { + let ch = ctx.peek_code(state, 1); + while !ctx.string_position < end && f(ch, ctx.peek_char(state)) { + ctx.skip_char(state, 1); + } +} fn is_word(ch: u32) -> bool { ch == '_' as u32 diff --git a/tests/tests.rs b/tests/tests.rs index e8ae487029..cc5c4d1f38 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -7,12 +7,12 @@ struct Pattern { } impl Pattern { - fn state<'a>( + fn state<'a, S: engine::StrDrive>( &self, - string: impl Into>, + string: S, range: std::ops::Range, - ) -> engine::State<'a> { - engine::State::new(string.into(), range.start, range.end, self.flags, self.code) + ) -> engine::State<'a, S> { + engine::State::new(string, range.start, range.end, self.flags, self.code) } } @@ -23,7 +23,7 @@ fn test_2427() { #[rustfmt::skip] let lookbehind = Pattern { code: &[15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1], 
flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = lookbehind.state("x", 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.has_matched); } @@ -34,7 +34,7 @@ fn test_assert() { #[rustfmt::skip] let positive_lookbehind = Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = positive_lookbehind.state("abcdef", 0..usize::MAX); - state = state.search(); + state.search(); assert!(state.has_matched); } @@ -45,7 +45,7 @@ fn test_string_boundaries() { #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = big_b.state("", 0..usize::MAX); - state = state.search(); + state.search(); assert!(!state.has_matched); } @@ -57,7 +57,7 @@ fn test_zerowidth() { // END GENERATED let mut state = p.state("a:", 0..usize::MAX); state.must_advance = true; - state = state.search(); + state.search(); assert!(state.string_position == 1); } @@ -68,7 +68,7 @@ fn test_repeat_context_panic() { #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = p.state("axxzaz", 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.marks == vec![Some(1), Some(3)]); } @@ -79,6 +79,6 @@ fn test_double_max_until() { #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; // END GENERATED let mut state = p.state("1111", 0..usize::MAX); - state = state.pymatch(); + state.pymatch(); assert!(state.string_position == 4); } From 982d8f53f2eea5eb80d37a0d22c807aebd694445 Mon Sep 17 00:00:00 2001 
From: Kangzhi Shi Date: Tue, 2 Aug 2022 21:10:31 +0200 Subject: [PATCH 056/705] fix next_ctx --- src/engine.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index b0717e1671..64043cee53 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -35,14 +35,16 @@ macro_rules! next_ctx { (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { next_ctx!(position $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) }; - (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => { - {$state.next_context.insert(MatchContext { + (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => {{ + $ctx.handler = Some($handler); + $state.next_context.insert(MatchContext { code_position: $position, has_matched: None, - handler: Some($handler), + handler: None, + count: -1, ..*$ctx - })} - }; + }) + }}; } macro_rules! mark { @@ -678,7 +680,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex return ctx.success(); } - mark!(pop, state); + mark!(pop, state); repeat_ctx.count -= 1; state.string_position = ctx.string_position; From ccae898885496a1390a10fa6fca4cf394445dd35 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 2 Aug 2022 21:24:11 +0200 Subject: [PATCH 057/705] fix next_ctx bug --- src/engine.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 64043cee53..4acc172bc6 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -33,7 +33,7 @@ macro_rules! 
next_ctx { next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) }; (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { - next_ctx!(position $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) + next_ctx!(offset $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) }; (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => {{ $ctx.handler = Some($handler); From f3b30443aab82095ed2ce786482309e659f9c107 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Aug 2022 21:05:10 +0200 Subject: [PATCH 058/705] fix lifetime --- src/engine.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 4acc172bc6..810e011c95 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -716,7 +716,7 @@ pub trait StrDrive: Copy { fn back_offset(&self, offset: usize, skip: usize) -> usize; } -impl<'a> StrDrive for &'a str { +impl StrDrive for &str { fn offset(&self, offset: usize, skip: usize) -> usize { self.get(offset..) 
.and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) From ca20b5951d7092cf0ec25198f01d3620a69e61e2 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Aug 2022 21:08:14 +0200 Subject: [PATCH 059/705] update version to 0.3.0 --- Cargo.toml | 2 +- src/engine.rs | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 00123d92c5..98b632a4dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.2.1" +version = "0.3.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs index 810e011c95..1302667d1d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -253,8 +253,6 @@ fn dispatch<'a, S: StrDrive>( ctx.skip_char(state, 1); } } - /* assert subpattern */ - /* */ SreOpcode::ASSERT => op_assert(state, ctx), SreOpcode::ASSERT_NOT => op_assert_not(state, ctx), SreOpcode::AT => { @@ -334,7 +332,6 @@ fn op_assert<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' return ctx.failure(); } - // let next_ctx = state.next_ctx(ctx, 3, |state, ctx| { let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { if state.popped_has_matched { ctx.skip_code_from(state, 1); @@ -371,7 +368,6 @@ fn op_assert_not<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchConte // alternation // <0=skip> code ... 
fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - // state.marks_push(); mark!(push, state); ctx.count = 1; @@ -403,7 +399,6 @@ fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' /* <1=min> <2=max> item tail */ fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { let min_count = ctx.peek_code(state, 2) as usize; - // let max_count = ctx.peek_code(state, 3) as usize; if ctx.remaining_chars(state) < min_count { return ctx.failure(); From a48f5b07c5671b690e262b935b81f0a7a832b566 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Aug 2022 21:49:46 +0200 Subject: [PATCH 060/705] impl op_info --- src/engine.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 1302667d1d..fcade829f2 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -279,7 +279,15 @@ fn dispatch<'a, S: StrDrive>( general_op_in(state, ctx, |set, c| charset(set, lower_unicode(c))) } SreOpcode::IN_LOC_IGNORE => general_op_in(state, ctx, charset_loc_ignore), - SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(state, 1), + SreOpcode::INFO => { + let min = ctx.peek_code(state, 3) as usize; + if ctx.remaining_chars(state) < min { + ctx.failure(); + } else { + ctx.skip_code_from(state, 1); + } + } + SreOpcode::JUMP => ctx.skip_code_from(state, 1), SreOpcode::LITERAL => general_op_literal(state, ctx, |code, c| code == c), SreOpcode::NOT_LITERAL => general_op_literal(state, ctx, |code, c| code != c), SreOpcode::LITERAL_IGNORE => { From 8b1fcea7ec27aa22f698d485ee203dbe2a552334 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 7 Aug 2022 08:10:52 +0200 Subject: [PATCH 061/705] update version to 0.3.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 98b632a4dc..4b403f2861 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = 
"0.3.0" +version = "0.3.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From c31462d51b7d3adbf8f121403c4e8b305a9dab6f Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 16:29:51 +0200 Subject: [PATCH 062/705] refactor split State with Request --- src/engine.rs | 669 +++++++++++++++++++++++++++++++------------------- 1 file changed, 420 insertions(+), 249 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index fcade829f2..5e1f1457ec 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,6 +1,6 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreFlag, SreOpcode}; +use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; use super::MAXREPEAT; use std::convert::TryFrom; @@ -8,26 +8,37 @@ const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } -#[derive(Debug)] -pub struct State<'a, S: StrDrive> { +pub struct Request<'a, S: StrDrive> { pub string: S, pub start: usize, pub end: usize, - _flags: SreFlag, - pattern_codes: &'a [u32], - pub marks: Vec>, - pub lastindex: isize, - marks_stack: Vec<(Vec>, isize)>, - context_stack: Vec>, - repeat_stack: Vec, - pub string_position: usize, - next_context: Option>, - popped_has_matched: bool, - pub has_matched: bool, + pub pattern_codes: &'a [u32], pub match_all: bool, pub must_advance: bool, } +impl<'a, S: StrDrive> Request<'a, S> { + pub fn new( + string: S, + start: usize, + end: usize, + pattern_codes: &'a [u32], + match_all: bool, + ) -> Self { + let end = std::cmp::min(end, string.count()); + let start = std::cmp::min(start, end); + + Self { + string, + start, + end, + pattern_codes, + match_all, + must_advance: false, + } + } +} + macro_rules! 
next_ctx { (offset $offset:expr, $state:expr, $ctx:expr, $handler:expr) => { next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) @@ -60,43 +71,41 @@ macro_rules! mark { }; } +#[derive(Debug)] +pub struct State<'a, S: StrDrive> { + pub marks: Vec>, + pub lastindex: isize, + marks_stack: Vec<(Vec>, isize)>, + context_stack: Vec>, + repeat_stack: Vec, + pub string_position: usize, + next_context: Option>, + popped_has_matched: bool, + has_matched: bool, +} + impl<'a, S: StrDrive> State<'a, S> { - pub fn new( - string: S, - start: usize, - end: usize, - flags: SreFlag, - pattern_codes: &'a [u32], - ) -> Self { - let end = std::cmp::min(end, string.count()); - let start = std::cmp::min(start, end); + pub fn new(string_position: usize) -> Self { Self { - string, - start, - end, - _flags: flags, - pattern_codes, marks: Vec::new(), lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat_stack: Vec::new(), - string_position: start, + string_position, next_context: None, popped_has_matched: false, has_matched: false, - match_all: false, - must_advance: false, } } - pub fn reset(&mut self) { + pub fn reset(&mut self, string_position: usize) { self.lastindex = -1; self.marks.clear(); self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); - self.string_position = self.start; + self.string_position = string_position; self.next_context = None; self.popped_has_matched = false; self.has_matched = false; @@ -136,14 +145,14 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks_stack.pop(); } - fn _match(&mut self) { + fn _match(&mut self, req: &mut Request<'a, S>) { while let Some(mut ctx) = self.context_stack.pop() { if let Some(handler) = ctx.handler.take() { - handler(self, &mut ctx); - } else if ctx.remaining_codes(self) > 0 { - let code = ctx.peek_code(self, 0); + handler(req, self, &mut ctx); + } else if ctx.remaining_codes(req) > 0 { + let code = ctx.peek_code(req, 0); let code = SreOpcode::try_from(code).unwrap(); 
- dispatch(self, &mut ctx, code); + dispatch(req, self, &mut ctx, code); } else { ctx.failure(); } @@ -160,10 +169,10 @@ impl<'a, S: StrDrive> State<'a, S> { self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self) { + pub fn pymatch(&mut self, req: &mut Request<'a, S>) { let ctx = MatchContext { - string_position: self.start, - string_offset: self.string.offset(0, self.start), + string_position: req.start, + string_offset: req.string.offset(0, req.start), code_position: 0, has_matched: None, toplevel: true, @@ -173,20 +182,22 @@ impl<'a, S: StrDrive> State<'a, S> { }; self.context_stack.push(ctx); - self._match(); + self._match(req); } - pub fn search(&mut self) { + pub fn search(&mut self, req: &mut Request<'a, S>) { // TODO: optimize by op info and skip prefix - - if self.start > self.end { + if req.start > req.end { return; } - let mut start_offset = self.string.offset(0, self.start); + // let start = self.start; + // let end = self.end; - let ctx = MatchContext { - string_position: self.start, + let mut start_offset = req.string.offset(0, req.start); + + let mut ctx = MatchContext { + string_position: req.start, string_offset: start_offset, code_position: 0, has_matched: None, @@ -195,17 +206,26 @@ impl<'a, S: StrDrive> State<'a, S> { repeat_ctx_id: usize::MAX, count: -1, }; + + // if ctx.peek_code(self, 0) == SreOpcode::INFO as u32 { + // search_op_info(self, &mut ctx); + // if let Some(has_matched) = ctx.has_matched { + // self.has_matched = has_matched; + // return; + // } + // } + self.context_stack.push(ctx); - self._match(); + self._match(req); - self.must_advance = false; - while !self.has_matched && self.start < self.end { - self.start += 1; - start_offset = self.string.offset(start_offset, 1); - self.reset(); + req.must_advance = false; + while !self.has_matched && req.start < req.end { + req.start += 1; + start_offset = req.string.offset(start_offset, 1); + self.reset(req.start); let ctx = MatchContext { - string_position: 
self.start, + string_position: req.start, string_offset: start_offset, code_position: 0, has_matched: None, @@ -215,12 +235,13 @@ impl<'a, S: StrDrive> State<'a, S> { count: -1, }; self.context_stack.push(ctx); - self._match(); + self._match(req); } } } fn dispatch<'a, S: StrDrive>( + req: &Request<'a, S>, state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>, opcode: SreOpcode, @@ -230,7 +251,7 @@ fn dispatch<'a, S: StrDrive>( ctx.failure(); } SreOpcode::SUCCESS => { - if ctx.can_success(state) { + if ctx.can_success(req) { state.string_position = ctx.string_position; ctx.success(); } else { @@ -238,152 +259,224 @@ fn dispatch<'a, S: StrDrive>( } } SreOpcode::ANY => { - if ctx.at_end(state) || ctx.at_linebreak(state) { + if ctx.at_end(req) || ctx.at_linebreak(req) { ctx.failure(); } else { ctx.skip_code(1); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } SreOpcode::ANY_ALL => { - if ctx.at_end(state) { + if ctx.at_end(req) { ctx.failure(); } else { ctx.skip_code(1); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } - SreOpcode::ASSERT => op_assert(state, ctx), - SreOpcode::ASSERT_NOT => op_assert_not(state, ctx), + SreOpcode::ASSERT => op_assert(req, state, ctx), + SreOpcode::ASSERT_NOT => op_assert_not(req, state, ctx), SreOpcode::AT => { - let atcode = SreAtCode::try_from(ctx.peek_code(state, 1)).unwrap(); - if at(state, ctx, atcode) { + let atcode = SreAtCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if at(req, ctx, atcode) { ctx.skip_code(2); } else { ctx.failure(); } } - SreOpcode::BRANCH => op_branch(state, ctx), + SreOpcode::BRANCH => op_branch(req, state, ctx), SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(ctx.peek_code(state, 1)).unwrap(); - if ctx.at_end(state) || !category(catcode, ctx.peek_char(state)) { + let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) { ctx.failure(); } else { ctx.skip_code(2); - ctx.skip_char(state, 1); + 
ctx.skip_char(req, 1); } } - SreOpcode::IN => general_op_in(state, ctx, charset), - SreOpcode::IN_IGNORE => general_op_in(state, ctx, |set, c| charset(set, lower_ascii(c))), + SreOpcode::IN => general_op_in(req, ctx, charset), + SreOpcode::IN_IGNORE => general_op_in(req, ctx, |set, c| charset(set, lower_ascii(c))), SreOpcode::IN_UNI_IGNORE => { - general_op_in(state, ctx, |set, c| charset(set, lower_unicode(c))) + general_op_in(req, ctx, |set, c| charset(set, lower_unicode(c))) } - SreOpcode::IN_LOC_IGNORE => general_op_in(state, ctx, charset_loc_ignore), + SreOpcode::IN_LOC_IGNORE => general_op_in(req, ctx, charset_loc_ignore), SreOpcode::INFO => { - let min = ctx.peek_code(state, 3) as usize; - if ctx.remaining_chars(state) < min { + let min = ctx.peek_code(req, 3) as usize; + if ctx.remaining_chars(req) < min { ctx.failure(); } else { - ctx.skip_code_from(state, 1); + ctx.skip_code_from(req, 1); } } - SreOpcode::JUMP => ctx.skip_code_from(state, 1), - SreOpcode::LITERAL => general_op_literal(state, ctx, |code, c| code == c), - SreOpcode::NOT_LITERAL => general_op_literal(state, ctx, |code, c| code != c), - SreOpcode::LITERAL_IGNORE => { - general_op_literal(state, ctx, |code, c| code == lower_ascii(c)) - } + SreOpcode::JUMP => ctx.skip_code_from(req, 1), + SreOpcode::LITERAL => general_op_literal(req, ctx, |code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal(req, ctx, |code, c| code != c), + SreOpcode::LITERAL_IGNORE => general_op_literal(req, ctx, |code, c| code == lower_ascii(c)), SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(state, ctx, |code, c| code != lower_ascii(c)) + general_op_literal(req, ctx, |code, c| code != lower_ascii(c)) } SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(state, ctx, |code, c| code == lower_unicode(c)) + general_op_literal(req, ctx, |code, c| code == lower_unicode(c)) } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(state, ctx, |code, c| code != lower_unicode(c)) + 
general_op_literal(req, ctx, |code, c| code != lower_unicode(c)) } - SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(state, ctx, char_loc_ignore), + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(req, ctx, char_loc_ignore), SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(state, ctx, |code, c| !char_loc_ignore(code, c)) + general_op_literal(req, ctx, |code, c| !char_loc_ignore(code, c)) } SreOpcode::MARK => { - state.set_mark(ctx.peek_code(state, 1) as usize, ctx.string_position); + state.set_mark(ctx.peek_code(req, 1) as usize, ctx.string_position); ctx.skip_code(2); } SreOpcode::MAX_UNTIL => op_max_until(state, ctx), SreOpcode::MIN_UNTIL => op_min_until(state, ctx), - SreOpcode::REPEAT => op_repeat(state, ctx), - SreOpcode::REPEAT_ONE => op_repeat_one(state, ctx), - SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(state, ctx), - SreOpcode::GROUPREF => general_op_groupref(state, ctx, |x| x), - SreOpcode::GROUPREF_IGNORE => general_op_groupref(state, ctx, lower_ascii), - SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(state, ctx, lower_locate), - SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(state, ctx, lower_unicode), + SreOpcode::REPEAT => op_repeat(req, state, ctx), + SreOpcode::REPEAT_ONE => op_repeat_one(req, state, ctx), + SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(req, state, ctx), + SreOpcode::GROUPREF => general_op_groupref(req, state, ctx, |x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref(req, state, ctx, lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(req, state, ctx, lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(req, state, ctx, lower_unicode), SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); match (group_start, group_end) { (Some(start), Some(end)) if start <= end => { ctx.skip_code(3); } - _ => 
ctx.skip_code_from(state, 2), + _ => ctx.skip_code_from(req, 2), } } _ => unreachable!("unexpected opcode"), } } +/* optimization info block */ +/* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ +// fn search_op_info<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +// let min = ctx.peek_code(state, 3) as usize; + +// if ctx.remaining_chars(state) < min { +// return ctx.failure(); +// } + +// if min > 1 { +// /* adjust end point (but make sure we leave at least one +// character in there, so literal search will work) */ +// // no overflow can happen as remaining chars >= min +// state.end -= min - 1; + +// // adjust ctx position +// if state.end < ctx.string_position { +// ctx.string_position = state.end; +// ctx.string_offset = state.string.offset(0, ctx.string_position); +// } +// } + +// let flags = SreInfo::from_bits_truncate(ctx.peek_code(state, 2)); + +// if flags.contains(SreInfo::PREFIX) { +// /* pattern starts with a known prefix */ +// /* */ +// let len = ctx.peek_code(state, 5) as usize; +// let skip = ctx.peek_code(state, 6) as usize; +// let prefix = &ctx.pattern(state)[7..]; +// let overlap = &prefix[len - 1..]; + +// ctx.skip_code_from(state, 1); + +// if len == 1 { +// // pattern starts with a literal character +// let c = prefix[0]; +// let end = state.end; + +// while (!ctx.at_end(state)) { +// // find the next matched literal +// while (ctx.peek_char(state) != c) { +// ctx.skip_char(state, 1); +// if (ctx.at_end(state)) { +// return ctx.failure(); +// } +// } + +// // literal only +// if flags.contains(SreInfo::LITERAL) { +// return ctx.success(); +// } +// } +// } +// } +// } + /* assert subpattern */ /* */ -fn op_assert<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let back = ctx.peek_code(state, 2) as usize; +fn op_assert<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let back = ctx.peek_code(req, 2) as usize; if 
ctx.string_position < back { return ctx.failure(); } - let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + // let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { + let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { if state.popped_has_matched { - ctx.skip_code_from(state, 1); + ctx.skip_code_from(req, 1); } else { ctx.failure(); } }); - next_ctx.back_skip_char(&state.string, back); - state.string_position = next_ctx.string_position; next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = next_ctx.string_position; } /* assert not subpattern */ /* */ -fn op_assert_not<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let back = ctx.peek_code(state, 2) as usize; +fn op_assert_not<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { - return ctx.skip_code_from(state, 1); + return ctx.skip_code_from(req, 1); } - let next_ctx = next_ctx!(offset 3, state, ctx, |state, ctx| { + let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { if state.popped_has_matched { ctx.failure(); } else { - ctx.skip_code_from(state, 1); + ctx.skip_code_from(req, 1); } }); - next_ctx.back_skip_char(&state.string, back); - state.string_position = next_ctx.string_position; next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = next_ctx.string_position; } // alternation // <0=skip> code ... 
-fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_branch<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { mark!(push, state); ctx.count = 1; - create_context(state, ctx); + create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn create_context<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { let branch_offset = ctx.count as usize; - let next_length = ctx.peek_code(state, branch_offset) as isize; + let next_length = ctx.peek_code(req, branch_offset) as isize; if next_length == 0 { state.marks_pop_discard(); return ctx.failure(); @@ -395,20 +488,28 @@ fn op_branch<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' next_ctx!(offset branch_offset + 1, state, ctx, callback); } - fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { return ctx.success(); } state.marks_pop_keep(); - create_context(state, ctx); + create_context(req, state, ctx); } } /* <1=min> <2=max> item tail */ -fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let min_count = ctx.peek_code(state, 2) as usize; +fn op_min_repeat_one<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let min_count = ctx.peek_code(req, 2) as usize; - if ctx.remaining_chars(state) < min_count { + if ctx.remaining_chars(req) < min_count { return ctx.failure(); } @@ -417,52 +518,61 @@ fn op_min_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchC ctx.count = if min_count == 0 { 0 } else { - let count = _count(state, ctx, min_count); + let count = _count(req, 
state, ctx, min_count); if count < min_count { return ctx.failure(); } - ctx.skip_char(state, count); + ctx.skip_char(req, count); count as isize }; - let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. we're finished state.string_position = ctx.string_position; return ctx.success(); } mark!(push, state); - create_context(state, ctx); + create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let max_count = ctx.peek_code(state, 3) as usize; + fn create_context<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { + let max_count = ctx.peek_code(req, 3) as usize; if max_count == MAXREPEAT || ctx.count as usize <= max_count { state.string_position = ctx.string_position; - next_ctx!(from 1, state, ctx, callback); + // next_ctx!(from 1, state, ctx, callback); + ctx.next_from(1, req, state, callback); } else { state.marks_pop_discard(); ctx.failure(); } } - fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { return ctx.success(); } state.string_position = ctx.string_position; - if _count(state, ctx, 1) == 0 { + if _count(req, state, ctx, 1) == 0 { state.marks_pop_discard(); return ctx.failure(); } - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); ctx.count += 1; state.marks_pop_keep(); - create_context(state, ctx); + create_context(req, state, ctx); } } @@ -472,24 +582,28 @@ exactly one character wide, and we're not already collecting backtracking points. 
for other cases, use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ -fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let min_count = ctx.peek_code(state, 2) as usize; - let max_count = ctx.peek_code(state, 3) as usize; +fn op_repeat_one<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { + let min_count = ctx.peek_code(req, 2) as usize; + let max_count = ctx.peek_code(req, 3) as usize; - if ctx.remaining_chars(state) < min_count { + if ctx.remaining_chars(req) < min_count { return ctx.failure(); } state.string_position = ctx.string_position; - let count = _count(state, ctx, max_count); - ctx.skip_char(state, count); + let count = _count(req, state, ctx, max_count); + ctx.skip_char(req, count); if count < min_count { return ctx.failure(); } - let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(state) { + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. 
we're finished state.string_position = ctx.string_position; return ctx.success(); @@ -497,21 +611,25 @@ fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchConte mark!(push, state); ctx.count = count as isize; - create_context(state, ctx); - - fn create_context<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { - let min_count = ctx.peek_code(state, 2) as isize; - let next_code = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 1); + create_context(req, state, ctx); + + fn create_context<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { + let min_count = ctx.peek_code(req, 2) as isize; + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::LITERAL as u32 { // Special case: Tail starts with a literal. Skip positions where // the rest of the pattern cannot possibly match. - let c = ctx.peek_code(state, ctx.peek_code(state, 1) as usize + 2); - while ctx.at_end(state) || ctx.peek_char(state) != c { + let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); + while ctx.at_end(req) || ctx.peek_char(req) != c { if ctx.count <= min_count { state.marks_pop_discard(); return ctx.failure(); } - ctx.back_skip_char(&state.string, 1); + ctx.back_skip_char(req, 1); ctx.count -= 1; } } @@ -519,26 +637,31 @@ fn op_repeat_one<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchConte state.string_position = ctx.string_position; // General case: backtracking - next_ctx!(from 1, state, ctx, callback); + // next_ctx!(from 1, state, ctx, callback); + ctx.next_from(1, req, state, callback); } - fn callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { return ctx.success(); } - let min_count = ctx.peek_code(state, 2) as isize; + let 
min_count = ctx.peek_code(req, 2) as isize; if ctx.count <= min_count { state.marks_pop_discard(); return ctx.failure(); } - ctx.back_skip_char(&state.string, 1); + ctx.back_skip_char(req, 1); ctx.count -= 1; state.marks_pop_keep(); - create_context(state, ctx); + create_context(req, state, ctx); } } @@ -555,11 +678,15 @@ struct RepeatContext { /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ -fn op_repeat<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_repeat<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, +) { let repeat_ctx = RepeatContext { count: -1, - min_count: ctx.peek_code(state, 2) as usize, - max_count: ctx.peek_code(state, 3) as usize, + min_count: ctx.peek_code(req, 2) as usize, + max_count: ctx.peek_code(req, 3) as usize, code_position: ctx.code_position, last_position: std::usize::MAX, prev_id: ctx.repeat_ctx_id, @@ -569,11 +696,14 @@ fn op_repeat<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<' state.string_position = ctx.string_position; - let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { + let repeat_ctx_id = state.repeat_stack.len(); + + // let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { + let next_ctx = ctx.next_from(1, req, state, |req, state, ctx| { ctx.has_matched = Some(state.popped_has_matched); state.repeat_stack.pop(); }); - next_ctx.repeat_ctx_id = state.repeat_stack.len() - 1; + next_ctx.repeat_ctx_id = repeat_ctx_id; } /* minimizing repeat */ @@ -586,7 +716,8 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, 
|req, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -602,8 +733,11 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex ctx.count = ctx.repeat_ctx_id as isize; + let repeat_ctx_prev_id = repeat_ctx.prev_id; + // see if the tail matches - let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { + // let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { + let next_ctx = ctx.next_offset(1, state, |req, state, ctx| { if state.popped_has_matched { return ctx.success(); } @@ -628,7 +762,8 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex /* zero-width match protection */ repeat_ctx.last_position = state.string_position; - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -638,7 +773,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex } }); }); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + next_ctx.repeat_ctx_id = repeat_ctx_prev_id; } /* maximizing repeat */ @@ -651,7 +786,8 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -673,7 +809,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex ctx.count = repeat_ctx.last_position as isize; repeat_ctx.last_position = state.string_position; - next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { + 
ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { let save_last_position = ctx.count as usize; let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; repeat_ctx.last_position = save_last_position; @@ -701,7 +837,11 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx.prev_id; - fn tail_callback<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { + fn tail_callback<'a, S: StrDrive>( + req: &Request<'a, S>, + state: &mut State<'a, S>, + ctx: &mut MatchContext<'a, S>, + ) { if state.popped_has_matched { ctx.success(); } else { @@ -786,8 +926,6 @@ impl<'a> StrDrive for &'a [u8] { } } -// type OpcodeHandler = for<'a>fn(&mut StateContext<'a, S>, &mut Stacks); - #[derive(Clone, Copy)] struct MatchContext<'a, S: StrDrive> { string_position: usize, @@ -795,7 +933,7 @@ struct MatchContext<'a, S: StrDrive> { code_position: usize, has_matched: Option, toplevel: bool, - handler: Option, &mut Self)>, + handler: Option, &mut State<'a, S>, &mut Self)>, repeat_ctx_id: usize, count: isize, } @@ -809,52 +947,53 @@ impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { .field("has_matched", &self.has_matched) .field("toplevel", &self.toplevel) .field("handler", &self.handler.map(|x| x as usize)) + .field("repeat_ctx_id", &self.repeat_ctx_id) .field("count", &self.count) .finish() } } impl<'a, S: StrDrive> MatchContext<'a, S> { - fn pattern(&self, state: &State<'a, S>) -> &[u32] { - &state.pattern_codes[self.code_position..] + fn pattern(&self, req: &Request<'a, S>) -> &'a [u32] { + &req.pattern_codes[self.code_position..] 
} - fn remaining_codes(&self, state: &State<'a, S>) -> usize { - state.pattern_codes.len() - self.code_position + fn remaining_codes(&self, req: &Request<'a, S>) -> usize { + req.pattern_codes.len() - self.code_position } - fn remaining_chars(&self, state: &State<'a, S>) -> usize { - state.end - self.string_position + fn remaining_chars(&self, req: &Request<'a, S>) -> usize { + req.end - self.string_position } - fn peek_char(&self, state: &State<'a, S>) -> u32 { - state.string.peek(self.string_offset) + fn peek_char(&self, req: &Request<'a, S>) -> u32 { + req.string.peek(self.string_offset) } - fn skip_char(&mut self, state: &State<'a, S>, skip: usize) { + fn skip_char(&mut self, req: &Request<'a, S>, skip: usize) { self.string_position += skip; - self.string_offset = state.string.offset(self.string_offset, skip); + self.string_offset = req.string.offset(self.string_offset, skip); } - fn back_peek_char(&self, state: &State<'a, S>) -> u32 { - state.string.back_peek(self.string_offset) + fn back_peek_char(&self, req: &Request<'a, S>) -> u32 { + req.string.back_peek(self.string_offset) } - fn back_skip_char(&mut self, string: &S, skip: usize) { + fn back_skip_char(&mut self, req: &Request<'a, S>, skip: usize) { self.string_position -= skip; - self.string_offset = string.back_offset(self.string_offset, skip); + self.string_offset = req.string.back_offset(self.string_offset, skip); } - fn peek_code(&self, state: &State<'a, S>, peek: usize) -> u32 { - state.pattern_codes[self.code_position + peek] + fn peek_code(&self, req: &Request<'a, S>, peek: usize) -> u32 { + req.pattern_codes[self.code_position + peek] } fn skip_code(&mut self, skip: usize) { self.code_position += skip; } - fn skip_code_from(&mut self, state: &State<'a, S>, peek: usize) { - self.skip_code(self.peek_code(state, peek) as usize + 1); + fn skip_code_from(&mut self, req: &Request<'a, S>, peek: usize) { + self.skip_code(self.peek_code(req, peek) as usize + 1); } fn at_beginning(&self) -> bool { @@ 
-862,48 +1001,48 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { self.string_position == 0 } - fn at_end(&self, state: &State<'a, S>) -> bool { - self.string_position == state.end + fn at_end(&self, req: &Request<'a, S>) -> bool { + self.string_position == req.end } - fn at_linebreak(&self, state: &State<'a, S>) -> bool { - !self.at_end(state) && is_linebreak(self.peek_char(state)) + fn at_linebreak(&self, req: &Request<'a, S>) -> bool { + !self.at_end(req) && is_linebreak(self.peek_char(req)) } fn at_boundary bool>( &self, - state: &State<'a, S>, + req: &Request<'a, S>, mut word_checker: F, ) -> bool { - if self.at_beginning() && self.at_end(state) { + if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); - let this = !self.at_end(state) && word_checker(self.peek_char(state)); + let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); + let this = !self.at_end(req) && word_checker(self.peek_char(req)); this != that } fn at_non_boundary bool>( &self, - state: &State<'a, S>, + req: &Request<'a, S>, mut word_checker: F, ) -> bool { - if self.at_beginning() && self.at_end(state) { + if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(state)); - let this = !self.at_end(state) && word_checker(self.peek_char(state)); + let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); + let this = !self.at_end(req) && word_checker(self.peek_char(req)); this == that } - fn can_success(&self, state: &State<'a, S>) -> bool { + fn can_success(&self, req: &Request<'a, S>) -> bool { if !self.toplevel { return true; } - if state.match_all && !self.at_end(state) { + if req.match_all && !self.at_end(req) { return false; } - if state.must_advance && self.string_position == state.start { + if req.must_advance && self.string_position == req.start { return false; } true @@ -916,58 +1055,94 @@ 
impl<'a, S: StrDrive> MatchContext<'a, S> { fn failure(&mut self) { self.has_matched = Some(false); } + + fn next_from<'b>( + &mut self, + peek: usize, + req: &Request<'a, S>, + state: &'b mut State<'a, S>, + f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + ) -> &'b mut Self { + self.next_offset(self.peek_code(req, peek) as usize + 1, state, f) + } + + fn next_offset<'b>( + &mut self, + offset: usize, + state: &'b mut State<'a, S>, + f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + ) -> &'b mut Self { + self.next_at(self.code_position + offset, state, f) + } + + fn next_at<'b>( + &mut self, + code_position: usize, + state: &'b mut State<'a, S>, + f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + ) -> &'b mut Self { + self.handler = Some(f); + state.next_context.insert(MatchContext { + code_position, + has_matched: None, + handler: None, + count: -1, + ..*self + }) + } } -fn at<'a, S: StrDrive>(state: &State<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { +fn at<'a, S: StrDrive>(req: &Request<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), - SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(state)), - SreAtCode::BOUNDARY => ctx.at_boundary(state, is_word), - SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(state, is_word), + SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), + SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word), + SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word), SreAtCode::END => { - (ctx.remaining_chars(state) == 1 && ctx.at_linebreak(state)) || ctx.at_end(state) + (ctx.remaining_chars(req) == 1 && ctx.at_linebreak(req)) || ctx.at_end(req) } - SreAtCode::END_LINE => ctx.at_linebreak(state) || ctx.at_end(state), - SreAtCode::END_STRING => ctx.at_end(state), - SreAtCode::LOC_BOUNDARY => ctx.at_boundary(state, is_loc_word), - 
SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(state, is_loc_word), - SreAtCode::UNI_BOUNDARY => ctx.at_boundary(state, is_uni_word), - SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(state, is_uni_word), + SreAtCode::END_LINE => ctx.at_linebreak(req) || ctx.at_end(req), + SreAtCode::END_STRING => ctx.at_end(req), + SreAtCode::LOC_BOUNDARY => ctx.at_boundary(req, is_loc_word), + SreAtCode::LOC_NON_BOUNDARY => ctx.at_non_boundary(req, is_loc_word), + SreAtCode::UNI_BOUNDARY => ctx.at_boundary(req, is_uni_word), + SreAtCode::UNI_NON_BOUNDARY => ctx.at_non_boundary(req, is_uni_word), } } fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( - state: &State<'a, S>, + req: &Request<'a, S>, ctx: &mut MatchContext<'a, S>, f: F, ) { - if ctx.at_end(state) || !f(ctx.peek_code(state, 1), ctx.peek_char(state)) { + if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { ctx.failure(); } else { ctx.skip_code(2); - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( - state: &State<'a, S>, + req: &Request<'a, S>, ctx: &mut MatchContext<'a, S>, f: F, ) { - if ctx.at_end(state) || !f(&ctx.pattern(state)[2..], ctx.peek_char(state)) { + if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { ctx.failure(); } else { - ctx.skip_code_from(state, 1); - ctx.skip_char(state, 1); + ctx.skip_code_from(req, 1); + ctx.skip_char(req, 1); } } fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( + req: &Request<'a, S>, state: &State<'a, S>, ctx: &mut MatchContext<'a, S>, mut f: F, ) { - let (group_start, group_end) = state.get_marks(ctx.peek_code(state, 1) as usize); + let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); let (group_start, group_end) = match (group_start, group_end) { (Some(start), Some(end)) if start <= end => (start, end), _ => { @@ -977,16 +1152,16 @@ fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( let mut 
gctx = MatchContext { string_position: group_start, - string_offset: state.string.offset(0, group_start), + string_offset: req.string.offset(0, group_start), ..*ctx }; for _ in group_start..group_end { - if ctx.at_end(state) || f(ctx.peek_char(state)) != f(gctx.peek_char(state)) { + if ctx.at_end(req) || f(ctx.peek_char(req)) != f(gctx.peek_char(req)) { return ctx.failure(); } - ctx.skip_char(state, 1); - gctx.skip_char(state, 1); + ctx.skip_char(req, 1); + gctx.skip_char(req, 1); } ctx.skip_code(2); @@ -1122,60 +1297,56 @@ fn charset(set: &[u32], ch: u32) -> bool { } fn _count<'a, S: StrDrive>( + req: &Request<'a, S>, state: &mut State<'a, S>, ctx: &MatchContext<'a, S>, max_count: usize, ) -> usize { let mut ctx = *ctx; - let max_count = std::cmp::min(max_count, ctx.remaining_chars(state)); + let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); let end = ctx.string_position + max_count; - let opcode = SreOpcode::try_from(ctx.peek_code(state, 0)).unwrap(); + let opcode = SreOpcode::try_from(ctx.peek_code(req, 0)).unwrap(); match opcode { SreOpcode::ANY => { - while !ctx.string_position < end && !ctx.at_linebreak(state) { - ctx.skip_char(state, 1); + while !ctx.string_position < end && !ctx.at_linebreak(req) { + ctx.skip_char(req, 1); } } SreOpcode::ANY_ALL => { - ctx.skip_char(state, max_count); + ctx.skip_char(req, max_count); } SreOpcode::IN => { - while !ctx.string_position < end - && charset(&ctx.pattern(state)[2..], ctx.peek_char(state)) + while !ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req)) { - ctx.skip_char(state, 1); + ctx.skip_char(req, 1); } } SreOpcode::LITERAL => { - general_count_literal(state, &mut ctx, end, |code, c| code == c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code == c as u32); } SreOpcode::NOT_LITERAL => { - general_count_literal(state, &mut ctx, end, |code, c| code != c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code != c as u32); } 
SreOpcode::LITERAL_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { - code == lower_ascii(c) as u32 - }); + general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c) as u32); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { - code != lower_ascii(c) as u32 - }); + general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c) as u32); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(state, &mut ctx, end, char_loc_ignore); + general_count_literal(req, &mut ctx, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); + general_count_literal(req, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { + general_count_literal(req, &mut ctx, end, |code, c| { code == lower_unicode(c) as u32 }); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(state, &mut ctx, end, |code, c| { + general_count_literal(req, &mut ctx, end, |code, c| { code != lower_unicode(c) as u32 }); } @@ -1188,9 +1359,9 @@ fn _count<'a, S: StrDrive>( while count < max_count { ctx.code_position = reset_position; - let code = ctx.peek_code(state, 0); + let code = ctx.peek_code(req, 0); let code = SreOpcode::try_from(code).unwrap(); - dispatch(state, &mut ctx, code); + dispatch(req, state, &mut ctx, code); if ctx.has_matched == Some(false) { break; } @@ -1205,14 +1376,14 @@ fn _count<'a, S: StrDrive>( } fn general_count_literal<'a, S: StrDrive, F: FnMut(u32, u32) -> bool>( - state: &State<'a, S>, + req: &Request<'a, S>, ctx: &mut MatchContext<'a, S>, end: usize, mut f: F, ) { - let ch = ctx.peek_code(state, 1); - while !ctx.string_position < end && f(ch, ctx.peek_char(state)) { - ctx.skip_char(state, 1); + let ch = ctx.peek_code(req, 1); + while !ctx.string_position < end && f(ch, ctx.peek_char(req)) { + 
ctx.skip_char(req, 1); } } From c15387e97289386bbf0891a7ed18367220b59a15 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 16:44:47 +0200 Subject: [PATCH 063/705] refactor tests --- generate_tests.py | 2 +- src/engine.rs | 4 ++-- tests/tests.rs | 47 +++++++++++++++++++++++------------------------ 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/generate_tests.py b/generate_tests.py index b432720cd1..8adf043f29 100644 --- a/generate_tests.py +++ b/generate_tests.py @@ -33,7 +33,7 @@ def compile(cls, pattern, flags=0): def replace_compiled(m): line, indent, varname, pattern = m.groups() pattern = eval(pattern, {"re": CompiledPattern}) - pattern = f"Pattern {{ code: &{json.dumps(pattern.code)}, flags: SreFlag::from_bits_truncate({int(pattern.flags)}) }}" + pattern = f"Pattern {{ code: &{json.dumps(pattern.code)} }}" return f'''{line} {indent}// START GENERATED by generate_tests.py {indent}#[rustfmt::skip] let {varname} = {pattern}; diff --git a/src/engine.rs b/src/engine.rs index 5e1f1457ec..b9487a2fd2 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -81,7 +81,7 @@ pub struct State<'a, S: StrDrive> { pub string_position: usize, next_context: Option>, popped_has_matched: bool, - has_matched: bool, + pub has_matched: bool, } impl<'a, S: StrDrive> State<'a, S> { @@ -696,7 +696,7 @@ fn op_repeat<'a, S: StrDrive>( state.string_position = ctx.string_position; - let repeat_ctx_id = state.repeat_stack.len(); + let repeat_ctx_id = state.repeat_stack.len() - 1; // let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { let next_ctx = ctx.next_from(1, req, state, |req, state, ctx| { diff --git a/tests/tests.rs b/tests/tests.rs index cc5c4d1f38..b4ad09f7be 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1,18 +1,17 @@ -use sre_engine::constants::SreFlag; use sre_engine::engine; struct Pattern { code: &'static [u32], - flags: SreFlag, } impl Pattern { fn state<'a, S: engine::StrDrive>( &self, string: S, - range: std::ops::Range, - ) -> 
engine::State<'a, S> { - engine::State::new(string, range.start, range.end, self.flags, self.code) + ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + let req = engine::Request::new(string, 0, usize::MAX, self.code, false); + let state = engine::State::new(0); + (req, state) } } @@ -20,10 +19,10 @@ impl Pattern { fn test_2427() { // pattern lookbehind = re.compile(r'(? Date: Tue, 9 Aug 2022 16:54:41 +0200 Subject: [PATCH 064/705] refactor benches --- benches/benches.rs | 95 +++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index d000ceb62e..e24ea2f972 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -3,68 +3,77 @@ extern crate test; use test::Bencher; -use sre_engine::constants::SreFlag; use sre_engine::engine; -pub struct Pattern { - pub code: &'static [u32], - pub flags: SreFlag, + +struct Pattern { + code: &'static [u32], } impl Pattern { - pub fn state<'a, S: engine::StrDrive>( + fn state<'a, S: engine::StrDrive>( + &self, + string: S, + ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + self.state_range(string, 0..usize::MAX) + } + + fn state_range<'a, S: engine::StrDrive>( &self, string: S, range: std::ops::Range, - ) -> engine::State<'a, S> { - engine::State::new(string, range.start, range.end, self.flags, self.code) + ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + let req = engine::Request::new(string, range.start, range.end, self.code, false); + let state = engine::State::new(0); + (req, state) } } + #[bench] fn benchmarks(b: &mut Bencher) { // # test common prefix // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p1 = 
Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; // END GENERATED // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1] }; // END GENERATED - // pattern pn = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation + // pattern p3 = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1] }; // END GENERATED - // pattern pn = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation + // pattern p4 = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + 
#[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1] }; // END GENERATED - // pattern pn = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference + // pattern p5 = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference // START GENERATED by generate_tests.py - #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; // END GENERATED - // pattern pn = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization + // pattern p6 = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization // START GENERATED by generate_tests.py - #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; // END GENERATED - // pattern pn = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets + // pattern p7 = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets // START GENERATED by generate_tests.py - #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 
4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; // END GENERATED - // pattern pn = re.compile('Python') #, 'Python'), # Simple text literal + // pattern p8 = re.compile('Python') #, 'Python'), # Simple text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; // END GENERATED - // pattern pn = re.compile('.*Python') #, 'Python'), # Bad text literal + // pattern p9 = re.compile('.*Python') #, 'Python'), # Bad text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; // END GENERATED - // pattern pn = re.compile('.*Python.*') #, 'Python'), # Worse text literal + // pattern p10 = re.compile('.*Python.*') #, 'Python'), # Worse text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 
5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1] }; // END GENERATED - // pattern pn = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping + // pattern p11 = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping // START GENERATED by generate_tests.py - #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1], flags: SreFlag::from_bits_truncate(32) }; + #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1] }; // END GENERATED let tests = [ @@ -83,29 +92,29 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { - let mut state = p.state(s.clone(), 0..usize::MAX); - state.search(); + let (mut req, mut state) = p.state(s.clone()); + state.search(&mut req); assert!(state.has_matched); - state = p.state(s.clone(), 0..usize::MAX); - state.pymatch(); + let (mut req, mut state) = p.state(s.clone()); + state.pymatch(&mut req); assert!(state.has_matched); - state = p.state(s.clone(), 0..usize::MAX); - state.match_all = true; - state.pymatch(); + let (mut req, mut state) = p.state(s.clone()); + req.match_all = true; + state.pymatch(&mut req); assert!(state.has_matched); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); - state = p.state(s2.as_str(), 0..usize::MAX); - state.search(); + let (mut req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); + state.search(&mut req); assert!(state.has_matched); - state = p.state(s2.as_str(), 10000..usize::MAX); - state.pymatch(); + let (mut req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); + state.pymatch(&mut req); assert!(state.has_matched); - state = p.state(s2.as_str(), 10000..10000 + s.len()); - state.pymatch(); + let (mut req, mut state) = 
p.state_range(s2.as_str(), 10000..10000 + s.len()); + state.pymatch(&mut req); assert!(state.has_matched); - state = p.state(s2.as_str(), 10000..10000 + s.len()); - state.match_all = true; - state.pymatch(); + let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); + req.match_all = true; + state.pymatch(&mut req); assert!(state.has_matched); } }) From de8973d77a40303693e8e15da70fe63f6f974546 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 17:34:03 +0200 Subject: [PATCH 065/705] simplify lifetime --- benches/benches.rs | 6 +- src/engine.rs | 265 ++++++++++++++++++--------------------------- tests/tests.rs | 4 +- 3 files changed, 110 insertions(+), 165 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index e24ea2f972..8e0e87935a 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -13,7 +13,7 @@ impl Pattern { fn state<'a, S: engine::StrDrive>( &self, string: S, - ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + ) -> (engine::Request<'a, S>, engine::State) { self.state_range(string, 0..usize::MAX) } @@ -21,9 +21,9 @@ impl Pattern { &self, string: S, range: std::ops::Range, - ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, range.start, range.end, self.code, false); - let state = engine::State::new(0); + let state = engine::State::new(); (req, state) } } diff --git a/src/engine.rs b/src/engine.rs index b9487a2fd2..ace75d1a36 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -39,25 +39,6 @@ impl<'a, S: StrDrive> Request<'a, S> { } } -macro_rules! 
next_ctx { - (offset $offset:expr, $state:expr, $ctx:expr, $handler:expr) => { - next_ctx!(position $ctx.code_position + $offset, $state, $ctx, $handler) - }; - (from $peek:expr, $state:expr, $ctx:expr, $handler:expr) => { - next_ctx!(offset $ctx.peek_code($state, $peek) as usize + 1, $state, $ctx, $handler) - }; - (position $position:expr, $state:expr, $ctx:expr, $handler:expr) => {{ - $ctx.handler = Some($handler); - $state.next_context.insert(MatchContext { - code_position: $position, - has_matched: None, - handler: None, - count: -1, - ..*$ctx - }) - }}; -} - macro_rules! mark { (push, $state:expr) => { $state @@ -72,27 +53,27 @@ macro_rules! mark { } #[derive(Debug)] -pub struct State<'a, S: StrDrive> { +pub struct State { pub marks: Vec>, pub lastindex: isize, marks_stack: Vec<(Vec>, isize)>, - context_stack: Vec>, + context_stack: Vec>, repeat_stack: Vec, pub string_position: usize, - next_context: Option>, + next_context: Option>, popped_has_matched: bool, pub has_matched: bool, } -impl<'a, S: StrDrive> State<'a, S> { - pub fn new(string_position: usize) -> Self { +impl State { + pub fn new() -> Self { Self { marks: Vec::new(), lastindex: -1, marks_stack: Vec::new(), context_stack: Vec::new(), repeat_stack: Vec::new(), - string_position, + string_position: 0, next_context: None, popped_has_matched: false, has_matched: false, @@ -145,7 +126,7 @@ impl<'a, S: StrDrive> State<'a, S> { self.marks_stack.pop(); } - fn _match(&mut self, req: &mut Request<'a, S>) { + fn _match(&mut self, req: &mut Request) { while let Some(mut ctx) = self.context_stack.pop() { if let Some(handler) = ctx.handler.take() { handler(req, self, &mut ctx); @@ -169,7 +150,9 @@ impl<'a, S: StrDrive> State<'a, S> { self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self, req: &mut Request<'a, S>) { + pub fn pymatch(&mut self, req: &mut Request) { + self.string_position = req.start; + let ctx = MatchContext { string_position: req.start, string_offset: req.string.offset(0, 
req.start), @@ -185,7 +168,9 @@ impl<'a, S: StrDrive> State<'a, S> { self._match(req); } - pub fn search(&mut self, req: &mut Request<'a, S>) { + pub fn search(&mut self, req: &mut Request) { + self.string_position = req.start; + // TODO: optimize by op info and skip prefix if req.start > req.end { return; @@ -196,7 +181,7 @@ impl<'a, S: StrDrive> State<'a, S> { let mut start_offset = req.string.offset(0, req.start); - let mut ctx = MatchContext { + let ctx = MatchContext { string_position: req.start, string_offset: start_offset, code_position: 0, @@ -240,10 +225,10 @@ impl<'a, S: StrDrive> State<'a, S> { } } -fn dispatch<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn dispatch( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, opcode: SreOpcode, ) { match opcode { @@ -410,11 +395,7 @@ fn dispatch<'a, S: StrDrive>( /* assert subpattern */ /* */ -fn op_assert<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.failure(); @@ -435,18 +416,14 @@ fn op_assert<'a, S: StrDrive>( /* assert not subpattern */ /* */ -fn op_assert_not<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_assert_not(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.skip_code_from(req, 1); } - let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { + let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { if state.popped_has_matched { ctx.failure(); } else { @@ -460,20 +437,16 @@ fn op_assert_not<'a, S: StrDrive>( // alternation // <0=skip> code ... 
-fn op_branch<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { mark!(push, state); ctx.count = 1; create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, + fn create_context( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let branch_offset = ctx.count as usize; let next_length = ctx.peek_code(req, branch_offset) as isize; @@ -485,14 +458,10 @@ fn op_branch<'a, S: StrDrive>( state.string_position = ctx.string_position; ctx.count += next_length; - next_ctx!(offset branch_offset + 1, state, ctx, callback); + ctx.next_offset(branch_offset + 1, state, callback); } - fn callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -502,10 +471,10 @@ fn op_branch<'a, S: StrDrive>( } /* <1=min> <2=max> item tail */ -fn op_min_repeat_one<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn op_min_repeat_one( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let min_count = ctx.peek_code(req, 2) as usize; @@ -536,10 +505,10 @@ fn op_min_repeat_one<'a, S: StrDrive>( mark!(push, state); create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, + fn create_context( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let max_count = ctx.peek_code(req, 3) as usize; @@ -553,11 +522,7 @@ fn op_min_repeat_one<'a, S: StrDrive>( } } - fn callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn 
callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -582,11 +547,7 @@ exactly one character wide, and we're not already collecting backtracking points. for other cases, use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ -fn op_repeat_one<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { let min_count = ctx.peek_code(req, 2) as usize; let max_count = ctx.peek_code(req, 3) as usize; @@ -613,10 +574,10 @@ fn op_repeat_one<'a, S: StrDrive>( ctx.count = count as isize; create_context(req, state, ctx); - fn create_context<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, + fn create_context( + req: &Request, + state: &mut State, + ctx: &mut MatchContext, ) { let min_count = ctx.peek_code(req, 2) as isize; let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); @@ -641,11 +602,7 @@ fn op_repeat_one<'a, S: StrDrive>( ctx.next_from(1, req, state, callback); } - fn callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -678,11 +635,7 @@ struct RepeatContext { /* create repeat context. 
all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ -fn op_repeat<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, -) { +fn op_repeat(req: &Request, state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = RepeatContext { count: -1, min_count: ctx.peek_code(req, 2) as usize, @@ -698,8 +651,7 @@ fn op_repeat<'a, S: StrDrive>( let repeat_ctx_id = state.repeat_stack.len() - 1; - // let next_ctx = next_ctx!(from 1, state, ctx, |state, ctx| { - let next_ctx = ctx.next_from(1, req, state, |req, state, ctx| { + let next_ctx = ctx.next_from(1, req, state, |_, state, ctx| { ctx.has_matched = Some(state.popped_has_matched); state.repeat_stack.pop(); }); @@ -707,7 +659,7 @@ fn op_repeat<'a, S: StrDrive>( } /* minimizing repeat */ -fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_min_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = state.repeat_stack.last_mut().unwrap(); state.string_position = ctx.string_position; @@ -716,8 +668,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -736,8 +687,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex let repeat_ctx_prev_id = repeat_ctx.prev_id; // see if the tail matches - // let next_ctx = next_ctx!(offset 1, state, ctx, |state, ctx| { - let next_ctx = ctx.next_offset(1, state, |req, state, ctx| { + let next_ctx = ctx.next_offset(1, state, |_, state, ctx| { if state.popped_has_matched { return ctx.success(); } @@ -762,8 +712,7 @@ fn 
op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex /* zero-width match protection */ repeat_ctx.last_position = state.string_position; - // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -777,7 +726,7 @@ fn op_min_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex } /* maximizing repeat */ -fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { +fn op_max_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; state.string_position = ctx.string_position; @@ -786,8 +735,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex if (repeat_ctx.count as usize) < repeat_ctx.min_count { // not enough matches - // next_ctx!(position repeat_ctx.code_position + 4, state, ctx, |state, ctx| { - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { if state.popped_has_matched { ctx.success(); } else { @@ -809,7 +757,7 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex ctx.count = repeat_ctx.last_position as isize; repeat_ctx.last_position = state.string_position; - ctx.next_at(repeat_ctx.code_position + 4, state, |req, state, ctx| { + ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { let save_last_position = ctx.count as usize; let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; repeat_ctx.last_position = save_last_position; @@ -826,22 +774,21 @@ fn op_max_until<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContex /* cannot match more repeated items here. 
make sure the tail matches */ - let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + let repeat_ctx_prev_id = repeat_ctx.prev_id; + let next_ctx = ctx.next_offset(1, state, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx_prev_id; }); return; } /* cannot match more repeated items here. make sure the tail matches */ - let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + // let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); + let repeat_ctx_prev_id = repeat_ctx.prev_id; + let next_ctx = ctx.next_offset(1, state, tail_callback); + next_ctx.repeat_ctx_id = repeat_ctx_prev_id; - fn tail_callback<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &mut MatchContext<'a, S>, - ) { + fn tail_callback(_: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { ctx.success(); } else { @@ -926,19 +873,21 @@ impl<'a> StrDrive for &'a [u8] { } } +type OpFunc = for<'a> fn(&Request<'a, S>, &mut State, &mut MatchContext); + #[derive(Clone, Copy)] -struct MatchContext<'a, S: StrDrive> { +struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, has_matched: Option, toplevel: bool, - handler: Option, &mut State<'a, S>, &mut Self)>, + handler: Option>, repeat_ctx_id: usize, count: isize, } -impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { +impl<'a, S: StrDrive> std::fmt::Debug for MatchContext { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MatchContext") .field("string_position", &self.string_position) @@ -953,38 +902,38 @@ impl<'a, S: StrDrive> std::fmt::Debug for MatchContext<'a, S> { } } -impl<'a, S: StrDrive> MatchContext<'a, S> { - fn pattern(&self, req: &Request<'a, S>) -> &'a [u32] { +impl MatchContext { + fn pattern<'a>(&self, req: &Request<'a, S>) -> &'a [u32] { &req.pattern_codes[self.code_position..] 
} - fn remaining_codes(&self, req: &Request<'a, S>) -> usize { + fn remaining_codes(&self, req: &Request) -> usize { req.pattern_codes.len() - self.code_position } - fn remaining_chars(&self, req: &Request<'a, S>) -> usize { + fn remaining_chars(&self, req: &Request) -> usize { req.end - self.string_position } - fn peek_char(&self, req: &Request<'a, S>) -> u32 { + fn peek_char(&self, req: &Request) -> u32 { req.string.peek(self.string_offset) } - fn skip_char(&mut self, req: &Request<'a, S>, skip: usize) { + fn skip_char(&mut self, req: &Request, skip: usize) { self.string_position += skip; self.string_offset = req.string.offset(self.string_offset, skip); } - fn back_peek_char(&self, req: &Request<'a, S>) -> u32 { + fn back_peek_char(&self, req: &Request) -> u32 { req.string.back_peek(self.string_offset) } - fn back_skip_char(&mut self, req: &Request<'a, S>, skip: usize) { + fn back_skip_char(&mut self, req: &Request, skip: usize) { self.string_position -= skip; self.string_offset = req.string.back_offset(self.string_offset, skip); } - fn peek_code(&self, req: &Request<'a, S>, peek: usize) -> u32 { + fn peek_code(&self, req: &Request, peek: usize) -> u32 { req.pattern_codes[self.code_position + peek] } @@ -992,7 +941,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { self.code_position += skip; } - fn skip_code_from(&mut self, req: &Request<'a, S>, peek: usize) { + fn skip_code_from(&mut self, req: &Request, peek: usize) { self.skip_code(self.peek_code(req, peek) as usize + 1); } @@ -1001,19 +950,15 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { self.string_position == 0 } - fn at_end(&self, req: &Request<'a, S>) -> bool { + fn at_end(&self, req: &Request) -> bool { self.string_position == req.end } - fn at_linebreak(&self, req: &Request<'a, S>) -> bool { + fn at_linebreak(&self, req: &Request) -> bool { !self.at_end(req) && is_linebreak(self.peek_char(req)) } - fn at_boundary bool>( - &self, - req: &Request<'a, S>, - mut word_checker: F, - ) -> bool { + fn 
at_boundary bool>(&self, req: &Request, mut word_checker: F) -> bool { if self.at_beginning() && self.at_end(req) { return false; } @@ -1024,7 +969,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn at_non_boundary bool>( &self, - req: &Request<'a, S>, + req: &Request, mut word_checker: F, ) -> bool { if self.at_beginning() && self.at_end(req) { @@ -1035,7 +980,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { this == that } - fn can_success(&self, req: &Request<'a, S>) -> bool { + fn can_success(&self, req: &Request) -> bool { if !self.toplevel { return true; } @@ -1059,9 +1004,9 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn next_from<'b>( &mut self, peek: usize, - req: &Request<'a, S>, - state: &'b mut State<'a, S>, - f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + req: &Request, + state: &'b mut State, + f: OpFunc, ) -> &'b mut Self { self.next_offset(self.peek_code(req, peek) as usize + 1, state, f) } @@ -1069,8 +1014,8 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn next_offset<'b>( &mut self, offset: usize, - state: &'b mut State<'a, S>, - f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + state: &'b mut State, + f: OpFunc, ) -> &'b mut Self { self.next_at(self.code_position + offset, state, f) } @@ -1078,8 +1023,8 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { fn next_at<'b>( &mut self, code_position: usize, - state: &'b mut State<'a, S>, - f: fn(&Request<'a, S>, &mut State<'a, S>, &mut Self), + state: &'b mut State, + f: OpFunc, ) -> &'b mut Self { self.handler = Some(f); state.next_context.insert(MatchContext { @@ -1092,7 +1037,7 @@ impl<'a, S: StrDrive> MatchContext<'a, S> { } } -fn at<'a, S: StrDrive>(req: &Request<'a, S>, ctx: &MatchContext<'a, S>, atcode: SreAtCode) -> bool { +fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), @@ -1110,9 
+1055,9 @@ fn at<'a, S: StrDrive>(req: &Request<'a, S>, ctx: &MatchContext<'a, S>, atcode: } } -fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( - req: &Request<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_op_literal bool>( + req: &Request, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { @@ -1123,9 +1068,9 @@ fn general_op_literal<'a, S: StrDrive, F: FnOnce(u32, u32) -> bool>( } } -fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( - req: &Request<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_op_in bool>( + req: &Request, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { @@ -1136,10 +1081,10 @@ fn general_op_in<'a, S: StrDrive, F: FnOnce(&[u32], u32) -> bool>( } } -fn general_op_groupref<'a, S: StrDrive, F: FnMut(u32) -> u32>( - req: &Request<'a, S>, - state: &State<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_op_groupref u32>( + req: &Request, + state: &State, + ctx: &mut MatchContext, mut f: F, ) { let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); @@ -1296,10 +1241,10 @@ fn charset(set: &[u32], ch: u32) -> bool { false } -fn _count<'a, S: StrDrive>( - req: &Request<'a, S>, - state: &mut State<'a, S>, - ctx: &MatchContext<'a, S>, +fn _count( + req: &Request, + state: &mut State, + ctx: &MatchContext, max_count: usize, ) -> usize { let mut ctx = *ctx; @@ -1375,9 +1320,9 @@ fn _count<'a, S: StrDrive>( ctx.string_position - state.string_position } -fn general_count_literal<'a, S: StrDrive, F: FnMut(u32, u32) -> bool>( - req: &Request<'a, S>, - ctx: &mut MatchContext<'a, S>, +fn general_count_literal bool>( + req: &Request, + ctx: &mut MatchContext, end: usize, mut f: F, ) { diff --git a/tests/tests.rs b/tests/tests.rs index b4ad09f7be..ead111c74a 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -8,9 +8,9 @@ impl Pattern { fn state<'a, S: engine::StrDrive>( &self, 
string: S, - ) -> (engine::Request<'a, S>, engine::State<'a, S>) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, 0, usize::MAX, self.code, false); - let state = engine::State::new(0); + let state = engine::State::new(); (req, state) } } From c494feb7f776e8e15185710f72d41b603db995d8 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 21:34:06 +0200 Subject: [PATCH 066/705] refactor split Marks --- Cargo.toml | 1 + benches/benches.rs | 2 +- src/engine.rs | 242 ++++++++++++++++++++++++++++++--------------- tests/tests.rs | 5 +- 4 files changed, 166 insertions(+), 84 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4b403f2861..8993c1e71d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,3 +12,4 @@ include = ["LICENSE", "src/**/*.rs"] [dependencies] num_enum = "0.5" bitflags = "1.2" +optional = "0.5" diff --git a/benches/benches.rs b/benches/benches.rs index 8e0e87935a..f19b92d64b 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -23,7 +23,7 @@ impl Pattern { range: std::ops::Range, ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, range.start, range.end, self.code, false); - let state = engine::State::new(); + let state = engine::State::default(); (req, state) } } diff --git a/src/engine.rs b/src/engine.rs index ace75d1a36..087d64e8cd 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -2,7 +2,9 @@ use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; use super::MAXREPEAT; +use optional::Optioned; use std::convert::TryFrom; +use std::ops::Deref; const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') @@ -39,24 +41,98 @@ impl<'a, S: StrDrive> Request<'a, S> { } } -macro_rules! 
mark { - (push, $state:expr) => { - $state - .marks_stack - .push(($state.marks.clone(), $state.lastindex)) - }; - (pop, $state:expr) => { - let (marks, lastindex) = $state.marks_stack.pop().unwrap(); - $state.marks = marks; - $state.lastindex = lastindex; - }; +// macro_rules! mark { +// (push, $state:expr) => { +// $state +// .marks_stack +// .push(($state.marks.clone(), $state.lastindex)) +// }; +// (pop, $state:expr) => { +// let (marks, lastindex) = $state.marks_stack.pop().unwrap(); +// $state.marks = marks; +// $state.lastindex = lastindex; +// }; +// } + +#[derive(Debug)] +pub struct Marks { + last_index: isize, + marks: Vec>, + marks_stack: Vec<(Vec>, isize)>, +} + +impl Default for Marks { + fn default() -> Self { + Self { + last_index: -1, + marks: Vec::new(), + marks_stack: Vec::new(), + } + } +} + +impl Deref for Marks { + type Target = Vec>; + + fn deref(&self) -> &Self::Target { + &self.marks + } +} + +impl Marks { + pub fn get(&self, group_index: usize) -> (Optioned, Optioned) { + let marks_index = 2 * group_index; + if marks_index + 1 < self.marks.len() { + (self.marks[marks_index], self.marks[marks_index + 1]) + } else { + (Optioned::none(), Optioned::none()) + } + } + + pub fn last_index(&self) -> isize { + self.last_index + } + + fn set(&mut self, mark_nr: usize, position: usize) { + if mark_nr & 1 != 0 { + self.last_index = mark_nr as isize / 2 + 1; + } + if mark_nr >= self.marks.len() { + self.marks.resize(mark_nr + 1, Optioned::none()); + } + self.marks[mark_nr] = Optioned::some(position); + } + + fn push(&mut self) { + self.marks_stack.push((self.marks.clone(), self.last_index)); + } + + fn pop(&mut self) { + let (marks, last_index) = self.marks_stack.pop().unwrap(); + self.marks = marks; + self.last_index = last_index; + } + + fn pop_keep(&mut self) { + let (marks, last_index) = self.marks_stack.last().unwrap().clone(); + self.marks = marks; + self.last_index = last_index; + } + + fn pop_discard(&mut self) { + self.marks_stack.pop(); + } + 
+ fn clear(&mut self) { + self.last_index = -1; + self.marks.clear(); + self.marks_stack.clear(); + } } #[derive(Debug)] pub struct State { - pub marks: Vec>, - pub lastindex: isize, - marks_stack: Vec<(Vec>, isize)>, + pub marks: Marks, context_stack: Vec>, repeat_stack: Vec, pub string_position: usize, @@ -65,25 +141,23 @@ pub struct State { pub has_matched: bool, } -impl State { - pub fn new() -> Self { +impl Default for State { + fn default() -> Self { Self { - marks: Vec::new(), - lastindex: -1, - marks_stack: Vec::new(), - context_stack: Vec::new(), - repeat_stack: Vec::new(), - string_position: 0, - next_context: None, - popped_has_matched: false, - has_matched: false, + marks: Default::default(), + context_stack: Default::default(), + repeat_stack: Default::default(), + string_position: Default::default(), + next_context: Default::default(), + popped_has_matched: Default::default(), + has_matched: Default::default(), } } +} +impl State { pub fn reset(&mut self, string_position: usize) { - self.lastindex = -1; self.marks.clear(); - self.marks_stack.clear(); self.context_stack.clear(); self.repeat_stack.clear(); self.string_position = string_position; @@ -92,23 +166,23 @@ impl State { self.has_matched = false; } - fn set_mark(&mut self, mark_nr: usize, position: usize) { - if mark_nr & 1 != 0 { - self.lastindex = mark_nr as isize / 2 + 1; - } - if mark_nr >= self.marks.len() { - self.marks.resize(mark_nr + 1, None); - } - self.marks[mark_nr] = Some(position); - } - fn get_marks(&self, group_index: usize) -> (Option, Option) { - let marks_index = 2 * group_index; - if marks_index + 1 < self.marks.len() { - (self.marks[marks_index], self.marks[marks_index + 1]) - } else { - (None, None) - } - } + // fn set_mark(&mut self, mark_nr: usize, position: usize) { + // if mark_nr & 1 != 0 { + // self.lastindex = mark_nr as isize / 2 + 1; + // } + // if mark_nr >= self.marks.len() { + // self.marks.resize(mark_nr + 1, None); + // } + // self.marks[mark_nr] = 
Some(position); + // } + // fn get_marks(&self, group_index: usize) -> (Option, Option) { + // let marks_index = 2 * group_index; + // if marks_index + 1 < self.marks.len() { + // (self.marks[marks_index], self.marks[marks_index + 1]) + // } else { + // (None, None) + // } + // } // fn marks_push(&mut self) { // self.marks_stack.push((self.marks.clone(), self.lastindex)); // } @@ -117,14 +191,14 @@ impl State { // self.marks = marks; // self.lastindex = lastindex; // } - fn marks_pop_keep(&mut self) { - let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); - self.marks = marks; - self.lastindex = lastindex; - } - fn marks_pop_discard(&mut self) { - self.marks_stack.pop(); - } + // fn marks_pop_keep(&mut self) { + // let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); + // self.marks = marks; + // self.lastindex = lastindex; + // } + // fn marks_pop_discard(&mut self) { + // self.marks_stack.pop(); + // } fn _match(&mut self, req: &mut Request) { while let Some(mut ctx) = self.context_stack.pop() { @@ -311,7 +385,9 @@ fn dispatch( general_op_literal(req, ctx, |code, c| !char_loc_ignore(code, c)) } SreOpcode::MARK => { - state.set_mark(ctx.peek_code(req, 1) as usize, ctx.string_position); + state + .marks + .set(ctx.peek_code(req, 1) as usize, ctx.string_position); ctx.skip_code(2); } SreOpcode::MAX_UNTIL => op_max_until(state, ctx), @@ -324,12 +400,14 @@ fn dispatch( SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(req, state, ctx, lower_locate), SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(req, state, ctx, lower_unicode), SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); - match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => { - ctx.skip_code(3); - } - _ => ctx.skip_code_from(req, 2), + let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); + if group_start.is_some() + && group_end.is_some() + && 
group_start.unpack() <= group_end.unpack() + { + ctx.skip_code(3); + } else { + ctx.skip_code_from(req, 2) } } _ => unreachable!("unexpected opcode"), @@ -438,7 +516,7 @@ fn op_assert_not(req: &Request, state: &mut State, ctx: &mut // alternation // <0=skip> code ... fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { - mark!(push, state); + state.marks.push(); ctx.count = 1; create_context(req, state, ctx); @@ -451,7 +529,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc let branch_offset = ctx.count as usize; let next_length = ctx.peek_code(req, branch_offset) as isize; if next_length == 0 { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } @@ -465,7 +543,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc if state.popped_has_matched { return ctx.success(); } - state.marks_pop_keep(); + state.marks.pop_keep(); create_context(req, state, ctx); } } @@ -502,7 +580,7 @@ fn op_min_repeat_one( return ctx.success(); } - mark!(push, state); + state.marks.push(); create_context(req, state, ctx); fn create_context( @@ -517,7 +595,7 @@ fn op_min_repeat_one( // next_ctx!(from 1, state, ctx, callback); ctx.next_from(1, req, state, callback); } else { - state.marks_pop_discard(); + state.marks.pop_discard(); ctx.failure(); } } @@ -530,13 +608,13 @@ fn op_min_repeat_one( state.string_position = ctx.string_position; if _count(req, state, ctx, 1) == 0 { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } ctx.skip_char(req, 1); ctx.count += 1; - state.marks_pop_keep(); + state.marks.pop_keep(); create_context(req, state, ctx); } } @@ -570,7 +648,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut return ctx.success(); } - mark!(push, state); + state.marks.push(); ctx.count = count as isize; create_context(req, state, ctx); @@ -587,7 +665,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut let c = ctx.peek_code(req, ctx.peek_code(req, 1) as 
usize + 2); while ctx.at_end(req) || ctx.peek_char(req) != c { if ctx.count <= min_count { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } ctx.back_skip_char(req, 1); @@ -610,14 +688,14 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut let min_count = ctx.peek_code(req, 2) as isize; if ctx.count <= min_count { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.failure(); } ctx.back_skip_char(req, 1); ctx.count -= 1; - state.marks_pop_keep(); + state.marks.pop_keep(); create_context(req, state, ctx); } } @@ -680,7 +758,7 @@ fn op_min_until(state: &mut State, ctx: &mut MatchContext) { return; } - mark!(push, state); + state.marks.push(); ctx.count = ctx.repeat_ctx_id as isize; @@ -698,7 +776,7 @@ fn op_min_until(state: &mut State, ctx: &mut MatchContext) { state.string_position = ctx.string_position; - mark!(pop, state); + state.marks.pop(); // match more until tail matches @@ -752,7 +830,7 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { { /* we may have enough matches, but if we can match another item, do so */ - mark!(push, state); + state.marks.push(); ctx.count = repeat_ctx.last_position as isize; repeat_ctx.last_position = state.string_position; @@ -763,11 +841,11 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { repeat_ctx.last_position = save_last_position; if state.popped_has_matched { - state.marks_pop_discard(); + state.marks.pop_discard(); return ctx.success(); } - mark!(pop, state); + state.marks.pop(); repeat_ctx.count -= 1; state.string_position = ctx.string_position; @@ -1087,12 +1165,14 @@ fn general_op_groupref u32>( ctx: &mut MatchContext, mut f: F, ) { - let (group_start, group_end) = state.get_marks(ctx.peek_code(req, 1) as usize); - let (group_start, group_end) = match (group_start, group_end) { - (Some(start), Some(end)) if start <= end => (start, end), - _ => { - return ctx.failure(); - } + let (group_start, group_end) = 
state.marks.get(ctx.peek_code(req, 1) as usize); + let (group_start, group_end) = if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + (group_start.unpack(), group_end.unpack()) + } else { + return ctx.failure(); }; let mut gctx = MatchContext { diff --git a/tests/tests.rs b/tests/tests.rs index ead111c74a..cb11db3483 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -10,7 +10,7 @@ impl Pattern { string: S, ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, 0, usize::MAX, self.code, false); - let state = engine::State::new(); + let state = engine::State::default(); (req, state) } } @@ -62,13 +62,14 @@ fn test_zerowidth() { #[test] fn test_repeat_context_panic() { + use optional::Optioned; // pattern p = re.compile(r'(?:a*?(xx)??z)*') // START GENERATED by generate_tests.py #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1] }; // END GENERATED let (mut req, mut state) = p.state("axxzaz"); state.pymatch(&mut req); - assert!(state.marks == vec![Some(1), Some(3)]); + assert!(*state.marks == vec![Optioned::some(1), Optioned::some(3)]); } #[test] From 18258000cde2c848198b745e92f74a89d48c7fe8 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 22:17:01 +0200 Subject: [PATCH 067/705] clearup --- src/engine.rs | 78 +++++++++++---------------------------------------- 1 file changed, 16 insertions(+), 62 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 087d64e8cd..652ca04c27 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,6 +1,6 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; +use super::constants::{SreAtCode, SreCatCode, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; use std::convert::TryFrom; @@ -41,19 +41,6 @@ impl<'a, S: StrDrive> 
Request<'a, S> { } } -// macro_rules! mark { -// (push, $state:expr) => { -// $state -// .marks_stack -// .push(($state.marks.clone(), $state.lastindex)) -// }; -// (pop, $state:expr) => { -// let (marks, lastindex) = $state.marks_stack.pop().unwrap(); -// $state.marks = marks; -// $state.lastindex = lastindex; -// }; -// } - #[derive(Debug)] pub struct Marks { last_index: isize, @@ -135,6 +122,7 @@ pub struct State { pub marks: Marks, context_stack: Vec>, repeat_stack: Vec, + pub start: usize, pub string_position: usize, next_context: Option>, popped_has_matched: bool, @@ -144,62 +132,30 @@ pub struct State { impl Default for State { fn default() -> Self { Self { - marks: Default::default(), - context_stack: Default::default(), - repeat_stack: Default::default(), - string_position: Default::default(), - next_context: Default::default(), - popped_has_matched: Default::default(), - has_matched: Default::default(), + marks: Marks::default(), + context_stack: Vec::new(), + repeat_stack: Vec::new(), + start: 0, + string_position: 0, + next_context: None, + popped_has_matched: false, + has_matched: false, } } } impl State { - pub fn reset(&mut self, string_position: usize) { + pub fn reset(&mut self, start: usize) { self.marks.clear(); self.context_stack.clear(); self.repeat_stack.clear(); - self.string_position = string_position; + self.start = start; + self.string_position = start; self.next_context = None; self.popped_has_matched = false; self.has_matched = false; } - // fn set_mark(&mut self, mark_nr: usize, position: usize) { - // if mark_nr & 1 != 0 { - // self.lastindex = mark_nr as isize / 2 + 1; - // } - // if mark_nr >= self.marks.len() { - // self.marks.resize(mark_nr + 1, None); - // } - // self.marks[mark_nr] = Some(position); - // } - // fn get_marks(&self, group_index: usize) -> (Option, Option) { - // let marks_index = 2 * group_index; - // if marks_index + 1 < self.marks.len() { - // (self.marks[marks_index], self.marks[marks_index + 1]) - // } else { - 
// (None, None) - // } - // } - // fn marks_push(&mut self) { - // self.marks_stack.push((self.marks.clone(), self.lastindex)); - // } - // fn marks_pop(&mut self) { - // let (marks, lastindex) = self.marks_stack.pop().unwrap(); - // self.marks = marks; - // self.lastindex = lastindex; - // } - // fn marks_pop_keep(&mut self) { - // let (marks, lastindex) = self.marks_stack.last().unwrap().clone(); - // self.marks = marks; - // self.lastindex = lastindex; - // } - // fn marks_pop_discard(&mut self) { - // self.marks_stack.pop(); - // } - fn _match(&mut self, req: &mut Request) { while let Some(mut ctx) = self.context_stack.pop() { if let Some(handler) = ctx.handler.take() { @@ -225,6 +181,7 @@ impl State { } pub fn pymatch(&mut self, req: &mut Request) { + self.start = req.start; self.string_position = req.start; let ctx = MatchContext { @@ -243,6 +200,7 @@ impl State { } pub fn search(&mut self, req: &mut Request) { + self.start = req.start; self.string_position = req.start; // TODO: optimize by op info and skip prefix @@ -479,7 +437,6 @@ fn op_assert(req: &Request, state: &mut State, ctx: &mut Matc return ctx.failure(); } - // let next_ctx = next_ctx!(offset 3, state, ctx, |req, state, ctx| { let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { if state.popped_has_matched { ctx.skip_code_from(req, 1); @@ -592,7 +549,6 @@ fn op_min_repeat_one( if max_count == MAXREPEAT || ctx.count as usize <= max_count { state.string_position = ctx.string_position; - // next_ctx!(from 1, state, ctx, callback); ctx.next_from(1, req, state, callback); } else { state.marks.pop_discard(); @@ -676,7 +632,6 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut state.string_position = ctx.string_position; // General case: backtracking - // next_ctx!(from 1, state, ctx, callback); ctx.next_from(1, req, state, callback); } @@ -861,7 +816,6 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { /* cannot match more repeated items here. 
make sure the tail matches */ - // let next_ctx = next_ctx!(offset 1, state, ctx, tail_callback); let repeat_ctx_prev_id = repeat_ctx.prev_id; let next_ctx = ctx.next_offset(1, state, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx_prev_id; @@ -965,7 +919,7 @@ struct MatchContext { count: isize, } -impl<'a, S: StrDrive> std::fmt::Debug for MatchContext { +impl std::fmt::Debug for MatchContext { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("MatchContext") .field("string_position", &self.string_position) From e42df1d8597bf964fb7f70b606df9e0be696c624 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Tue, 9 Aug 2022 22:18:01 +0200 Subject: [PATCH 068/705] update version to 0.4.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 8993c1e71d..373166d6db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.3.1" +version = "0.4.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From c4f10edc95ab7dc1f20bed2c9b4bddfc520fa660 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 14 Aug 2022 20:46:20 +0200 Subject: [PATCH 069/705] impl opinfo single literal --- benches/benches.rs | 24 +++---- src/engine.rs | 173 +++++++++++++++++++++++---------------------- tests/tests.rs | 51 +++++++++---- 3 files changed, 139 insertions(+), 109 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index f19b92d64b..604cf91f42 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -92,29 +92,29 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { - let (mut req, mut state) = p.state(s.clone()); - state.search(&mut req); + let (req, mut state) = p.state(s.clone()); + state.search(req); assert!(state.has_matched); - let (mut req, mut state) = p.state(s.clone()); - state.pymatch(&mut 
req); + let (req, mut state) = p.state(s.clone()); + state.pymatch(req); assert!(state.has_matched); let (mut req, mut state) = p.state(s.clone()); req.match_all = true; - state.pymatch(&mut req); + state.pymatch(req); assert!(state.has_matched); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); - let (mut req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); - state.search(&mut req); + let (req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); + state.search(req); assert!(state.has_matched); - let (mut req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); - state.pymatch(&mut req); + let (req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); + state.pymatch(req); assert!(state.has_matched); - let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); - state.pymatch(&mut req); + let (req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); + state.pymatch(req); assert!(state.has_matched); let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); req.match_all = true; - state.pymatch(&mut req); + state.pymatch(req); assert!(state.has_matched); } }) diff --git a/src/engine.rs b/src/engine.rs index 652ca04c27..0d645e046d 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,5 +1,7 @@ // good luck to those that follow; here be dragons +use crate::constants::SreInfo; + use super::constants::{SreAtCode, SreCatCode, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; @@ -10,6 +12,7 @@ const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } +#[derive(Debug, Clone, Copy)] pub struct Request<'a, S: StrDrive> { pub string: S, pub start: usize, @@ -180,7 +183,7 @@ impl State { self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self, req: &mut Request) { + pub fn pymatch(&mut self, mut req: Request) { self.start = req.start; self.string_position = req.start; @@ -196,10 +199,10 @@ impl State { 
}; self.context_stack.push(ctx); - self._match(req); + self._match(&mut req); } - pub fn search(&mut self, req: &mut Request) { + pub fn search(&mut self, mut req: Request) { self.start = req.start; self.string_position = req.start; @@ -208,12 +211,11 @@ impl State { return; } - // let start = self.start; - // let end = self.end; + let mut end = req.end; let mut start_offset = req.string.offset(0, req.start); - let ctx = MatchContext { + let mut ctx = MatchContext { string_position: req.start, string_offset: start_offset, code_position: 0, @@ -224,35 +226,97 @@ impl State { count: -1, }; - // if ctx.peek_code(self, 0) == SreOpcode::INFO as u32 { - // search_op_info(self, &mut ctx); - // if let Some(has_matched) = ctx.has_matched { - // self.has_matched = has_matched; - // return; - // } - // } + if ctx.peek_code(&req, 0) == SreOpcode::INFO as u32 { + /* optimization info block */ + /* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ + let req = &mut req; + let min = ctx.peek_code(req, 3) as usize; + + if ctx.remaining_chars(req) < min { + return; + } + + if min > 1 { + /* adjust end point (but make sure we leave at least one + character in there, so literal search will work) */ + // no overflow can happen as remaining chars >= min + end -= min - 1; + + // adjust ctx position + if end < ctx.string_position { + ctx.string_position = end; + ctx.string_offset = req.string.offset(0, ctx.string_position); + } + } + + let flags = SreInfo::from_bits_truncate(ctx.peek_code(req, 2)); + + if flags.contains(SreInfo::PREFIX) { + /* pattern starts with a known prefix */ + /* */ + let len = ctx.peek_code(req, 5) as usize; + let skip = ctx.peek_code(req, 6) as usize; + let prefix = &ctx.pattern(req)[7..]; + let overlap = &prefix[len - 1..]; + + if len == 1 { + // pattern starts with a literal character + ctx.skip_code_from(req, 1); + let c = prefix[0]; + req.must_advance = false; + + while !ctx.at_end(req) { + // find the next matched literal + while ctx.peek_char(req) != c { 
+ ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + + req.start = ctx.string_position; + self.reset(req.start); + // self.start = ctx.string_position; + self.string_position += skip; + + // literal only + if flags.contains(SreInfo::LITERAL) { + self.has_matched = true; + return; + } + + let mut next_ctx = ctx; + next_ctx.skip_char(req, skip); + next_ctx.skip_code(2 * skip); + + self.context_stack.push(next_ctx); + self._match(req); + + if self.has_matched { + return; + } + + ctx.skip_char(req, 1); + } + return; + } + } + } self.context_stack.push(ctx); - self._match(req); + self._match(&mut req); req.must_advance = false; - while !self.has_matched && req.start < req.end { + ctx.toplevel = false; + while !self.has_matched && req.start < end { req.start += 1; start_offset = req.string.offset(start_offset, 1); self.reset(req.start); + ctx.string_position = req.start; + ctx.string_offset = start_offset; - let ctx = MatchContext { - string_position: req.start, - string_offset: start_offset, - code_position: 0, - has_matched: None, - toplevel: false, - handler: None, - repeat_ctx_id: usize::MAX, - count: -1, - }; self.context_stack.push(ctx); - self._match(req); + self._match(&mut req); } } } @@ -372,63 +436,6 @@ fn dispatch( } } -/* optimization info block */ -/* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ -// fn search_op_info<'a, S: StrDrive>(state: &mut State<'a, S>, ctx: &mut MatchContext<'a, S>) { -// let min = ctx.peek_code(state, 3) as usize; - -// if ctx.remaining_chars(state) < min { -// return ctx.failure(); -// } - -// if min > 1 { -// /* adjust end point (but make sure we leave at least one -// character in there, so literal search will work) */ -// // no overflow can happen as remaining chars >= min -// state.end -= min - 1; - -// // adjust ctx position -// if state.end < ctx.string_position { -// ctx.string_position = state.end; -// ctx.string_offset = state.string.offset(0, ctx.string_position); -// } -// } - -// let flags = 
SreInfo::from_bits_truncate(ctx.peek_code(state, 2)); - -// if flags.contains(SreInfo::PREFIX) { -// /* pattern starts with a known prefix */ -// /* */ -// let len = ctx.peek_code(state, 5) as usize; -// let skip = ctx.peek_code(state, 6) as usize; -// let prefix = &ctx.pattern(state)[7..]; -// let overlap = &prefix[len - 1..]; - -// ctx.skip_code_from(state, 1); - -// if len == 1 { -// // pattern starts with a literal character -// let c = prefix[0]; -// let end = state.end; - -// while (!ctx.at_end(state)) { -// // find the next matched literal -// while (ctx.peek_char(state) != c) { -// ctx.skip_char(state, 1); -// if (ctx.at_end(state)) { -// return ctx.failure(); -// } -// } - -// // literal only -// if flags.contains(SreInfo::LITERAL) { -// return ctx.success(); -// } -// } -// } -// } -// } - /* assert subpattern */ /* */ fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { diff --git a/tests/tests.rs b/tests/tests.rs index cb11db3483..31f032f0a4 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -21,8 +21,8 @@ fn test_2427() { // START GENERATED by generate_tests.py #[rustfmt::skip] let lookbehind = Pattern { code: &[15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1] }; // END GENERATED - let (mut req, mut state) = lookbehind.state("x"); - state.pymatch(&mut req); + let (req, mut state) = lookbehind.state("x"); + state.pymatch(req); assert!(state.has_matched); } @@ -32,8 +32,8 @@ fn test_assert() { // START GENERATED by generate_tests.py #[rustfmt::skip] let positive_lookbehind = Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1] }; // END GENERATED - let (mut req, mut state) = positive_lookbehind.state("abcdef"); - state.search(&mut req); + let (req, mut state) = positive_lookbehind.state("abcdef"); + state.search(req); assert!(state.has_matched); } @@ -43,8 +43,8 @@ fn test_string_boundaries() { // START GENERATED by generate_tests.py #[rustfmt::skip] let big_b = Pattern { code: 
&[15, 4, 0, 0, 0, 6, 11, 1] }; // END GENERATED - let (mut req, mut state) = big_b.state(""); - state.search(&mut req); + let (req, mut state) = big_b.state(""); + state.search(req); assert!(!state.has_matched); } @@ -56,8 +56,8 @@ fn test_zerowidth() { // END GENERATED let (mut req, mut state) = p.state("a:"); req.must_advance = true; - state.search(&mut req); - assert!(state.string_position == 1); + state.search(req); + assert_eq!(state.string_position, 1); } #[test] @@ -67,9 +67,9 @@ fn test_repeat_context_panic() { // START GENERATED by generate_tests.py #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1] }; // END GENERATED - let (mut req, mut state) = p.state("axxzaz"); - state.pymatch(&mut req); - assert!(*state.marks == vec![Optioned::some(1), Optioned::some(3)]); + let (req, mut state) = p.state("axxzaz"); + state.pymatch(req); + assert_eq!(*state.marks, vec![Optioned::some(1), Optioned::some(3)]); } #[test] @@ -78,7 +78,30 @@ fn test_double_max_until() { // START GENERATED by generate_tests.py #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1] }; // END GENERATED - let (mut req, mut state) = p.state("1111"); - state.pymatch(&mut req); - assert!(state.string_position == 4); + let (req, mut state) = p.state("1111"); + state.pymatch(req); + assert_eq!(state.string_position, 4); +} + +#[test] +fn test_info_single() { + // pattern p = re.compile(r'aa*') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 1, 4294967295, 1, 1, 97, 0, 17, 97, 25, 6, 0, 4294967295, 17, 97, 1, 1] }; + // END GENERATED + let (req, mut state) = p.state("baaaa"); + state.search(req); + assert_eq!(state.start, 1); + assert_eq!(state.string_position, 5); +} + +#[test] +fn test_info_single2() { + // pattern p 
= re.compile(r'Python|Perl') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; + // END GENERATED + let (req, mut state) = p.state("Perl"); + state.search(req); + assert!(state.has_matched); } From 236631141fa3d2681e9a53eb3550c96219cb0cf7 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 19:55:48 +0200 Subject: [PATCH 070/705] impl opinfo literal --- src/engine.rs | 81 +++++++++++++++++++++++++++++++++++++++++++++----- tests/tests.rs | 22 ++++++++++++++ 2 files changed, 96 insertions(+), 7 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 0d645e046d..53181c5043 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -256,13 +256,17 @@ impl State { /* */ let len = ctx.peek_code(req, 5) as usize; let skip = ctx.peek_code(req, 6) as usize; - let prefix = &ctx.pattern(req)[7..]; - let overlap = &prefix[len - 1..]; + let prefix = &ctx.pattern(req)[7..7 + len]; + let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2]; if len == 1 { // pattern starts with a literal character - ctx.skip_code_from(req, 1); let c = prefix[0]; + + // code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + req.must_advance = false; while !ctx.at_end(req) { @@ -275,9 +279,8 @@ impl State { } req.start = ctx.string_position; - self.reset(req.start); - // self.start = ctx.string_position; - self.string_position += skip; + self.start = ctx.string_position; + self.string_position = ctx.string_position + skip; // literal only if flags.contains(SreInfo::LITERAL) { @@ -287,7 +290,6 @@ impl State { let mut next_ctx = ctx; next_ctx.skip_char(req, skip); - next_ctx.skip_code(2 * skip); self.context_stack.push(next_ctx); self._match(req); @@ -297,6 +299,71 @@ impl State { } ctx.skip_char(req, 1); + self.marks.clear(); + } + return; + } else if len > 1 { + // 
code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + + req.must_advance = false; + + while !ctx.at_end(req) { + let c = prefix[0]; + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + + let mut i = 1; + loop { + if ctx.peek_char(req) == prefix[i] { + i += 1; + if i != len { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + continue; + } + + req.start = ctx.string_position - (len - 1); + self.start = req.start; + self.string_position = self.start + skip; + + if flags.contains(SreInfo::LITERAL) { + self.has_matched = true; + return; + } + + let mut next_ctx = ctx; + // next_ctx.skip_char(req, 1); + next_ctx.string_position = self.string_position; + next_ctx.string_offset = req.string.offset(0, self.string_position); + self.context_stack.push(next_ctx); + self._match(req); + if self.has_matched { + return; + } + + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + self.marks.clear(); + } + i = overlap[i] as usize; + if i == 0 { + break; + } + } } return; } diff --git a/tests/tests.rs b/tests/tests.rs index 31f032f0a4..21bc89d40c 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -105,3 +105,25 @@ fn test_info_single2() { state.search(req); assert!(state.has_matched); } + +#[test] +fn test_info_literal() { + // pattern p = re.compile(r'ababc+') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 14, 1, 5, 4294967295, 4, 4, 97, 98, 97, 98, 0, 0, 1, 2, 17, 97, 17, 98, 17, 97, 17, 98, 25, 6, 1, 4294967295, 17, 99, 1, 1] }; + // END GENERATED + let (req, mut state) = p.state("!ababc"); + state.search(req); + assert!(state.has_matched); +} + +#[test] +fn test_info_literal2() { + // pattern p = re.compile(r'(python)\1') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 112, 121, 116, 104, 111, 110, 0, 
0, 0, 0, 0, 0, 18, 0, 17, 112, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; + // END GENERATED + let (req, mut state) = p.state("pythonpython"); + state.search(req); + assert!(state.has_matched); +} \ No newline at end of file From 646c8ac6578977b847e8f871e1f83fb422b3e39a Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 20:09:51 +0200 Subject: [PATCH 071/705] impl opinfo charset --- src/engine.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/engine.rs b/src/engine.rs index 53181c5043..fe7ee438e9 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -367,6 +367,31 @@ impl State { } return; } + } else if flags.contains(SreInfo::CHARSET) { + let set = &ctx.pattern(req)[5..]; + ctx.skip_code_from(req, 1); + req.must_advance = false; + loop { + while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { + ctx.skip_char(req, 1); + } + if ctx.at_end(req) { + return; + } + req.start = ctx.string_position; + self.start = ctx.string_position; + self.string_position = ctx.string_position; + + self.context_stack.push(ctx); + self._match(req); + + if self.has_matched { + return; + } + + ctx.skip_char(req, 1); + self.marks.clear(); + } } } From 7e7b9734810947155876cc52028d873ba953b5f4 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 20:36:32 +0200 Subject: [PATCH 072/705] clearup --- src/engine.rs | 301 +++++++++++++++++++++++++++----------------------- 1 file changed, 163 insertions(+), 138 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index fe7ee438e9..7a644f0671 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -252,147 +252,16 @@ impl State { let flags = SreInfo::from_bits_truncate(ctx.peek_code(req, 2)); if flags.contains(SreInfo::PREFIX) { - /* pattern starts with a known prefix */ - /* */ - let len = ctx.peek_code(req, 5) as usize; - let skip = ctx.peek_code(req, 6) as usize; - let prefix = &ctx.pattern(req)[7..7 + len]; - let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 
2]; - - if len == 1 { - // pattern starts with a literal character - let c = prefix[0]; - - // code_position ready for tail match - ctx.skip_code_from(req, 1); - ctx.skip_code(2 * skip); - - req.must_advance = false; - - while !ctx.at_end(req) { - // find the next matched literal - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - } - - req.start = ctx.string_position; - self.start = ctx.string_position; - self.string_position = ctx.string_position + skip; - - // literal only - if flags.contains(SreInfo::LITERAL) { - self.has_matched = true; - return; - } - - let mut next_ctx = ctx; - next_ctx.skip_char(req, skip); - - self.context_stack.push(next_ctx); - self._match(req); - - if self.has_matched { - return; - } - - ctx.skip_char(req, 1); - self.marks.clear(); - } - return; - } else if len > 1 { - // code_position ready for tail match - ctx.skip_code_from(req, 1); - ctx.skip_code(2 * skip); - - req.must_advance = false; - - while !ctx.at_end(req) { - let c = prefix[0]; - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - } - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - - let mut i = 1; - loop { - if ctx.peek_char(req) == prefix[i] { - i += 1; - if i != len { - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - continue; - } - - req.start = ctx.string_position - (len - 1); - self.start = req.start; - self.string_position = self.start + skip; - - if flags.contains(SreInfo::LITERAL) { - self.has_matched = true; - return; - } - - let mut next_ctx = ctx; - // next_ctx.skip_char(req, 1); - next_ctx.string_position = self.string_position; - next_ctx.string_offset = req.string.offset(0, self.string_position); - self.context_stack.push(next_ctx); - self._match(req); - if self.has_matched { - return; - } - - ctx.skip_char(req, 1); - if ctx.at_end(req) { - return; - } - self.marks.clear(); - } - i = overlap[i] as usize; - if i == 0 { - break; - } - } - } - 
return; + if flags.contains(SreInfo::LITERAL) { + search_info_literal::(req, self, ctx); + } else { + search_info_literal::(req, self, ctx); } + return; } else if flags.contains(SreInfo::CHARSET) { - let set = &ctx.pattern(req)[5..]; - ctx.skip_code_from(req, 1); - req.must_advance = false; - loop { - while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { - ctx.skip_char(req, 1); - } - if ctx.at_end(req) { - return; - } - req.start = ctx.string_position; - self.start = ctx.string_position; - self.string_position = ctx.string_position; - - self.context_stack.push(ctx); - self._match(req); - - if self.has_matched { - return; - } - - ctx.skip_char(req, 1); - self.marks.clear(); - } + return search_info_charset(req, self, ctx); } + // fallback to general search } self.context_stack.push(ctx); @@ -528,6 +397,162 @@ fn dispatch( } } +fn search_info_literal( + req: &mut Request, + state: &mut State, + mut ctx: MatchContext, +) { + /* pattern starts with a known prefix */ + /* */ + let len = ctx.peek_code(req, 5) as usize; + let skip = ctx.peek_code(req, 6) as usize; + let prefix = &ctx.pattern(req)[7..7 + len]; + let overlap = &ctx.pattern(req)[7 + len - 1..7 + len * 2]; + + // code_position ready for tail match + ctx.skip_code_from(req, 1); + ctx.skip_code(2 * skip); + + req.must_advance = false; + + if len == 1 { + // pattern starts with a literal character + let c = prefix[0]; + + while !ctx.at_end(req) { + // find the next matched literal + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + + req.start = ctx.string_position; + state.start = ctx.string_position; + state.string_position = ctx.string_position + skip; + + // literal only + if LITERAL { + state.has_matched = true; + return; + } + + let mut next_ctx = ctx; + next_ctx.skip_char(req, skip); + + state.context_stack.push(next_ctx); + state._match(req); + + if state.has_matched { + return; + } + + ctx.skip_char(req, 1); + state.marks.clear(); + } + } else { 
+ while !ctx.at_end(req) { + let c = prefix[0]; + while ctx.peek_char(req) != c { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + } + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + + let mut i = 1; + loop { + if ctx.peek_char(req) == prefix[i] { + i += 1; + if i != len { + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + continue; + } + + req.start = ctx.string_position - (len - 1); + state.start = req.start; + state.string_position = state.start + skip; + + // literal only + if LITERAL { + state.has_matched = true; + return; + } + + let mut next_ctx = ctx; + if skip != 0 { + next_ctx.skip_char(req, 1); + } else { + next_ctx.string_position = state.string_position; + next_ctx.string_offset = req.string.offset(0, state.string_position); + } + + state.context_stack.push(next_ctx); + state._match(req); + + if state.has_matched { + return; + } + + ctx.skip_char(req, 1); + if ctx.at_end(req) { + return; + } + state.marks.clear(); + } + + i = overlap[i] as usize; + if i == 0 { + break; + } + } + } + } +} + +fn search_info_charset( + req: &mut Request, + state: &mut State, + mut ctx: MatchContext, +) { + let set = &ctx.pattern(req)[5..]; + + ctx.skip_code_from(req, 1); + + req.must_advance = false; + + loop { + while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { + ctx.skip_char(req, 1); + } + if ctx.at_end(req) { + return; + } + + req.start = ctx.string_position; + state.start = ctx.string_position; + state.string_position = ctx.string_position; + + state.context_stack.push(ctx); + state._match(req); + + if state.has_matched { + return; + } + + ctx.skip_char(req, 1); + state.marks.clear(); + } +} + /* assert subpattern */ /* */ fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { From 26a78dbaa4e4e78f1e5d8dcea9c32054ae07fdd3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 20:36:46 +0200 Subject: [PATCH 073/705] update to 0.4.1 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 373166d6db..53524f1446 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.4.0" +version = "0.4.1" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" From 4e6b27144a407bb4daf341048310cad26570a7b3 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 15 Aug 2022 21:30:40 +0200 Subject: [PATCH 074/705] introduce SearchIter --- src/engine.rs | 26 ++++++++++++++++++++++++++ tests/tests.rs | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/engine.rs b/src/engine.rs index 7a644f0671..7334516c2f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -282,6 +282,32 @@ impl State { } } +pub struct SearchIter<'a, S: StrDrive> { + pub req: Request<'a, S>, + pub state: State, +} + +impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { + type Item = (); + + fn next(&mut self) -> Option { + if self.req.start > self.req.end { + return None; + } + + self.state.reset(self.req.start); + self.state.search(self.req); + if !self.state.has_matched { + return None; + } + + self.req.must_advance = self.state.string_position == self.state.start; + self.req.start = self.state.string_position; + + Some(()) + } +} + fn dispatch( req: &Request, state: &mut State, diff --git a/tests/tests.rs b/tests/tests.rs index 21bc89d40c..5212226f4e 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -126,4 +126,4 @@ fn test_info_literal2() { let (req, mut state) = p.state("pythonpython"); state.search(req); assert!(state.has_matched); -} \ No newline at end of file +} From 285ba765a70ab243ee0f881604e60b999c5771af Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Sat, 7 Oct 2023 14:40:45 +0900 Subject: [PATCH 075/705] 0.4.2 with dependency update --- Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 
53524f1446..30e403b54c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,15 +1,15 @@ [package] name = "sre-engine" -version = "0.4.1" +version = "0.4.2" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" license = "MIT" -edition = "2018" +edition = "2021" keywords = ["regex"] include = ["LICENSE", "src/**/*.rs"] [dependencies] -num_enum = "0.5" -bitflags = "1.2" +num_enum = "0.5.9" +bitflags = "2" optional = "0.5" From a777d22a537bdda61a7df5f4c6af6369de1cadc6 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 8 Dec 2023 22:36:46 +0200 Subject: [PATCH 076/705] fix _count --- src/constants.rs | 1 + src/engine.rs | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/constants.rs b/src/constants.rs index f3962b339a..0d5bb41939 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -101,6 +101,7 @@ pub enum SreCatCode { UNI_NOT_LINEBREAK = 17, } bitflags! 
{ + #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct SreFlag: u16 { const TEMPLATE = 1; const IGNORECASE = 2; diff --git a/src/engine.rs b/src/engine.rs index 7334516c2f..f49560c2c6 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -672,7 +672,9 @@ fn op_min_repeat_one( ctx.count = if min_count == 0 { 0 } else { - let count = _count(req, state, ctx, min_count); + let mut next_ctx = *ctx; + next_ctx.skip_code(4); + let count = _count(req, state, next_ctx, min_count); if count < min_count { return ctx.failure(); } @@ -713,7 +715,9 @@ fn op_min_repeat_one( state.string_position = ctx.string_position; - if _count(req, state, ctx, 1) == 0 { + let mut next_ctx = *ctx; + next_ctx.skip_code(4); + if _count(req, state, next_ctx, 1) == 0 { state.marks.pop_discard(); return ctx.failure(); } @@ -741,7 +745,9 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut state.string_position = ctx.string_position; - let count = _count(req, state, ctx, max_count); + let mut next_ctx = *ctx; + next_ctx.skip_code(4); + let count = _count(req, state, next_ctx, max_count); ctx.skip_char(req, count); if count < min_count { return ctx.failure(); @@ -1428,17 +1434,16 @@ fn charset(set: &[u32], ch: u32) -> bool { fn _count( req: &Request, state: &mut State, - ctx: &MatchContext, + mut ctx: MatchContext, max_count: usize, ) -> usize { - let mut ctx = *ctx; let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); let end = ctx.string_position + max_count; let opcode = SreOpcode::try_from(ctx.peek_code(req, 0)).unwrap(); match opcode { SreOpcode::ANY => { - while !ctx.string_position < end && !ctx.at_linebreak(req) { + while ctx.string_position < end && !ctx.at_linebreak(req) { ctx.skip_char(req, 1); } } @@ -1446,8 +1451,7 @@ fn _count( ctx.skip_char(req, max_count); } SreOpcode::IN => { - while !ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req)) - { + while ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req)) { 
ctx.skip_char(req, 1); } } @@ -1483,7 +1487,6 @@ fn _count( /* General case */ let mut count = 0; - ctx.skip_code(4); let reset_position = ctx.code_position; while count < max_count { @@ -1511,7 +1514,7 @@ fn general_count_literal bool>( mut f: F, ) { let ch = ctx.peek_code(req, 1); - while !ctx.string_position < end && f(ch, ctx.peek_char(req)) { + while ctx.string_position < end && f(ch, ctx.peek_char(req)) { ctx.skip_char(req, 1); } } From d73cc5f58c94e2589efadf1b21f4b5a811869682 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 8 Dec 2023 22:42:38 +0200 Subject: [PATCH 077/705] update version and dependency --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 30e403b54c..de1d68cf6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.4.2" +version = "0.4.3" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" @@ -10,6 +10,6 @@ keywords = ["regex"] include = ["LICENSE", "src/**/*.rs"] [dependencies] -num_enum = "0.5.9" +num_enum = "0.7" bitflags = "2" optional = "0.5" From 9070e12e0df3ece6949dc3d9d650fa2d37d8eb54 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 10 Dec 2023 20:50:43 +0200 Subject: [PATCH 078/705] refactor _match with nest loop --- benches/benches.rs | 28 +- src/engine.rs | 894 +++++++++++++++++++++++++++++++++------------ tests/tests.rs | 30 +- 3 files changed, 677 insertions(+), 275 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index 604cf91f42..fe470d023c 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -10,10 +10,7 @@ struct Pattern { } impl Pattern { - fn state<'a, S: engine::StrDrive>( - &self, - string: S, - ) -> (engine::Request<'a, S>, engine::State) { + fn state<'a, S: engine::StrDrive>(&self, string: S) -> (engine::Request<'a, S>, engine::State) { 
self.state_range(string, 0..usize::MAX) } @@ -21,7 +18,7 @@ impl Pattern { &self, string: S, range: std::ops::Range, - ) -> (engine::Request<'a, S>, engine::State) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, range.start, range.end, self.code, false); let state = engine::State::default(); (req, state) @@ -93,29 +90,22 @@ fn benchmarks(b: &mut Bencher) { b.iter(move || { for (p, s) in &tests { let (req, mut state) = p.state(s.clone()); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); let (req, mut state) = p.state(s.clone()); - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); let (mut req, mut state) = p.state(s.clone()); req.match_all = true; - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); let s2 = format!("{}{}{}", " ".repeat(10000), s, " ".repeat(10000)); let (req, mut state) = p.state_range(s2.as_str(), 0..usize::MAX); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); let (req, mut state) = p.state_range(s2.as_str(), 10000..usize::MAX); - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); let (req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); let (mut req, mut state) = p.state_range(s2.as_str(), 10000..10000 + s.len()); req.match_all = true; - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); } }) } diff --git a/src/engine.rs b/src/engine.rs index f49560c2c6..7474f29013 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -6,14 +6,13 @@ use super::constants::{SreAtCode, SreCatCode, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; use std::convert::TryFrom; -use std::ops::Deref; const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } #[derive(Debug, Clone, Copy)] 
-pub struct Request<'a, S: StrDrive> { +pub struct Request<'a, S> { pub string: S, pub start: usize, pub end: usize, @@ -61,14 +60,6 @@ impl Default for Marks { } } -impl Deref for Marks { - type Target = Vec>; - - fn deref(&self) -> &Self::Target { - &self.marks - } -} - impl Marks { pub fn get(&self, group_index: usize) -> (Optioned, Optioned) { let marks_index = 2 * group_index; @@ -83,6 +74,10 @@ impl Marks { self.last_index } + pub fn raw(&self) -> &[Optioned] { + self.marks.as_slice() + } + fn set(&mut self, mark_nr: usize, position: usize) { if mark_nr & 1 != 0 { self.last_index = mark_nr as isize / 2 + 1; @@ -120,70 +115,23 @@ impl Marks { } } -#[derive(Debug)] -pub struct State { - pub marks: Marks, - context_stack: Vec>, - repeat_stack: Vec, +#[derive(Debug, Default)] +pub struct State { pub start: usize, + pub marks: Marks, pub string_position: usize, - next_context: Option>, - popped_has_matched: bool, - pub has_matched: bool, -} - -impl Default for State { - fn default() -> Self { - Self { - marks: Marks::default(), - context_stack: Vec::new(), - repeat_stack: Vec::new(), - start: 0, - string_position: 0, - next_context: None, - popped_has_matched: false, - has_matched: false, - } - } + repeat_stack: Vec, } -impl State { +impl State { pub fn reset(&mut self, start: usize) { self.marks.clear(); - self.context_stack.clear(); self.repeat_stack.clear(); self.start = start; self.string_position = start; - self.next_context = None; - self.popped_has_matched = false; - self.has_matched = false; - } - - fn _match(&mut self, req: &mut Request) { - while let Some(mut ctx) = self.context_stack.pop() { - if let Some(handler) = ctx.handler.take() { - handler(req, self, &mut ctx); - } else if ctx.remaining_codes(req) > 0 { - let code = ctx.peek_code(req, 0); - let code = SreOpcode::try_from(code).unwrap(); - dispatch(req, self, &mut ctx, code); - } else { - ctx.failure(); - } - - if let Some(has_matched) = ctx.has_matched { - self.popped_has_matched = has_matched; - 
} else { - self.context_stack.push(ctx); - if let Some(next_ctx) = self.next_context.take() { - self.context_stack.push(next_ctx); - } - } - } - self.has_matched = self.popped_has_matched; } - pub fn pymatch(&mut self, mut req: Request) { + pub fn pymatch(&mut self, req: &Request) -> bool { self.start = req.start; self.string_position = req.start; @@ -191,24 +139,20 @@ impl State { string_position: req.start, string_offset: req.string.offset(0, req.start), code_position: 0, - has_matched: None, toplevel: true, - handler: None, + jump: Jump::OpCode, repeat_ctx_id: usize::MAX, count: -1, }; - self.context_stack.push(ctx); - - self._match(&mut req); + _match(&req, self, ctx) } - pub fn search(&mut self, mut req: Request) { + pub fn search(&mut self, mut req: Request) -> bool { self.start = req.start; self.string_position = req.start; - // TODO: optimize by op info and skip prefix if req.start > req.end { - return; + return false; } let mut end = req.end; @@ -219,9 +163,8 @@ impl State { string_position: req.start, string_offset: start_offset, code_position: 0, - has_matched: None, toplevel: true, - handler: None, + jump: Jump::OpCode, repeat_ctx_id: usize::MAX, count: -1, }; @@ -229,11 +172,10 @@ impl State { if ctx.peek_code(&req, 0) == SreOpcode::INFO as u32 { /* optimization info block */ /* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ - let req = &mut req; - let min = ctx.peek_code(req, 3) as usize; + let min = ctx.peek_code(&req, 3) as usize; - if ctx.remaining_chars(req) < min { - return; + if ctx.remaining_chars(&req) < min { + return false; } if min > 1 { @@ -249,42 +191,44 @@ impl State { } } - let flags = SreInfo::from_bits_truncate(ctx.peek_code(req, 2)); + let flags = SreInfo::from_bits_truncate(ctx.peek_code(&req, 2)); if flags.contains(SreInfo::PREFIX) { if flags.contains(SreInfo::LITERAL) { - search_info_literal::(req, self, ctx); + return search_info_literal::(&mut req, self, ctx); } else { - search_info_literal::(req, self, ctx); + return 
search_info_literal::(&mut req, self, ctx); } - return; } else if flags.contains(SreInfo::CHARSET) { - return search_info_charset(req, self, ctx); + return search_info_charset(&mut req, self, ctx); } // fallback to general search } - self.context_stack.push(ctx); - self._match(&mut req); + if _match(&req, self, ctx) { + return true; + } req.must_advance = false; ctx.toplevel = false; - while !self.has_matched && req.start < end { + while req.start < end { req.start += 1; start_offset = req.string.offset(start_offset, 1); self.reset(req.start); ctx.string_position = req.start; ctx.string_offset = start_offset; - self.context_stack.push(ctx); - self._match(&mut req); + if _match(&req, self, ctx) { + return true; + } } + false } } pub struct SearchIter<'a, S: StrDrive> { pub req: Request<'a, S>, - pub state: State, + pub state: State, } impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { @@ -296,8 +240,7 @@ impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { } self.state.reset(self.req.start); - self.state.search(self.req); - if !self.state.has_matched { + if !self.state.search(self.req) { return None; } @@ -308,10 +251,537 @@ impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { } } +#[derive(Debug, Clone, Copy)] +enum Jump { + OpCode, + Assert1, + AssertNot1, + Branch1, + Branch2, + Repeat1, + UntilBacktrace, + MaxUntil2, + MaxUntil3, + MinUntil1, + RepeatOne1, + RepeatOne2, + MinRepeatOne1, + MinRepeatOne2, +} + +fn _match(req: &Request, state: &mut State, ctx: MatchContext) -> bool { + let mut context_stack = vec![ctx]; + let mut popped_result = false; + + 'coro: loop { + let Some(mut ctx) = context_stack.pop() else { + break; + }; + + popped_result = 'result: loop { + let yield_ = 'context: loop { + match ctx.jump { + Jump::OpCode => {} + Jump::Assert1 => { + if popped_result { + ctx.skip_code_from(req, 1); + } else { + break 'result false; + } + } + Jump::AssertNot1 => { + if popped_result { + break 'result false; + } + ctx.skip_code_from(req, 1); + } + 
Jump::Branch1 => { + let branch_offset = ctx.count as usize; + let next_length = ctx.peek_code(req, branch_offset) as isize; + if next_length == 0 { + state.marks.pop_discard(); + break 'result false; + } + state.string_position = ctx.string_position; + let next_ctx = ctx.next_offset(branch_offset + 1, Jump::Branch2); + ctx.count += next_length; + break 'context next_ctx; + } + Jump::Branch2 => { + if popped_result { + break 'result true; + } + state.marks.pop_keep(); + ctx.jump = Jump::Branch1; + continue 'context; + } + Jump::Repeat1 => { + state.repeat_stack.pop(); + break 'result popped_result; + } + Jump::UntilBacktrace => { + if !popped_result { + state.repeat_stack[ctx.repeat_ctx_id].count -= 1; + state.string_position = ctx.string_position; + } + break 'result popped_result; + } + Jump::MaxUntil2 => { + let save_last_position = ctx.count as usize; + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + repeat_ctx.last_position = save_last_position; + + if popped_result { + state.marks.pop_discard(); + break 'result true; + } + + state.marks.pop(); + repeat_ctx.count -= 1; + state.string_position = ctx.string_position; + + /* cannot match more repeated items here. 
make sure the + tail matches */ + let mut next_ctx = ctx.next_offset(1, Jump::MaxUntil3); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + break 'context next_ctx; + } + Jump::MaxUntil3 => { + if !popped_result { + state.string_position = ctx.string_position; + } + break 'result popped_result; + } + Jump::MinUntil1 => { + if popped_result { + break 'result true; + } + ctx.repeat_ctx_id = ctx.count as usize; + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + state.string_position = ctx.string_position; + state.marks.pop(); + + // match more until tail matches + if repeat_ctx.count as usize >= repeat_ctx.max_count + && repeat_ctx.max_count != MAXREPEAT + || state.string_position == repeat_ctx.last_position + { + repeat_ctx.count -= 1; + break 'result false; + } + + /* zero-width match protection */ + repeat_ctx.last_position = state.string_position; + + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); + } + Jump::RepeatOne1 => { + let min_count = ctx.peek_code(req, 2) as isize; + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::LITERAL as u32 { + // Special case: Tail starts with a literal. Skip positions where + // the rest of the pattern cannot possibly match. 
+ let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); + while ctx.at_end(req) || ctx.peek_char(req) != c { + if ctx.count <= min_count { + state.marks.pop_discard(); + break 'result false; + } + ctx.back_skip_char(req, 1); + ctx.count -= 1; + } + } + + state.string_position = ctx.string_position; + // General case: backtracking + break 'context ctx.next_peek_from(1, req, Jump::RepeatOne2); + } + Jump::RepeatOne2 => { + if popped_result { + break 'result true; + } + + let min_count = ctx.peek_code(req, 2) as isize; + if ctx.count <= min_count { + state.marks.pop_discard(); + break 'result false; + } + + ctx.back_skip_char(req, 1); + ctx.count -= 1; + + state.marks.pop_keep(); + ctx.jump = Jump::RepeatOne1; + continue 'context; + } + Jump::MinRepeatOne1 => { + let max_count = ctx.peek_code(req, 3) as usize; + if max_count == MAXREPEAT || ctx.count as usize <= max_count { + state.string_position = ctx.string_position; + break 'context ctx.next_peek_from(1, req, Jump::MinRepeatOne2); + } else { + state.marks.pop_discard(); + break 'result false; + } + } + Jump::MinRepeatOne2 => { + if popped_result { + break 'result true; + } + + state.string_position = ctx.string_position; + + let mut count_ctx = ctx; + count_ctx.skip_code(4); + if _count(req, state, count_ctx, 1) == 0 { + state.marks.pop_discard(); + break 'result false; + } + + ctx.skip_char(req, 1); + ctx.count += 1; + state.marks.pop_keep(); + ctx.jump = Jump::MinRepeatOne1; + continue 'context; + } + } + ctx.jump = Jump::OpCode; + + loop { + macro_rules! general_op_literal { + ($f:expr) => {{ + if ctx.at_end(req) || !$f(ctx.peek_code(req, 1), ctx.peek_char(req)) { + break 'result false; + } + ctx.skip_code(2); + ctx.skip_char(req, 1); + }}; + } + + macro_rules! general_op_in { + ($f:expr) => {{ + if ctx.at_end(req) || !$f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { + break 'result false; + } + ctx.skip_code_from(req, 1); + ctx.skip_char(req, 1); + }}; + } + + macro_rules! 
general_op_groupref { + ($f:expr) => {{ + let (group_start, group_end) = + state.marks.get(ctx.peek_code(req, 1) as usize); + let (group_start, group_end) = if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + (group_start.unpack(), group_end.unpack()) + } else { + break 'result false; + }; + + let mut gctx = MatchContext { + string_position: group_start, + string_offset: req.string.offset(0, group_start), + ..ctx + }; + + for _ in group_start..group_end { + if ctx.at_end(req) + || $f(ctx.peek_char(req)) != $f(gctx.peek_char(req)) + { + break 'result false; + } + ctx.skip_char(req, 1); + gctx.skip_char(req, 1); + } + + ctx.skip_code(2); + }}; + } + + if ctx.remaining_codes(req) == 0 { + break 'result false; + } + let opcode = ctx.peek_code(req, 0); + let opcode = SreOpcode::try_from(opcode).unwrap(); + + match opcode { + SreOpcode::FAILURE => break 'result false, + SreOpcode::SUCCESS => { + if ctx.can_success(req) { + state.string_position = ctx.string_position; + break 'result true; + } + break 'result false; + } + SreOpcode::ANY => { + if ctx.at_end(req) || ctx.at_linebreak(req) { + break 'result false; + } + ctx.skip_code(1); + ctx.skip_char(req, 1); + } + SreOpcode::ANY_ALL => { + if ctx.at_end(req) { + break 'result false; + } + ctx.skip_code(1); + ctx.skip_char(req, 1); + } + SreOpcode::ASSERT => { + let back = ctx.peek_code(req, 2) as usize; + if ctx.string_position < back { + break 'result false; + } + + let mut next_ctx = ctx.next_offset(3, Jump::Assert1); + next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = next_ctx.string_position; + break 'context next_ctx; + } + SreOpcode::ASSERT_NOT => { + let back = ctx.peek_code(req, 2) as usize; + if ctx.string_position < back { + ctx.skip_code_from(req, 1); + continue; + } + + let mut next_ctx = ctx.next_offset(3, Jump::AssertNot1); + next_ctx.toplevel = false; + next_ctx.back_skip_char(req, back); + state.string_position = 
next_ctx.string_position; + break 'context next_ctx; + } + SreOpcode::AT => { + let atcode = SreAtCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if at(req, &ctx, atcode) { + ctx.skip_code(2); + } else { + break 'result false; + } + } + SreOpcode::BRANCH => { + state.marks.push(); + ctx.count = 1; + ctx.jump = Jump::Branch1; + continue 'context; + } + SreOpcode::CATEGORY => { + let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); + if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) { + break 'result false; + } + ctx.skip_code(2); + ctx.skip_char(req, 1); + } + SreOpcode::IN => general_op_in!(charset), + SreOpcode::IN_IGNORE => { + general_op_in!(|set, c| charset(set, lower_ascii(c))) + } + SreOpcode::IN_UNI_IGNORE => { + general_op_in!(|set, c| charset(set, lower_unicode(c))) + } + SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore), + SreOpcode::INFO => { + let min = ctx.peek_code(req, 3) as usize; + if ctx.remaining_chars(req) < min { + break 'result false; + } + ctx.skip_code_from(req, 1); + } + SreOpcode::MARK => { + state + .marks + .set(ctx.peek_code(req, 1) as usize, ctx.string_position); + ctx.skip_code(2); + } + SreOpcode::JUMP => ctx.skip_code_from(req, 1), + SreOpcode::REPEAT => { + let repeat_ctx = RepeatContext { + count: -1, + min_count: ctx.peek_code(req, 2) as usize, + max_count: ctx.peek_code(req, 3) as usize, + code_position: ctx.code_position, + last_position: std::usize::MAX, + prev_id: ctx.repeat_ctx_id, + }; + state.repeat_stack.push(repeat_ctx); + let repeat_ctx_id = state.repeat_stack.len() - 1; + state.string_position = ctx.string_position; + let mut next_ctx = ctx.next_peek_from(1, req, Jump::Repeat1); + next_ctx.repeat_ctx_id = repeat_ctx_id; + break 'context next_ctx; + } + SreOpcode::MAX_UNTIL => { + let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; + state.string_position = ctx.string_position; + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not 
enough matches + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); + } + + if ((repeat_ctx.count as usize) < repeat_ctx.max_count + || repeat_ctx.max_count == MAXREPEAT) + && state.string_position != repeat_ctx.last_position + { + /* we may have enough matches, but if we can + match another item, do so */ + state.marks.push(); + ctx.count = repeat_ctx.last_position as isize; + repeat_ctx.last_position = state.string_position; + + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::MaxUntil2); + } + + /* cannot match more repeated items here. make sure the + tail matches */ + let mut next_ctx = ctx.next_offset(1, Jump::MaxUntil3); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + break 'context next_ctx; + } + SreOpcode::MIN_UNTIL => { + let repeat_ctx = state.repeat_stack.last_mut().unwrap(); + state.string_position = ctx.string_position; + repeat_ctx.count += 1; + + if (repeat_ctx.count as usize) < repeat_ctx.min_count { + // not enough matches + break 'context ctx + .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); + } + + state.marks.push(); + ctx.count = ctx.repeat_ctx_id as isize; + let mut next_ctx = ctx.next_offset(1, Jump::MinUntil1); + next_ctx.repeat_ctx_id = repeat_ctx.prev_id; + break 'context next_ctx; + } + SreOpcode::REPEAT_ONE => { + let min_count = ctx.peek_code(req, 2) as usize; + let max_count = ctx.peek_code(req, 3) as usize; + + if ctx.remaining_chars(req) < min_count { + break 'result false; + } + + state.string_position = ctx.string_position; + + let mut next_ctx = ctx; + next_ctx.skip_code(4); + let count = _count(req, state, next_ctx, max_count); + ctx.skip_char(req, count); + if count < min_count { + break 'result false; + } + + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { + // tail is empty. 
we're finished + state.string_position = ctx.string_position; + break 'result true; + } + + state.marks.push(); + ctx.count = count as isize; + ctx.jump = Jump::RepeatOne1; + continue 'context; + } + SreOpcode::MIN_REPEAT_ONE => { + let min_count = ctx.peek_code(req, 2) as usize; + if ctx.remaining_chars(req) < min_count { + break 'result false; + } + + state.string_position = ctx.string_position; + ctx.count = if min_count == 0 { + 0 + } else { + let mut count_ctx = ctx; + count_ctx.skip_code(4); + let count = _count(req, state, count_ctx, min_count); + if count < min_count { + break 'result false; + } + ctx.skip_char(req, count); + count as isize + }; + + let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); + if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { + // tail is empty. we're finished + state.string_position = ctx.string_position; + break 'result true; + } + + state.marks.push(); + ctx.jump = Jump::MinRepeatOne1; + continue 'context; + } + SreOpcode::LITERAL => general_op_literal!(|code, c| code == c), + SreOpcode::NOT_LITERAL => general_op_literal!(|code, c| code != c), + SreOpcode::LITERAL_IGNORE => { + general_op_literal!(|code, c| code == lower_ascii(c)) + } + SreOpcode::NOT_LITERAL_IGNORE => { + general_op_literal!(|code, c| code != lower_ascii(c)) + } + SreOpcode::LITERAL_UNI_IGNORE => { + general_op_literal!(|code, c| code == lower_unicode(c)) + } + SreOpcode::NOT_LITERAL_UNI_IGNORE => { + general_op_literal!(|code, c| code != lower_unicode(c)) + } + SreOpcode::LITERAL_LOC_IGNORE => general_op_literal!(char_loc_ignore), + SreOpcode::NOT_LITERAL_LOC_IGNORE => { + general_op_literal!(|code, c| !char_loc_ignore(code, c)) + } + SreOpcode::GROUPREF => general_op_groupref!(|x| x), + SreOpcode::GROUPREF_IGNORE => general_op_groupref!(lower_ascii), + SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref!(lower_locate), + SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref!(lower_unicode), + SreOpcode::GROUPREF_EXISTS 
=> { + let (group_start, group_end) = + state.marks.get(ctx.peek_code(req, 1) as usize); + if group_start.is_some() + && group_end.is_some() + && group_start.unpack() <= group_end.unpack() + { + ctx.skip_code(3); + } else { + ctx.skip_code_from(req, 2) + } + } + SreOpcode::CALL => todo!(), + SreOpcode::CHARSET => todo!(), + SreOpcode::BIGCHARSET => todo!(), + SreOpcode::NEGATE => todo!(), + SreOpcode::RANGE => todo!(), + SreOpcode::RANGE_UNI_IGNORE => todo!(), + SreOpcode::SUBPATTERN => todo!(), + } + } + }; + context_stack.push(ctx); + context_stack.push(yield_); + continue 'coro; + }; + } + popped_result +} + +/* fn dispatch( req: &Request, - state: &mut State, - ctx: &mut MatchContext, + state: &mut State, + ctx: &mut MatchContext, opcode: SreOpcode, ) { match opcode { @@ -422,12 +892,13 @@ fn dispatch( _ => unreachable!("unexpected opcode"), } } +*/ fn search_info_literal( req: &mut Request, - state: &mut State, - mut ctx: MatchContext, -) { + state: &mut State, + mut ctx: MatchContext, +) -> bool { /* pattern starts with a known prefix */ /* */ let len = ctx.peek_code(req, 5) as usize; @@ -450,7 +921,7 @@ fn search_info_literal( while ctx.peek_char(req) != c { ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } } @@ -460,18 +931,14 @@ fn search_info_literal( // literal only if LITERAL { - state.has_matched = true; - return; + return true; } let mut next_ctx = ctx; next_ctx.skip_char(req, skip); - state.context_stack.push(next_ctx); - state._match(req); - - if state.has_matched { - return; + if _match(req, state, next_ctx) { + return true; } ctx.skip_char(req, 1); @@ -483,12 +950,12 @@ fn search_info_literal( while ctx.peek_char(req) != c { ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } } ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } let mut i = 1; @@ -498,7 +965,7 @@ fn search_info_literal( if i != len { ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } continue; } @@ 
-509,8 +976,7 @@ fn search_info_literal( // literal only if LITERAL { - state.has_matched = true; - return; + return true; } let mut next_ctx = ctx; @@ -521,16 +987,13 @@ fn search_info_literal( next_ctx.string_offset = req.string.offset(0, state.string_position); } - state.context_stack.push(next_ctx); - state._match(req); - - if state.has_matched { - return; + if _match(req, state, next_ctx) { + return true; } ctx.skip_char(req, 1); if ctx.at_end(req) { - return; + return false; } state.marks.clear(); } @@ -542,13 +1005,14 @@ fn search_info_literal( } } } + false } fn search_info_charset( req: &mut Request, - state: &mut State, - mut ctx: MatchContext, -) { + state: &mut State, + mut ctx: MatchContext, +) -> bool { let set = &ctx.pattern(req)[5..]; ctx.skip_code_from(req, 1); @@ -560,18 +1024,15 @@ fn search_info_charset( ctx.skip_char(req, 1); } if ctx.at_end(req) { - return; + return false; } req.start = ctx.string_position; state.start = ctx.string_position; state.string_position = ctx.string_position; - state.context_stack.push(ctx); - state._match(req); - - if state.has_matched { - return; + if _match(req, state, ctx) { + return true; } ctx.skip_char(req, 1); @@ -579,9 +1040,10 @@ fn search_info_charset( } } +/* /* assert subpattern */ /* */ -fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { return ctx.failure(); @@ -601,7 +1063,7 @@ fn op_assert(req: &Request, state: &mut State, ctx: &mut Matc /* assert not subpattern */ /* */ -fn op_assert_not(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_assert_not(req: &Request, state: &mut State, ctx: &mut MatchContext) { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { @@ -622,17 +1084,13 @@ fn op_assert_not(req: &Request, state: &mut State, ctx: &mut // alternation // <0=skip> code ... 
-fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { state.marks.push(); ctx.count = 1; create_context(req, state, ctx); - fn create_context( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, - ) { + fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { let branch_offset = ctx.count as usize; let next_length = ctx.peek_code(req, branch_offset) as isize; if next_length == 0 { @@ -646,7 +1104,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc ctx.next_offset(branch_offset + 1, state, callback); } - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -656,11 +1114,7 @@ fn op_branch(req: &Request, state: &mut State, ctx: &mut Matc } /* <1=min> <2=max> item tail */ -fn op_min_repeat_one( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, -) { +fn op_min_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { let min_count = ctx.peek_code(req, 2) as usize; if ctx.remaining_chars(req) < min_count { @@ -692,23 +1146,19 @@ fn op_min_repeat_one( state.marks.push(); create_context(req, state, ctx); - fn create_context( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, - ) { + fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { let max_count = ctx.peek_code(req, 3) as usize; if max_count == MAXREPEAT || ctx.count as usize <= max_count { state.string_position = ctx.string_position; - ctx.next_from(1, req, state, callback); + ctx.next_peek_from(1, req, state, callback); } else { state.marks.pop_discard(); ctx.failure(); } } - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return 
ctx.success(); } @@ -735,7 +1185,7 @@ exactly one character wide, and we're not already collecting backtracking points. for other cases, use the MAX_REPEAT operator */ /* <1=min> <2=max> item tail */ -fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { let min_count = ctx.peek_code(req, 2) as usize; let max_count = ctx.peek_code(req, 3) as usize; @@ -764,11 +1214,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut ctx.count = count as isize; create_context(req, state, ctx); - fn create_context( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, - ) { + fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { let min_count = ctx.peek_code(req, 2) as isize; let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::LITERAL as u32 { @@ -788,10 +1234,10 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut state.string_position = ctx.string_position; // General case: backtracking - ctx.next_from(1, req, state, callback); + ctx.next_peek_from(1, req, state, callback); } - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { + fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { return ctx.success(); } @@ -810,6 +1256,7 @@ fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut create_context(req, state, ctx); } } +*/ #[derive(Debug, Clone, Copy)] struct RepeatContext { @@ -821,10 +1268,11 @@ struct RepeatContext { prev_id: usize, } +/* /* create repeat context. 
all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ /* <1=min> <2=max> item tail */ -fn op_repeat(req: &Request, state: &mut State, ctx: &mut MatchContext) { +fn op_repeat(req: &Request, state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = RepeatContext { count: -1, min_count: ctx.peek_code(req, 2) as usize, @@ -840,7 +1288,7 @@ fn op_repeat(req: &Request, state: &mut State, ctx: &mut Matc let repeat_ctx_id = state.repeat_stack.len() - 1; - let next_ctx = ctx.next_from(1, req, state, |_, state, ctx| { + let next_ctx = ctx.next_peek_from(1, req, state, |_, state, ctx| { ctx.has_matched = Some(state.popped_has_matched); state.repeat_stack.pop(); }); @@ -848,7 +1296,7 @@ fn op_repeat(req: &Request, state: &mut State, ctx: &mut Matc } /* minimizing repeat */ -fn op_min_until(state: &mut State, ctx: &mut MatchContext) { +fn op_min_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = state.repeat_stack.last_mut().unwrap(); state.string_position = ctx.string_position; @@ -915,7 +1363,7 @@ fn op_min_until(state: &mut State, ctx: &mut MatchContext) { } /* maximizing repeat */ -fn op_max_until(state: &mut State, ctx: &mut MatchContext) { +fn op_max_until(state: &mut State, ctx: &mut MatchContext) { let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; state.string_position = ctx.string_position; @@ -976,7 +1424,7 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { let next_ctx = ctx.next_offset(1, state, tail_callback); next_ctx.repeat_ctx_id = repeat_ctx_prev_id; - fn tail_callback(_: &Request, state: &mut State, ctx: &mut MatchContext) { + fn tail_callback(_: &Request, state: &mut State, ctx: &mut MatchContext) { if state.popped_has_matched { ctx.success(); } else { @@ -985,6 +1433,7 @@ fn op_max_until(state: &mut State, ctx: &mut MatchContext) { } } } +*/ pub trait StrDrive: Copy { fn offset(&self, offset: usize, skip: usize) -> usize; @@ -1061,67 +1510,49 @@ impl<'a> StrDrive for &'a [u8] { } } -type OpFunc 
= for<'a> fn(&Request<'a, S>, &mut State, &mut MatchContext); - #[derive(Clone, Copy)] -struct MatchContext { +struct MatchContext { string_position: usize, string_offset: usize, code_position: usize, - has_matched: Option, toplevel: bool, - handler: Option>, + jump: Jump, repeat_ctx_id: usize, count: isize, } -impl std::fmt::Debug for MatchContext { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("MatchContext") - .field("string_position", &self.string_position) - .field("string_offset", &self.string_offset) - .field("code_position", &self.code_position) - .field("has_matched", &self.has_matched) - .field("toplevel", &self.toplevel) - .field("handler", &self.handler.map(|x| x as usize)) - .field("repeat_ctx_id", &self.repeat_ctx_id) - .field("count", &self.count) - .finish() - } -} - -impl MatchContext { - fn pattern<'a>(&self, req: &Request<'a, S>) -> &'a [u32] { +impl MatchContext { + fn pattern<'a, S>(&self, req: &Request<'a, S>) -> &'a [u32] { &req.pattern_codes[self.code_position..] 
} - fn remaining_codes(&self, req: &Request) -> usize { + fn remaining_codes(&self, req: &Request) -> usize { req.pattern_codes.len() - self.code_position } - fn remaining_chars(&self, req: &Request) -> usize { + fn remaining_chars(&self, req: &Request) -> usize { req.end - self.string_position } - fn peek_char(&self, req: &Request) -> u32 { + fn peek_char(&self, req: &Request) -> u32 { req.string.peek(self.string_offset) } - fn skip_char(&mut self, req: &Request, skip: usize) { + fn skip_char(&mut self, req: &Request, skip: usize) { self.string_position += skip; self.string_offset = req.string.offset(self.string_offset, skip); } - fn back_peek_char(&self, req: &Request) -> u32 { + fn back_peek_char(&self, req: &Request) -> u32 { req.string.back_peek(self.string_offset) } - fn back_skip_char(&mut self, req: &Request, skip: usize) { + fn back_skip_char(&mut self, req: &Request, skip: usize) { self.string_position -= skip; self.string_offset = req.string.back_offset(self.string_offset, skip); } - fn peek_code(&self, req: &Request, peek: usize) -> u32 { + fn peek_code(&self, req: &Request, peek: usize) -> u32 { req.pattern_codes[self.code_position + peek] } @@ -1129,7 +1560,7 @@ impl MatchContext { self.code_position += skip; } - fn skip_code_from(&mut self, req: &Request, peek: usize) { + fn skip_code_from(&mut self, req: &Request, peek: usize) { self.skip_code(self.peek_code(req, peek) as usize + 1); } @@ -1138,15 +1569,19 @@ impl MatchContext { self.string_position == 0 } - fn at_end(&self, req: &Request) -> bool { + fn at_end(&self, req: &Request) -> bool { self.string_position == req.end } - fn at_linebreak(&self, req: &Request) -> bool { + fn at_linebreak(&self, req: &Request) -> bool { !self.at_end(req) && is_linebreak(self.peek_char(req)) } - fn at_boundary bool>(&self, req: &Request, mut word_checker: F) -> bool { + fn at_boundary bool>( + &self, + req: &Request, + mut word_checker: F, + ) -> bool { if self.at_beginning() && self.at_end(req) { return false; } 
@@ -1155,7 +1590,7 @@ impl MatchContext { this != that } - fn at_non_boundary bool>( + fn at_non_boundary bool>( &self, req: &Request, mut word_checker: F, @@ -1168,7 +1603,7 @@ impl MatchContext { this == that } - fn can_success(&self, req: &Request) -> bool { + fn can_success(&self, req: &Request) -> bool { if !self.toplevel { return true; } @@ -1181,51 +1616,29 @@ impl MatchContext { true } - fn success(&mut self) { - self.has_matched = Some(true); + #[must_use] + fn next_peek_from(&mut self, peek: usize, req: &Request, jump: Jump) -> Self { + self.next_offset(self.peek_code(req, peek) as usize + 1, jump) } - fn failure(&mut self) { - self.has_matched = Some(false); + #[must_use] + fn next_offset(&mut self, offset: usize, jump: Jump) -> Self { + self.next_at(self.code_position + offset, jump) } - fn next_from<'b>( - &mut self, - peek: usize, - req: &Request, - state: &'b mut State, - f: OpFunc, - ) -> &'b mut Self { - self.next_offset(self.peek_code(req, peek) as usize + 1, state, f) - } - - fn next_offset<'b>( - &mut self, - offset: usize, - state: &'b mut State, - f: OpFunc, - ) -> &'b mut Self { - self.next_at(self.code_position + offset, state, f) - } - - fn next_at<'b>( - &mut self, - code_position: usize, - state: &'b mut State, - f: OpFunc, - ) -> &'b mut Self { - self.handler = Some(f); - state.next_context.insert(MatchContext { + #[must_use] + fn next_at(&mut self, code_position: usize, jump: Jump) -> Self { + self.jump = jump; + MatchContext { code_position, - has_matched: None, - handler: None, + jump: Jump::OpCode, count: -1, ..*self - }) + } } } -fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { +fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), @@ -1243,9 +1656,10 @@ fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) - } } 
+/* fn general_op_literal bool>( req: &Request, - ctx: &mut MatchContext, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { @@ -1258,7 +1672,7 @@ fn general_op_literal bool>( fn general_op_in bool>( req: &Request, - ctx: &mut MatchContext, + ctx: &mut MatchContext, f: F, ) { if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { @@ -1271,8 +1685,8 @@ fn general_op_in bool>( fn general_op_groupref u32>( req: &Request, - state: &State, - ctx: &mut MatchContext, + state: &State, + ctx: &mut MatchContext, mut f: F, ) { let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); @@ -1301,6 +1715,7 @@ fn general_op_groupref u32>( ctx.skip_code(2); } +*/ fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) @@ -1433,8 +1848,8 @@ fn charset(set: &[u32], ch: u32) -> bool { fn _count( req: &Request, - state: &mut State, - mut ctx: MatchContext, + state: &mut State, + mut ctx: MatchContext, max_count: usize, ) -> usize { let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); @@ -1491,12 +1906,15 @@ fn _count( while count < max_count { ctx.code_position = reset_position; - let code = ctx.peek_code(req, 0); - let code = SreOpcode::try_from(code).unwrap(); - dispatch(req, state, &mut ctx, code); - if ctx.has_matched == Some(false) { + if !_match(req, state, ctx) { break; } + // let code = ctx.peek_code(req, 0); + // let code = SreOpcode::try_from(code).unwrap(); + // dispatch(req, state, &mut ctx, code); + // if ctx.has_matched == Some(false) { + // break; + // } count += 1; } return count; @@ -1509,7 +1927,7 @@ fn _count( fn general_count_literal bool>( req: &Request, - ctx: &mut MatchContext, + ctx: &mut MatchContext, end: usize, mut f: F, ) { diff --git a/tests/tests.rs b/tests/tests.rs index 5212226f4e..4e282a6f97 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -8,7 +8,7 @@ impl Pattern { fn state<'a, S: 
engine::StrDrive>( &self, string: S, - ) -> (engine::Request<'a, S>, engine::State) { + ) -> (engine::Request<'a, S>, engine::State) { let req = engine::Request::new(string, 0, usize::MAX, self.code, false); let state = engine::State::default(); (req, state) @@ -22,8 +22,7 @@ fn test_2427() { #[rustfmt::skip] let lookbehind = Pattern { code: &[15, 4, 0, 1, 1, 5, 5, 1, 17, 46, 1, 17, 120, 6, 10, 1] }; // END GENERATED let (req, mut state) = lookbehind.state("x"); - state.pymatch(req); - assert!(state.has_matched); + assert!(state.pymatch(&req)); } #[test] @@ -33,8 +32,7 @@ fn test_assert() { #[rustfmt::skip] let positive_lookbehind = Pattern { code: &[15, 4, 0, 3, 3, 4, 9, 3, 17, 97, 17, 98, 17, 99, 1, 17, 100, 17, 101, 17, 102, 1] }; // END GENERATED let (req, mut state) = positive_lookbehind.state("abcdef"); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); } #[test] @@ -44,8 +42,7 @@ fn test_string_boundaries() { #[rustfmt::skip] let big_b = Pattern { code: &[15, 4, 0, 0, 0, 6, 11, 1] }; // END GENERATED let (req, mut state) = big_b.state(""); - state.search(req); - assert!(!state.has_matched); + assert!(!state.search(req)); } #[test] @@ -56,7 +53,7 @@ fn test_zerowidth() { // END GENERATED let (mut req, mut state) = p.state("a:"); req.must_advance = true; - state.search(req); + assert!(state.search(req)); assert_eq!(state.string_position, 1); } @@ -68,8 +65,8 @@ fn test_repeat_context_panic() { #[rustfmt::skip] let p = Pattern { code: &[15, 4, 0, 0, 4294967295, 24, 25, 0, 4294967295, 27, 6, 0, 4294967295, 17, 97, 1, 24, 11, 0, 1, 18, 0, 17, 120, 17, 120, 18, 1, 20, 17, 122, 19, 1] }; // END GENERATED let (req, mut state) = p.state("axxzaz"); - state.pymatch(req); - assert_eq!(*state.marks, vec![Optioned::some(1), Optioned::some(3)]); + assert!(state.pymatch(&req)); + assert_eq!(*state.marks.raw(), vec![Optioned::some(1), Optioned::some(3)]); } #[test] @@ -79,7 +76,7 @@ fn test_double_max_until() { #[rustfmt::skip] let p = Pattern { 
code: &[15, 4, 0, 0, 4294967295, 24, 18, 0, 4294967295, 18, 0, 24, 9, 0, 1, 18, 2, 17, 49, 18, 3, 19, 18, 1, 19, 1] }; // END GENERATED let (req, mut state) = p.state("1111"); - state.pymatch(req); + assert!(state.pymatch(&req)); assert_eq!(state.string_position, 4); } @@ -90,7 +87,7 @@ fn test_info_single() { #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 1, 4294967295, 1, 1, 97, 0, 17, 97, 25, 6, 0, 4294967295, 17, 97, 1, 1] }; // END GENERATED let (req, mut state) = p.state("baaaa"); - state.search(req); + assert!(state.search(req)); assert_eq!(state.start, 1); assert_eq!(state.string_position, 5); } @@ -102,8 +99,7 @@ fn test_info_single2() { #[rustfmt::skip] let p = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; // END GENERATED let (req, mut state) = p.state("Perl"); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); } #[test] @@ -113,8 +109,7 @@ fn test_info_literal() { #[rustfmt::skip] let p = Pattern { code: &[15, 14, 1, 5, 4294967295, 4, 4, 97, 98, 97, 98, 0, 0, 1, 2, 17, 97, 17, 98, 17, 97, 17, 98, 25, 6, 1, 4294967295, 17, 99, 1, 1] }; // END GENERATED let (req, mut state) = p.state("!ababc"); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); } #[test] @@ -124,6 +119,5 @@ fn test_info_literal2() { #[rustfmt::skip] let p = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 112, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 112, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; // END GENERATED let (req, mut state) = p.state("pythonpython"); - state.search(req); - assert!(state.has_matched); + assert!(state.search(req)); } From 39c0106e873645ff8f212bcfa21c79216e5f7d89 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Mon, 11 Dec 2023 20:17:51 +0200 Subject: [PATCH 079/705] update to cpython 3.12 op code --- benches/benches.rs | 22 +++++++------- src/constants.rs | 
72 ++++++++++++++++++++++++---------------------- src/engine.rs | 4 ++- tests/tests.rs | 20 ++++++------- 4 files changed, 61 insertions(+), 57 deletions(-) diff --git a/benches/benches.rs b/benches/benches.rs index fe470d023c..f70138f920 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -30,47 +30,47 @@ fn benchmarks(b: &mut Bencher) { // # test common prefix // pattern p1 = re.compile('Python|Perl') # , 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p1 = Pattern { code: &[15, 8, 1, 4, 6, 1, 1, 80, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 1] }; + #[rustfmt::skip] let p1 = Pattern { code: &[14, 8, 1, 4, 6, 1, 1, 80, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 1] }; // END GENERATED // pattern p2 = re.compile('(Python|Perl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p2 = Pattern { code: &[15, 8, 1, 4, 6, 1, 0, 80, 0, 18, 0, 17, 80, 7, 13, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 11, 9, 17, 101, 17, 114, 17, 108, 16, 2, 0, 18, 1, 1] }; + #[rustfmt::skip] let p2 = Pattern { code: &[14, 8, 1, 4, 6, 1, 0, 80, 0, 17, 0, 16, 80, 7, 13, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 11, 9, 16, 101, 16, 114, 16, 108, 15, 2, 0, 17, 1, 1] }; // END GENERATED // pattern p3 = re.compile('Python|Perl|Tcl') #, 'Perl'), # Alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p3 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 1] }; + #[rustfmt::skip] let p3 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 
2, 0, 1] }; // END GENERATED // pattern p4 = re.compile('(Python|Perl|Tcl)') #, 'Perl'), # Grouped alternation // START GENERATED by generate_tests.py - #[rustfmt::skip] let p4 = Pattern { code: &[15, 9, 4, 3, 6, 17, 80, 17, 84, 0, 18, 0, 7, 15, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 16, 22, 11, 17, 80, 17, 101, 17, 114, 17, 108, 16, 11, 9, 17, 84, 17, 99, 17, 108, 16, 2, 0, 18, 1, 1] }; + #[rustfmt::skip] let p4 = Pattern { code: &[14, 9, 4, 3, 6, 16, 80, 16, 84, 0, 17, 0, 7, 15, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 15, 22, 11, 16, 80, 16, 101, 16, 114, 16, 108, 15, 11, 9, 16, 84, 16, 99, 16, 108, 15, 2, 0, 17, 1, 1] }; // END GENERATED // pattern p5 = re.compile('(Python)\\1') #, 'PythonPython'), # Backreference // START GENERATED by generate_tests.py - #[rustfmt::skip] let p5 = Pattern { code: &[15, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 12, 0, 1] }; + #[rustfmt::skip] let p5 = Pattern { code: &[14, 18, 1, 12, 12, 6, 0, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 11, 0, 1] }; // END GENERATED // pattern p6 = re.compile('([0a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # Disable the fastmap optimization // START GENERATED by generate_tests.py - #[rustfmt::skip] let p6 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 31, 1, 4294967295, 18, 0, 14, 7, 17, 48, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; + #[rustfmt::skip] let p6 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 31, 1, 4294967295, 17, 0, 13, 7, 16, 48, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] }; // END GENERATED // pattern p7 = re.compile('([a-z][a-z0-9]*,)+') #, 'a5,b7,c9,'), # A few sets // START GENERATED by generate_tests.py - #[rustfmt::skip] let p7 = Pattern { code: &[15, 4, 0, 2, 4294967295, 24, 29, 
1, 4294967295, 18, 0, 14, 5, 23, 97, 122, 0, 25, 13, 0, 4294967295, 14, 8, 23, 97, 122, 23, 48, 57, 0, 1, 17, 44, 18, 1, 19, 1] }; + #[rustfmt::skip] let p7 = Pattern { code: &[14, 4, 0, 2, 4294967295, 23, 29, 1, 4294967295, 17, 0, 13, 5, 22, 97, 122, 0, 24, 13, 0, 4294967295, 13, 8, 22, 97, 122, 22, 48, 57, 0, 1, 16, 44, 17, 1, 18, 1] }; // END GENERATED // pattern p8 = re.compile('Python') #, 'Python'), # Simple text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p8 = Pattern { code: &[15, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; + #[rustfmt::skip] let p8 = Pattern { code: &[14, 18, 3, 6, 6, 6, 6, 80, 121, 116, 104, 111, 110, 0, 0, 0, 0, 0, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] }; // END GENERATED // pattern p9 = re.compile('.*Python') #, 'Python'), # Bad text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p9 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 1] }; + #[rustfmt::skip] let p9 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 1] }; // END GENERATED // pattern p10 = re.compile('.*Python.*') #, 'Python'), # Worse text literal // START GENERATED by generate_tests.py - #[rustfmt::skip] let p10 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 0, 4294967295, 2, 1, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 25, 5, 0, 4294967295, 2, 1, 1] }; + #[rustfmt::skip] let p10 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 24, 5, 0, 4294967295, 2, 1, 1] }; // END GENERATED // pattern p11 = re.compile('.*(Python)') #, 'Python'), # Bad text literal with grouping // START GENERATED by generate_tests.py - #[rustfmt::skip] let p11 = Pattern { code: &[15, 4, 0, 6, 4294967295, 25, 5, 
0, 4294967295, 2, 1, 18, 0, 17, 80, 17, 121, 17, 116, 17, 104, 17, 111, 17, 110, 18, 1, 1] }; + #[rustfmt::skip] let p11 = Pattern { code: &[14, 4, 0, 6, 4294967295, 24, 5, 0, 4294967295, 2, 1, 17, 0, 16, 80, 16, 121, 16, 116, 16, 104, 16, 111, 16, 110, 17, 1, 1] }; // END GENERATED let tests = [ diff --git a/src/constants.rs b/src/constants.rs index 0d5bb41939..dc61c33b2c 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -13,7 +13,7 @@ use bitflags::bitflags; -pub const SRE_MAGIC: usize = 20171005; +pub const SRE_MAGIC: usize = 20221023; #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] #[allow(non_camel_case_types, clippy::upper_case_acronyms)] @@ -26,39 +26,41 @@ pub enum SreOpcode { ASSERT_NOT = 5, AT = 6, BRANCH = 7, - CALL = 8, - CATEGORY = 9, - CHARSET = 10, - BIGCHARSET = 11, - GROUPREF = 12, - GROUPREF_EXISTS = 13, - IN = 14, - INFO = 15, - JUMP = 16, - LITERAL = 17, - MARK = 18, - MAX_UNTIL = 19, - MIN_UNTIL = 20, - NOT_LITERAL = 21, - NEGATE = 22, - RANGE = 23, - REPEAT = 24, - REPEAT_ONE = 25, - SUBPATTERN = 26, - MIN_REPEAT_ONE = 27, - GROUPREF_IGNORE = 28, - IN_IGNORE = 29, - LITERAL_IGNORE = 30, - NOT_LITERAL_IGNORE = 31, - GROUPREF_LOC_IGNORE = 32, - IN_LOC_IGNORE = 33, - LITERAL_LOC_IGNORE = 34, - NOT_LITERAL_LOC_IGNORE = 35, - GROUPREF_UNI_IGNORE = 36, - IN_UNI_IGNORE = 37, - LITERAL_UNI_IGNORE = 38, - NOT_LITERAL_UNI_IGNORE = 39, - RANGE_UNI_IGNORE = 40, + CATEGORY = 8, + CHARSET = 9, + BIGCHARSET = 10, + GROUPREF = 11, + GROUPREF_EXISTS = 12, + IN = 13, + INFO = 14, + JUMP = 15, + LITERAL = 16, + MARK = 17, + MAX_UNTIL = 18, + MIN_UNTIL = 19, + NOT_LITERAL = 20, + NEGATE = 21, + RANGE = 22, + REPEAT = 23, + REPEAT_ONE = 24, + SUBPATTERN = 25, + MIN_REPEAT_ONE = 26, + ATOMIC_GROUP = 27, + POSSESSIVE_REPEAT = 28, + POSSESSIVE_REPEAT_ONE = 29, + GROUPREF_IGNORE = 30, + IN_IGNORE = 31, + LITERAL_IGNORE = 32, + NOT_LITERAL_IGNORE = 33, + GROUPREF_LOC_IGNORE = 34, + IN_LOC_IGNORE = 35, + LITERAL_LOC_IGNORE = 36, + NOT_LITERAL_LOC_IGNORE 
= 37, + GROUPREF_UNI_IGNORE = 38, + IN_UNI_IGNORE = 39, + LITERAL_UNI_IGNORE = 40, + NOT_LITERAL_UNI_IGNORE = 41, + RANGE_UNI_IGNORE = 42, } #[derive(num_enum::TryFromPrimitive, Debug)] #[repr(u32)] @@ -101,7 +103,7 @@ pub enum SreCatCode { UNI_NOT_LINEBREAK = 17, } bitflags! { - #[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct SreFlag: u16 { const TEMPLATE = 1; const IGNORECASE = 2; diff --git a/src/engine.rs b/src/engine.rs index 7474f29013..e44a1f4a09 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -759,13 +759,15 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code_from(req, 2) } } - SreOpcode::CALL => todo!(), SreOpcode::CHARSET => todo!(), SreOpcode::BIGCHARSET => todo!(), SreOpcode::NEGATE => todo!(), SreOpcode::RANGE => todo!(), SreOpcode::RANGE_UNI_IGNORE => todo!(), SreOpcode::SUBPATTERN => todo!(), + SreOpcode::ATOMIC_GROUP => todo!(), + SreOpcode::POSSESSIVE_REPEAT => todo!(), + SreOpcode::POSSESSIVE_REPEAT_ONE => todo!(), } } }; diff --git a/tests/tests.rs b/tests/tests.rs index 4e282a6f97..0a1dc407fc 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -19,7 +19,7 @@ impl Pattern { fn test_2427() { // pattern lookbehind = re.compile(r'(? 
Date: Mon, 11 Dec 2023 22:28:36 +0200 Subject: [PATCH 080/705] fix _count general case --- src/engine.rs | 18 ++++++++---------- tests/tests.rs | 10 ++++++++++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index e44a1f4a09..3a24fe812b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1904,19 +1904,17 @@ fn _count( /* General case */ let mut count = 0; - let reset_position = ctx.code_position; - while count < max_count { - ctx.code_position = reset_position; - if !_match(req, state, ctx) { + let sub_ctx = MatchContext { + toplevel: true, + jump: Jump::OpCode, + repeat_ctx_id: usize::MAX, + count: -1, + ..ctx + }; + if !_match(req, state, sub_ctx) { break; } - // let code = ctx.peek_code(req, 0); - // let code = SreOpcode::try_from(code).unwrap(); - // dispatch(req, state, &mut ctx, code); - // if ctx.has_matched == Some(false) { - // break; - // } count += 1; } return count; diff --git a/tests/tests.rs b/tests/tests.rs index 0a1dc407fc..a452efb740 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -121,3 +121,13 @@ fn test_info_literal2() { let (req, mut state) = p.state("pythonpython"); assert!(state.search(req)); } + +#[test] +fn test_repeat_in_assertions() { + // pattern p = re.compile('^([ab]*?)(?=(b)?)c', re.IGNORECASE) + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 1, 4294967295, 6, 0, 17, 0, 26, 10, 0, 4294967295, 39, 5, 22, 97, 98, 0, 1, 17, 1, 4, 14, 0, 23, 9, 0, 1, 17, 2, 40, 98, 17, 3, 18, 1, 40, 99, 1] }; + // END GENERATED + let (req, mut state) = p.state("abc"); + assert!(state.search(req)); +} \ No newline at end of file From 99ed744c57c7aa8e3a0d33bf7893b26d5be86434 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 21:53:30 +0200 Subject: [PATCH 081/705] impl atomic group & possessive repeat --- src/engine.rs | 108 +++++++++++++++++++++++++++++++++++++++++++++++-- tests/tests.rs | 32 +++++++++++++++ 2 files changed, 136 insertions(+), 4 
deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 3a24fe812b..daed81212f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -267,6 +267,11 @@ enum Jump { RepeatOne2, MinRepeatOne1, MinRepeatOne2, + AtomicGroup1, + PossessiveRepeat1, + PossessiveRepeat2, + PossessiveRepeat3, + PossessiveRepeat4, } fn _match(req: &Request, state: &mut State, ctx: MatchContext) -> bool { @@ -445,6 +450,73 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.jump = Jump::MinRepeatOne1; continue 'context; } + Jump::AtomicGroup1 => { + if popped_result { + ctx.skip_code_from(req, 1); + ctx.string_position = state.string_position; + ctx.string_offset = req.string.offset(0, state.string_position); + // dispatch opcode + } else { + state.string_position = ctx.string_position; + break 'result false; + } + } + Jump::PossessiveRepeat1 => { + let min_count = ctx.peek_code(req, 2) as isize; + if ctx.count < min_count { + break 'context ctx.next_offset(4, Jump::PossessiveRepeat2); + } + // zero match protection + ctx.string_position = usize::MAX; + ctx.jump = Jump::PossessiveRepeat3; + continue 'context; + } + Jump::PossessiveRepeat2 => { + if popped_result { + ctx.count += 1; + ctx.jump = Jump::PossessiveRepeat1; + continue 'context; + } else { + state.string_position = ctx.string_position; + break 'result false; + } + } + Jump::PossessiveRepeat3 => { + let max_count = ctx.peek_code(req, 3) as usize; + if ((ctx.count as usize) < max_count || max_count == MAXREPEAT) + && ctx.string_position != state.string_position + { + state.marks.push(); + ctx.string_position = state.string_position; + ctx.string_offset = req.string.offset(0, state.string_position); + break 'context ctx.next_offset(4, Jump::PossessiveRepeat4); + } + ctx.string_position = state.string_position; + ctx.string_offset = req.string.offset(0, state.string_position); + // popped_result = false; + // ctx.jump = Jump::PossessiveRepeat4; + // continue 'context; + ctx.skip_code_from(req, 1); + 
ctx.skip_code(1); + // if ctx.remaining_codes(req) > 1 && ctx.toplevel { + // ctx.skip_code(1); + // } + } + Jump::PossessiveRepeat4 => { + if popped_result { + state.marks.pop_discard(); + ctx.count += 1; + ctx.jump = Jump::PossessiveRepeat3; + continue 'context; + } + state.marks.pop(); + state.string_position = ctx.string_position; + ctx.skip_code_from(req, 1); + ctx.skip_code(1); + // if ctx.remaining_codes(req) > 1 && ctx.toplevel { + // ctx.skip_code(1); + // } + } } ctx.jump = Jump::OpCode; @@ -759,15 +831,43 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code_from(req, 2) } } + /* pattern tail */ + SreOpcode::ATOMIC_GROUP => { + state.string_position = ctx.string_position; + break 'context ctx.next_offset(2, Jump::AtomicGroup1); + } + /* <1=min> <2=max> pattern + tail */ + SreOpcode::POSSESSIVE_REPEAT => { + state.string_position = ctx.string_position; + ctx.count = 0; + ctx.jump = Jump::PossessiveRepeat1; + continue 'context; + } + /* <1=min> <2=max> item + tail */ + SreOpcode::POSSESSIVE_REPEAT_ONE => { + let min_count = ctx.peek_code(req, 2) as usize; + let max_count = ctx.peek_code(req, 3) as usize; + if ctx.remaining_chars(req) < min_count { + break 'result false; + } + state.string_position = ctx.string_position; + let mut count_ctx = ctx; + count_ctx.skip_code(4); + let count = _count(req, state, count_ctx, max_count); + if count < min_count { + break 'result false; + } + ctx.skip_char(req, count); + ctx.skip_code_from(req, 1); + } SreOpcode::CHARSET => todo!(), SreOpcode::BIGCHARSET => todo!(), SreOpcode::NEGATE => todo!(), SreOpcode::RANGE => todo!(), SreOpcode::RANGE_UNI_IGNORE => todo!(), SreOpcode::SUBPATTERN => todo!(), - SreOpcode::ATOMIC_GROUP => todo!(), - SreOpcode::POSSESSIVE_REPEAT => todo!(), - SreOpcode::POSSESSIVE_REPEAT_ONE => todo!(), } } }; @@ -1906,7 +2006,7 @@ fn _count( while count < max_count { let sub_ctx = MatchContext { - toplevel: true, + toplevel: false, jump: Jump::OpCode, repeat_ctx_id: 
usize::MAX, count: -1, diff --git a/tests/tests.rs b/tests/tests.rs index a452efb740..4c56c42fae 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -130,4 +130,36 @@ fn test_repeat_in_assertions() { // END GENERATED let (req, mut state) = p.state("abc"); assert!(state.search(req)); +} + +#[test] +fn test_possessive_quantifier() { + // pattern p = re.compile('e++a') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 2, 4294967295, 29, 6, 1, 4294967295, 16, 101, 1, 16, 97, 1] }; + // END GENERATED + let (req, mut state) = p.state("eeea"); + assert!(state.pymatch(&req)); +} + +#[test] +fn test_possessive_atomic_group() { + // pattern p = re.compile('(?>x)++x') + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 2, 4294967295, 28, 8, 1, 4294967295, 27, 4, 16, 120, 1, 1, 16, 120, 1] }; + // END GENERATED + let (req, mut state) = p.state("xxx"); + assert!(!state.pymatch(&req)); +} + +#[test] +fn test_bug_20998() { + // pattern p = re.compile('[a-c]+', re.I) + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 1, 4294967295, 24, 10, 1, 4294967295, 39, 5, 22, 97, 99, 0, 1, 1] }; + // END GENERATED + let (mut req, mut state) = p.state("ABC"); + req.match_all = true; + assert!(state.pymatch(&req)); + assert_eq!(state.string_position, 3); } \ No newline at end of file From 9378497346147500328d6b96e59386caecbfbeab Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 21:59:38 +0200 Subject: [PATCH 082/705] clearup --- src/engine.rs | 592 ++------------------------------------------------ 1 file changed, 14 insertions(+), 578 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index daed81212f..3486c52d9f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -493,14 +493,8 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - } ctx.string_position = state.string_position; ctx.string_offset = req.string.offset(0, 
state.string_position); - // popped_result = false; - // ctx.jump = Jump::PossessiveRepeat4; - // continue 'context; ctx.skip_code_from(req, 1); ctx.skip_code(1); - // if ctx.remaining_codes(req) > 1 && ctx.toplevel { - // ctx.skip_code(1); - // } } Jump::PossessiveRepeat4 => { if popped_result { @@ -513,9 +507,6 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - state.string_position = ctx.string_position; ctx.skip_code_from(req, 1); ctx.skip_code(1); - // if ctx.remaining_codes(req) > 1 && ctx.toplevel { - // ctx.skip_code(1); - // } } } ctx.jump = Jump::OpCode; @@ -603,6 +594,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code(1); ctx.skip_char(req, 1); } + /* */ SreOpcode::ASSERT => { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { @@ -615,6 +607,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - state.string_position = next_ctx.string_position; break 'context next_ctx; } + /* */ SreOpcode::ASSERT_NOT => { let back = ctx.peek_code(req, 2) as usize; if ctx.string_position < back { @@ -636,6 +629,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - break 'result false; } } + // <0=skip> code ... 
SreOpcode::BRANCH => { state.marks.push(); ctx.count = 1; @@ -672,6 +666,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code(2); } SreOpcode::JUMP => ctx.skip_code_from(req, 1), + /* <1=min> <2=max> item tail */ SreOpcode::REPEAT => { let repeat_ctx = RepeatContext { count: -1, @@ -736,6 +731,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - next_ctx.repeat_ctx_id = repeat_ctx.prev_id; break 'context next_ctx; } + /* <1=min> <2=max> item tail */ SreOpcode::REPEAT_ONE => { let min_count = ctx.peek_code(req, 2) as usize; let max_count = ctx.peek_code(req, 3) as usize; @@ -766,6 +762,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.jump = Jump::RepeatOne1; continue 'context; } + /* <1=min> <2=max> item tail */ SreOpcode::MIN_REPEAT_ONE => { let min_count = ctx.peek_code(req, 2) as usize; if ctx.remaining_chars(req) < min_count { @@ -862,12 +859,14 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_char(req, count); ctx.skip_code_from(req, 1); } - SreOpcode::CHARSET => todo!(), - SreOpcode::BIGCHARSET => todo!(), - SreOpcode::NEGATE => todo!(), - SreOpcode::RANGE => todo!(), - SreOpcode::RANGE_UNI_IGNORE => todo!(), - SreOpcode::SUBPATTERN => todo!(), + SreOpcode::CHARSET + | SreOpcode::BIGCHARSET + | SreOpcode::NEGATE + | SreOpcode::RANGE + | SreOpcode::RANGE_UNI_IGNORE + | SreOpcode::SUBPATTERN => { + unreachable!("unexpected opcode on main dispatch") + } } } }; @@ -879,123 +878,6 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - popped_result } -/* -fn dispatch( - req: &Request, - state: &mut State, - ctx: &mut MatchContext, - opcode: SreOpcode, -) { - match opcode { - SreOpcode::FAILURE => { - ctx.failure(); - } - SreOpcode::SUCCESS => { - if ctx.can_success(req) { - state.string_position = ctx.string_position; - ctx.success(); - } else { - ctx.failure(); - } - } - SreOpcode::ANY => { - if ctx.at_end(req) || ctx.at_linebreak(req) { - 
ctx.failure(); - } else { - ctx.skip_code(1); - ctx.skip_char(req, 1); - } - } - SreOpcode::ANY_ALL => { - if ctx.at_end(req) { - ctx.failure(); - } else { - ctx.skip_code(1); - ctx.skip_char(req, 1); - } - } - SreOpcode::ASSERT => op_assert(req, state, ctx), - SreOpcode::ASSERT_NOT => op_assert_not(req, state, ctx), - SreOpcode::AT => { - let atcode = SreAtCode::try_from(ctx.peek_code(req, 1)).unwrap(); - if at(req, ctx, atcode) { - ctx.skip_code(2); - } else { - ctx.failure(); - } - } - SreOpcode::BRANCH => op_branch(req, state, ctx), - SreOpcode::CATEGORY => { - let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); - if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) { - ctx.failure(); - } else { - ctx.skip_code(2); - ctx.skip_char(req, 1); - } - } - SreOpcode::IN => general_op_in(req, ctx, charset), - SreOpcode::IN_IGNORE => general_op_in(req, ctx, |set, c| charset(set, lower_ascii(c))), - SreOpcode::IN_UNI_IGNORE => { - general_op_in(req, ctx, |set, c| charset(set, lower_unicode(c))) - } - SreOpcode::IN_LOC_IGNORE => general_op_in(req, ctx, charset_loc_ignore), - SreOpcode::INFO => { - let min = ctx.peek_code(req, 3) as usize; - if ctx.remaining_chars(req) < min { - ctx.failure(); - } else { - ctx.skip_code_from(req, 1); - } - } - SreOpcode::JUMP => ctx.skip_code_from(req, 1), - SreOpcode::LITERAL => general_op_literal(req, ctx, |code, c| code == c), - SreOpcode::NOT_LITERAL => general_op_literal(req, ctx, |code, c| code != c), - SreOpcode::LITERAL_IGNORE => general_op_literal(req, ctx, |code, c| code == lower_ascii(c)), - SreOpcode::NOT_LITERAL_IGNORE => { - general_op_literal(req, ctx, |code, c| code != lower_ascii(c)) - } - SreOpcode::LITERAL_UNI_IGNORE => { - general_op_literal(req, ctx, |code, c| code == lower_unicode(c)) - } - SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_op_literal(req, ctx, |code, c| code != lower_unicode(c)) - } - SreOpcode::LITERAL_LOC_IGNORE => general_op_literal(req, ctx, char_loc_ignore), - 
SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_op_literal(req, ctx, |code, c| !char_loc_ignore(code, c)) - } - SreOpcode::MARK => { - state - .marks - .set(ctx.peek_code(req, 1) as usize, ctx.string_position); - ctx.skip_code(2); - } - SreOpcode::MAX_UNTIL => op_max_until(state, ctx), - SreOpcode::MIN_UNTIL => op_min_until(state, ctx), - SreOpcode::REPEAT => op_repeat(req, state, ctx), - SreOpcode::REPEAT_ONE => op_repeat_one(req, state, ctx), - SreOpcode::MIN_REPEAT_ONE => op_min_repeat_one(req, state, ctx), - SreOpcode::GROUPREF => general_op_groupref(req, state, ctx, |x| x), - SreOpcode::GROUPREF_IGNORE => general_op_groupref(req, state, ctx, lower_ascii), - SreOpcode::GROUPREF_LOC_IGNORE => general_op_groupref(req, state, ctx, lower_locate), - SreOpcode::GROUPREF_UNI_IGNORE => general_op_groupref(req, state, ctx, lower_unicode), - SreOpcode::GROUPREF_EXISTS => { - let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); - if group_start.is_some() - && group_end.is_some() - && group_start.unpack() <= group_end.unpack() - { - ctx.skip_code(3); - } else { - ctx.skip_code_from(req, 2) - } - } - _ => unreachable!("unexpected opcode"), - } -} -*/ - fn search_info_literal( req: &mut Request, state: &mut State, @@ -1142,224 +1024,6 @@ fn search_info_charset( } } -/* -/* assert subpattern */ -/* */ -fn op_assert(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let back = ctx.peek_code(req, 2) as usize; - if ctx.string_position < back { - return ctx.failure(); - } - - let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { - if state.popped_has_matched { - ctx.skip_code_from(req, 1); - } else { - ctx.failure(); - } - }); - next_ctx.toplevel = false; - next_ctx.back_skip_char(req, back); - state.string_position = next_ctx.string_position; -} - -/* assert not subpattern */ -/* */ -fn op_assert_not(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let back = ctx.peek_code(req, 2) as usize; - - if ctx.string_position < 
back { - return ctx.skip_code_from(req, 1); - } - - let next_ctx = ctx.next_offset(3, state, |req, state, ctx| { - if state.popped_has_matched { - ctx.failure(); - } else { - ctx.skip_code_from(req, 1); - } - }); - next_ctx.toplevel = false; - next_ctx.back_skip_char(req, back); - state.string_position = next_ctx.string_position; -} - -// alternation -// <0=skip> code ... -fn op_branch(req: &Request, state: &mut State, ctx: &mut MatchContext) { - state.marks.push(); - - ctx.count = 1; - create_context(req, state, ctx); - - fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let branch_offset = ctx.count as usize; - let next_length = ctx.peek_code(req, branch_offset) as isize; - if next_length == 0 { - state.marks.pop_discard(); - return ctx.failure(); - } - - state.string_position = ctx.string_position; - - ctx.count += next_length; - ctx.next_offset(branch_offset + 1, state, callback); - } - - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { - if state.popped_has_matched { - return ctx.success(); - } - state.marks.pop_keep(); - create_context(req, state, ctx); - } -} - -/* <1=min> <2=max> item tail */ -fn op_min_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let min_count = ctx.peek_code(req, 2) as usize; - - if ctx.remaining_chars(req) < min_count { - return ctx.failure(); - } - - state.string_position = ctx.string_position; - - ctx.count = if min_count == 0 { - 0 - } else { - let mut next_ctx = *ctx; - next_ctx.skip_code(4); - let count = _count(req, state, next_ctx, min_count); - if count < min_count { - return ctx.failure(); - } - ctx.skip_char(req, count); - count as isize - }; - - let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { - // tail is empty. 
we're finished - state.string_position = ctx.string_position; - return ctx.success(); - } - - state.marks.push(); - create_context(req, state, ctx); - - fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let max_count = ctx.peek_code(req, 3) as usize; - - if max_count == MAXREPEAT || ctx.count as usize <= max_count { - state.string_position = ctx.string_position; - ctx.next_peek_from(1, req, state, callback); - } else { - state.marks.pop_discard(); - ctx.failure(); - } - } - - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { - if state.popped_has_matched { - return ctx.success(); - } - - state.string_position = ctx.string_position; - - let mut next_ctx = *ctx; - next_ctx.skip_code(4); - if _count(req, state, next_ctx, 1) == 0 { - state.marks.pop_discard(); - return ctx.failure(); - } - - ctx.skip_char(req, 1); - ctx.count += 1; - state.marks.pop_keep(); - create_context(req, state, ctx); - } -} - -/* match repeated sequence (maximizing regexp) */ -/* this operator only works if the repeated item is -exactly one character wide, and we're not already -collecting backtracking points. for other cases, -use the MAX_REPEAT operator */ -/* <1=min> <2=max> item tail */ -fn op_repeat_one(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let min_count = ctx.peek_code(req, 2) as usize; - let max_count = ctx.peek_code(req, 3) as usize; - - if ctx.remaining_chars(req) < min_count { - return ctx.failure(); - } - - state.string_position = ctx.string_position; - - let mut next_ctx = *ctx; - next_ctx.skip_code(4); - let count = _count(req, state, next_ctx, max_count); - ctx.skip_char(req, count); - if count < min_count { - return ctx.failure(); - } - - let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); - if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { - // tail is empty. 
we're finished - state.string_position = ctx.string_position; - return ctx.success(); - } - - state.marks.push(); - ctx.count = count as isize; - create_context(req, state, ctx); - - fn create_context(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let min_count = ctx.peek_code(req, 2) as isize; - let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); - if next_code == SreOpcode::LITERAL as u32 { - // Special case: Tail starts with a literal. Skip positions where - // the rest of the pattern cannot possibly match. - let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); - while ctx.at_end(req) || ctx.peek_char(req) != c { - if ctx.count <= min_count { - state.marks.pop_discard(); - return ctx.failure(); - } - ctx.back_skip_char(req, 1); - ctx.count -= 1; - } - } - - state.string_position = ctx.string_position; - - // General case: backtracking - ctx.next_peek_from(1, req, state, callback); - } - - fn callback(req: &Request, state: &mut State, ctx: &mut MatchContext) { - if state.popped_has_matched { - return ctx.success(); - } - - let min_count = ctx.peek_code(req, 2) as isize; - - if ctx.count <= min_count { - state.marks.pop_discard(); - return ctx.failure(); - } - - ctx.back_skip_char(req, 1); - ctx.count -= 1; - - state.marks.pop_keep(); - create_context(req, state, ctx); - } -} -*/ - #[derive(Debug, Clone, Copy)] struct RepeatContext { count: isize, @@ -1370,173 +1034,6 @@ struct RepeatContext { prev_id: usize, } -/* -/* create repeat context. 
all the hard work is done -by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ -/* <1=min> <2=max> item tail */ -fn op_repeat(req: &Request, state: &mut State, ctx: &mut MatchContext) { - let repeat_ctx = RepeatContext { - count: -1, - min_count: ctx.peek_code(req, 2) as usize, - max_count: ctx.peek_code(req, 3) as usize, - code_position: ctx.code_position, - last_position: std::usize::MAX, - prev_id: ctx.repeat_ctx_id, - }; - - state.repeat_stack.push(repeat_ctx); - - state.string_position = ctx.string_position; - - let repeat_ctx_id = state.repeat_stack.len() - 1; - - let next_ctx = ctx.next_peek_from(1, req, state, |_, state, ctx| { - ctx.has_matched = Some(state.popped_has_matched); - state.repeat_stack.pop(); - }); - next_ctx.repeat_ctx_id = repeat_ctx_id; -} - -/* minimizing repeat */ -fn op_min_until(state: &mut State, ctx: &mut MatchContext) { - let repeat_ctx = state.repeat_stack.last_mut().unwrap(); - - state.string_position = ctx.string_position; - - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { - if state.popped_has_matched { - ctx.success(); - } else { - state.repeat_stack[ctx.repeat_ctx_id].count -= 1; - state.string_position = ctx.string_position; - ctx.failure(); - } - }); - return; - } - - state.marks.push(); - - ctx.count = ctx.repeat_ctx_id as isize; - - let repeat_ctx_prev_id = repeat_ctx.prev_id; - - // see if the tail matches - let next_ctx = ctx.next_offset(1, state, |_, state, ctx| { - if state.popped_has_matched { - return ctx.success(); - } - - ctx.repeat_ctx_id = ctx.count as usize; - - let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - - state.string_position = ctx.string_position; - - state.marks.pop(); - - // match more until tail matches - - if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT - || state.string_position == repeat_ctx.last_position - { - 
repeat_ctx.count -= 1; - return ctx.failure(); - } - - /* zero-width match protection */ - repeat_ctx.last_position = state.string_position; - - ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { - if state.popped_has_matched { - ctx.success(); - } else { - state.repeat_stack[ctx.repeat_ctx_id].count -= 1; - state.string_position = ctx.string_position; - ctx.failure(); - } - }); - }); - next_ctx.repeat_ctx_id = repeat_ctx_prev_id; -} - -/* maximizing repeat */ -fn op_max_until(state: &mut State, ctx: &mut MatchContext) { - let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - - state.string_position = ctx.string_position; - - repeat_ctx.count += 1; - - if (repeat_ctx.count as usize) < repeat_ctx.min_count { - // not enough matches - ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { - if state.popped_has_matched { - ctx.success(); - } else { - state.repeat_stack[ctx.repeat_ctx_id].count -= 1; - state.string_position = ctx.string_position; - ctx.failure(); - } - }); - return; - } - - if ((repeat_ctx.count as usize) < repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) - && state.string_position != repeat_ctx.last_position - { - /* we may have enough matches, but if we can - match another item, do so */ - state.marks.push(); - - ctx.count = repeat_ctx.last_position as isize; - repeat_ctx.last_position = state.string_position; - - ctx.next_at(repeat_ctx.code_position + 4, state, |_, state, ctx| { - let save_last_position = ctx.count as usize; - let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - repeat_ctx.last_position = save_last_position; - - if state.popped_has_matched { - state.marks.pop_discard(); - return ctx.success(); - } - - state.marks.pop(); - repeat_ctx.count -= 1; - - state.string_position = ctx.string_position; - - /* cannot match more repeated items here. 
make sure the - tail matches */ - let repeat_ctx_prev_id = repeat_ctx.prev_id; - let next_ctx = ctx.next_offset(1, state, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx_prev_id; - }); - return; - } - - /* cannot match more repeated items here. make sure the - tail matches */ - let repeat_ctx_prev_id = repeat_ctx.prev_id; - let next_ctx = ctx.next_offset(1, state, tail_callback); - next_ctx.repeat_ctx_id = repeat_ctx_prev_id; - - fn tail_callback(_: &Request, state: &mut State, ctx: &mut MatchContext) { - if state.popped_has_matched { - ctx.success(); - } else { - state.string_position = ctx.string_position; - ctx.failure(); - } - } -} -*/ - pub trait StrDrive: Copy { fn offset(&self, offset: usize, skip: usize) -> usize; fn count(&self) -> usize; @@ -1758,67 +1255,6 @@ fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> b } } -/* -fn general_op_literal bool>( - req: &Request, - ctx: &mut MatchContext, - f: F, -) { - if ctx.at_end(req) || !f(ctx.peek_code(req, 1), ctx.peek_char(req)) { - ctx.failure(); - } else { - ctx.skip_code(2); - ctx.skip_char(req, 1); - } -} - -fn general_op_in bool>( - req: &Request, - ctx: &mut MatchContext, - f: F, -) { - if ctx.at_end(req) || !f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { - ctx.failure(); - } else { - ctx.skip_code_from(req, 1); - ctx.skip_char(req, 1); - } -} - -fn general_op_groupref u32>( - req: &Request, - state: &State, - ctx: &mut MatchContext, - mut f: F, -) { - let (group_start, group_end) = state.marks.get(ctx.peek_code(req, 1) as usize); - let (group_start, group_end) = if group_start.is_some() - && group_end.is_some() - && group_start.unpack() <= group_end.unpack() - { - (group_start.unpack(), group_end.unpack()) - } else { - return ctx.failure(); - }; - - let mut gctx = MatchContext { - string_position: group_start, - string_offset: req.string.offset(0, group_start), - ..*ctx - }; - - for _ in group_start..group_end { - if ctx.at_end(req) || f(ctx.peek_char(req)) != f(gctx.peek_char(req)) { 
- return ctx.failure(); - } - ctx.skip_char(req, 1); - gctx.skip_char(req, 1); - } - - ctx.skip_code(2); -} -*/ - fn char_loc_ignore(code: u32, c: u32) -> bool { code == c || code == lower_locate(c) || code == upper_locate(c) } From 003c45dbffbfefe5a7a47899836d042bddfbb8a9 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 22:00:35 +0200 Subject: [PATCH 083/705] remove unneccesary INFO logic on main dispatch --- src/engine.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 3486c52d9f..5cf79ea147 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -652,20 +652,13 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - general_op_in!(|set, c| charset(set, lower_unicode(c))) } SreOpcode::IN_LOC_IGNORE => general_op_in!(charset_loc_ignore), - SreOpcode::INFO => { - let min = ctx.peek_code(req, 3) as usize; - if ctx.remaining_chars(req) < min { - break 'result false; - } - ctx.skip_code_from(req, 1); - } SreOpcode::MARK => { state .marks .set(ctx.peek_code(req, 1) as usize, ctx.string_position); ctx.skip_code(2); } - SreOpcode::JUMP => ctx.skip_code_from(req, 1), + SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(req, 1), /* <1=min> <2=max> item tail */ SreOpcode::REPEAT => { let repeat_ctx = RepeatContext { From 41bdcfe2212d08c4068595babe3c497cd2a035fb Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 22:13:54 +0200 Subject: [PATCH 084/705] bump version to 0.5.0 --- Cargo.toml | 2 +- src/engine.rs | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index de1d68cf6d..b0ec8eab2d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.4.3" +version = "0.5.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/src/engine.rs b/src/engine.rs 
index 5cf79ea147..5ee9af7d1b 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,8 +1,6 @@ // good luck to those that follow; here be dragons -use crate::constants::SreInfo; - -use super::constants::{SreAtCode, SreCatCode, SreOpcode}; +use super::constants::{SreAtCode, SreCatCode, SreOpcode, SreInfo}; use super::MAXREPEAT; use optional::Optioned; use std::convert::TryFrom; @@ -284,7 +282,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - }; popped_result = 'result: loop { - let yield_ = 'context: loop { + let yielded = 'context: loop { match ctx.jump { Jump::OpCode => {} Jump::Assert1 => { @@ -864,7 +862,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - } }; context_stack.push(ctx); - context_stack.push(yield_); + context_stack.push(yielded); continue 'coro; }; } From 169368b7f06fd502a5a77fd56cf5b7ec46e3a975 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 13 Dec 2023 22:43:17 +0200 Subject: [PATCH 085/705] fix some clippy --- src/engine.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 5ee9af7d1b..b6bf6a6fb6 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -142,7 +142,7 @@ impl State { repeat_ctx_id: usize::MAX, count: -1, }; - _match(&req, self, ctx) + _match(req, self, ctx) } pub fn search(&mut self, mut req: Request) -> bool { @@ -1400,16 +1400,16 @@ fn _count( } } SreOpcode::LITERAL => { - general_count_literal(req, &mut ctx, end, |code, c| code == c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code == c); } SreOpcode::NOT_LITERAL => { - general_count_literal(req, &mut ctx, end, |code, c| code != c as u32); + general_count_literal(req, &mut ctx, end, |code, c| code != c); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c) as u32); + general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c)); } SreOpcode::NOT_LITERAL_IGNORE => { - 
general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c) as u32); + general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c)); } SreOpcode::LITERAL_LOC_IGNORE => { general_count_literal(req, &mut ctx, end, char_loc_ignore); @@ -1419,12 +1419,12 @@ fn _count( } SreOpcode::LITERAL_UNI_IGNORE => { general_count_literal(req, &mut ctx, end, |code, c| { - code == lower_unicode(c) as u32 + code == lower_unicode(c) }); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { general_count_literal(req, &mut ctx, end, |code, c| { - code != lower_unicode(c) as u32 + code != lower_unicode(c) }); } _ => { From 454aa4b6544cc53b3443328015ff923103f906c4 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 3 Jan 2024 15:34:01 +0200 Subject: [PATCH 086/705] fix count not advance --- src/engine.rs | 28 ++++++++++++++-------------- tests/tests.rs | 13 ++++++++++++- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index b6bf6a6fb6..86b8d20d8a 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,6 +1,6 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreOpcode, SreInfo}; +use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; use super::MAXREPEAT; use optional::Optioned; use std::convert::TryFrom; @@ -1418,31 +1418,31 @@ fn _count( general_count_literal(req, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| { - code == lower_unicode(c) - }); + general_count_literal(req, &mut ctx, end, |code, c| code == lower_unicode(c)); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| { - code != lower_unicode(c) - }); + general_count_literal(req, &mut ctx, end, |code, c| code != lower_unicode(c)); } _ => { /* General case */ let mut count = 0; + let mut sub_ctx = MatchContext { + toplevel: false, + jump: Jump::OpCode, + repeat_ctx_id: 
usize::MAX, + count: -1, + ..ctx + }; + while count < max_count { - let sub_ctx = MatchContext { - toplevel: false, - jump: Jump::OpCode, - repeat_ctx_id: usize::MAX, - count: -1, - ..ctx - }; if !_match(req, state, sub_ctx) { break; } count += 1; + sub_ctx.skip_char(req, 1); + // ctx.string_position = state.string_position; + // ctx.string_offset = req.string.offset(0, state.string_position); } return count; } diff --git a/tests/tests.rs b/tests/tests.rs index 4c56c42fae..efeb2d2838 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -162,4 +162,15 @@ fn test_bug_20998() { req.match_all = true; assert!(state.pymatch(&req)); assert_eq!(state.string_position, 3); -} \ No newline at end of file +} + +#[test] +fn test_bigcharset() { + // pattern p = re.compile('[a-z]*', re.I) + // START GENERATED by generate_tests.py + #[rustfmt::skip] let p = Pattern { code: &[14, 4, 0, 0, 4294967295, 24, 97, 0, 4294967295, 39, 92, 10, 3, 33685760, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 33686018, 0, 0, 0, 134217726, 0, 0, 0, 0, 0, 131072, 0, 2147483648, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1] }; + // END GENERATED + let (req, mut state) = p.state("x "); + assert!(state.pymatch(&req)); + assert_eq!(state.string_position, 1); +} From 17e1152de63cd9b70e4ad3b061979baa7ee54a35 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 3 Jan 2024 17:17:27 +0200 Subject: [PATCH 087/705] fix assert not mark --- 
src/engine.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/engine.rs b/src/engine.rs index 86b8d20d8a..964775fd8f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -296,6 +296,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - if popped_result { break 'result false; } + state.marks.pop(); ctx.skip_code_from(req, 1); } Jump::Branch1 => { @@ -612,6 +613,7 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - ctx.skip_code_from(req, 1); continue; } + state.marks.push(); let mut next_ctx = ctx.next_offset(3, Jump::AssertNot1); next_ctx.toplevel = false; From 2fe129252fd0d798dc0cdac78737bd641825d48b Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Thu, 4 Jan 2024 07:51:35 +0200 Subject: [PATCH 088/705] improve ctx in _match lazy create stack vec --- src/constants.rs | 4 ++-- src/engine.rs | 31 ++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/constants.rs b/src/constants.rs index dc61c33b2c..9fe792ce17 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -14,7 +14,7 @@ use bitflags::bitflags; pub const SRE_MAGIC: usize = 20221023; -#[derive(num_enum::TryFromPrimitive, Debug)] +#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)] #[repr(u32)] #[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreOpcode { @@ -62,7 +62,7 @@ pub enum SreOpcode { NOT_LITERAL_UNI_IGNORE = 41, RANGE_UNI_IGNORE = 42, } -#[derive(num_enum::TryFromPrimitive, Debug)] +#[derive(num_enum::TryFromPrimitive, Debug, PartialEq, Eq)] #[repr(u32)] #[allow(non_camel_case_types, clippy::upper_case_acronyms)] pub enum SreAtCode { diff --git a/src/engine.rs b/src/engine.rs index 964775fd8f..c23ffb7a2f 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -207,6 +207,15 @@ impl State { return true; } + if ctx.try_peek_code_as::(&req, 1).unwrap() == SreOpcode::AT + && (ctx.try_peek_code_as::(&req, 2).unwrap() == SreAtCode::BEGINNING + || ctx.try_peek_code_as::(&req, 2).unwrap() + == 
SreAtCode::BEGINNING_STRING) + { + self.reset(req.end); + return false; + } + req.must_advance = false; ctx.toplevel = false; while req.start < end { @@ -272,15 +281,11 @@ enum Jump { PossessiveRepeat4, } -fn _match(req: &Request, state: &mut State, ctx: MatchContext) -> bool { - let mut context_stack = vec![ctx]; +fn _match(req: &Request, state: &mut State, mut ctx: MatchContext) -> bool { + let mut context_stack = vec![]; let mut popped_result = false; 'coro: loop { - let Some(mut ctx) = context_stack.pop() else { - break; - }; - popped_result = 'result: loop { let yielded = 'context: loop { match ctx.jump { @@ -864,9 +869,14 @@ fn _match(req: &Request, state: &mut State, ctx: MatchContext) - } }; context_stack.push(ctx); - context_stack.push(yielded); + ctx = yielded; continue 'coro; }; + if let Some(popped_ctx) = context_stack.pop() { + ctx = popped_ctx; + } else { + break; + } } popped_result } @@ -1148,6 +1158,13 @@ impl MatchContext { req.pattern_codes[self.code_position + peek] } + fn try_peek_code_as(&self, req: &Request, peek: usize) -> Result + where + T: TryFrom, + { + self.peek_code(req, peek).try_into() + } + fn skip_code(&mut self, skip: usize) { self.code_position += skip; } From f9b2d10c710d45ebd0cc9788294cd5806cc9e8ac Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Fri, 5 Jan 2024 07:23:50 +0200 Subject: [PATCH 089/705] improve search at_beginning --- src/engine.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index c23ffb7a2f..dbb58025fe 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -201,15 +201,17 @@ impl State { return search_info_charset(&mut req, self, ctx); } // fallback to general search + // skip OP INFO + ctx.skip_code_from(&req, 1); } if _match(&req, self, ctx) { return true; } - if ctx.try_peek_code_as::(&req, 1).unwrap() == SreOpcode::AT - && (ctx.try_peek_code_as::(&req, 2).unwrap() == SreAtCode::BEGINNING - || ctx.try_peek_code_as::(&req, 2).unwrap() + if 
ctx.try_peek_code_as::(&req, 0).unwrap() == SreOpcode::AT + && (ctx.try_peek_code_as::(&req, 1).unwrap() == SreAtCode::BEGINNING + || ctx.try_peek_code_as::(&req, 1).unwrap() == SreAtCode::BEGINNING_STRING) { self.reset(req.end); From 118a00c012810900fe89277cebe6a5f09ff286d1 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 7 Jan 2024 08:42:18 +0200 Subject: [PATCH 090/705] refactor _count general case --- src/engine.rs | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index dbb58025fe..ca44f4994a 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1446,26 +1446,20 @@ fn _count( } _ => { /* General case */ - let mut count = 0; - - let mut sub_ctx = MatchContext { - toplevel: false, - jump: Jump::OpCode, - repeat_ctx_id: usize::MAX, - count: -1, - ..ctx + ctx.toplevel = false; + ctx.jump = Jump::OpCode; + ctx.repeat_ctx_id = usize::MAX; + ctx.count = -1; + + let mut sub_state = State { + marks: Marks::default(), + repeat_stack: vec![], + ..*state }; - while count < max_count { - if !_match(req, state, sub_ctx) { - break; - } - count += 1; - sub_ctx.skip_char(req, 1); - // ctx.string_position = state.string_position; - // ctx.string_offset = req.string.offset(0, state.string_position); + while ctx.string_position < end && _match(req, &mut sub_state, ctx) { + ctx.skip_char(req, 1); } - return count; } } From c93ea30b3b5849edc3ac888a1bfec69f08b537c1 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 13 Jan 2024 16:03:38 +0200 Subject: [PATCH 091/705] improve use StringCursor replace index based position --- Cargo.toml | 2 +- benches/benches.rs | 12 +- src/engine.rs | 511 ++++++++++++++------------------------------- src/lib.rs | 5 + src/string.rs | 381 +++++++++++++++++++++++++++++++++ tests/tests.rs | 31 +-- 6 files changed, 562 insertions(+), 380 deletions(-) create mode 100644 src/string.rs diff --git a/Cargo.toml b/Cargo.toml index b0ec8eab2d..e54f124ac0 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sre-engine" -version = "0.5.0" +version = "0.6.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" repository = "https://github.com/RustPython/sre-engine" diff --git a/benches/benches.rs b/benches/benches.rs index f70138f920..e89adab0dd 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -3,24 +3,24 @@ extern crate test; use test::Bencher; -use sre_engine::engine; +use sre_engine::{Request, State, StrDrive}; struct Pattern { code: &'static [u32], } impl Pattern { - fn state<'a, S: engine::StrDrive>(&self, string: S) -> (engine::Request<'a, S>, engine::State) { + fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) { self.state_range(string, 0..usize::MAX) } - fn state_range<'a, S: engine::StrDrive>( + fn state_range<'a, S: StrDrive>( &self, string: S, range: std::ops::Range, - ) -> (engine::Request<'a, S>, engine::State) { - let req = engine::Request::new(string, range.start, range.end, self.code, false); - let state = engine::State::default(); + ) -> (Request<'a, S>, State) { + let req = Request::new(string, range.start, range.end, self.code, false); + let state = State::default(); (req, state) } } diff --git a/src/engine.rs b/src/engine.rs index ca44f4994a..97489633d8 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -1,14 +1,14 @@ // good luck to those that follow; here be dragons -use super::constants::{SreAtCode, SreCatCode, SreInfo, SreOpcode}; -use super::MAXREPEAT; +use crate::string::{ + is_digit, is_linebreak, is_loc_word, is_space, is_uni_digit, is_uni_linebreak, is_uni_space, + is_uni_word, is_word, lower_ascii, lower_locate, lower_unicode, upper_locate, upper_unicode, +}; + +use super::{SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor, MAXREPEAT}; use optional::Optioned; use std::convert::TryFrom; -const fn is_py_ascii_whitespace(b: u8) -> bool { - matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' 
' | b'\x0B') -} - #[derive(Debug, Clone, Copy)] pub struct Request<'a, S> { pub string: S, @@ -117,25 +117,29 @@ impl Marks { pub struct State { pub start: usize, pub marks: Marks, - pub string_position: usize, + pub cursor: StringCursor, repeat_stack: Vec, } impl State { - pub fn reset(&mut self, start: usize) { + pub fn reset(&mut self, req: &Request, start: usize) { self.marks.clear(); self.repeat_stack.clear(); self.start = start; - self.string_position = start; + if self.cursor.ptr.is_null() || self.cursor.position > self.start { + self.cursor = req.string.create_cursor(self.start); + } else if self.cursor.position < self.start { + let skip = self.start - self.cursor.position; + S::skip(&mut self.cursor, skip); + } } pub fn pymatch(&mut self, req: &Request) -> bool { self.start = req.start; - self.string_position = req.start; + self.cursor = req.string.create_cursor(self.start); let ctx = MatchContext { - string_position: req.start, - string_offset: req.string.offset(0, req.start), + cursor: self.cursor, code_position: 0, toplevel: true, jump: Jump::OpCode, @@ -147,7 +151,7 @@ impl State { pub fn search(&mut self, mut req: Request) -> bool { self.start = req.start; - self.string_position = req.start; + self.cursor = req.string.create_cursor(self.start); if req.start > req.end { return false; @@ -155,11 +159,8 @@ impl State { let mut end = req.end; - let mut start_offset = req.string.offset(0, req.start); - let mut ctx = MatchContext { - string_position: req.start, - string_offset: start_offset, + cursor: self.cursor, code_position: 0, toplevel: true, jump: Jump::OpCode, @@ -183,9 +184,9 @@ impl State { end -= min - 1; // adjust ctx position - if end < ctx.string_position { - ctx.string_position = end; - ctx.string_offset = req.string.offset(0, ctx.string_position); + if end < ctx.cursor.position { + let skip = end - self.cursor.position; + S::skip(&mut self.cursor, skip); } } @@ -214,7 +215,7 @@ impl State { || ctx.try_peek_code_as::(&req, 1).unwrap() == 
SreAtCode::BEGINNING_STRING) { - self.reset(req.end); + self.reset(&req, req.end); return false; } @@ -222,10 +223,8 @@ impl State { ctx.toplevel = false; while req.start < end { req.start += 1; - start_offset = req.string.offset(start_offset, 1); - self.reset(req.start); - ctx.string_position = req.start; - ctx.string_offset = start_offset; + self.reset(&req, req.start); + ctx.cursor = self.cursor; if _match(&req, self, ctx) { return true; @@ -248,13 +247,13 @@ impl<'a, S: StrDrive> Iterator for SearchIter<'a, S> { return None; } - self.state.reset(self.req.start); + self.state.reset(&self.req, self.req.start); if !self.state.search(self.req) { return None; } - self.req.must_advance = self.state.string_position == self.state.start; - self.req.start = self.state.string_position; + self.req.must_advance = self.state.cursor.position == self.state.start; + self.req.start = self.state.cursor.position; Some(()) } @@ -313,7 +312,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex state.marks.pop_discard(); break 'result false; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let next_ctx = ctx.next_offset(branch_offset + 1, Jump::Branch2); ctx.count += next_length; break 'context next_ctx; @@ -333,7 +332,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex Jump::UntilBacktrace => { if !popped_result { state.repeat_stack[ctx.repeat_ctx_id].count -= 1; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; } break 'result popped_result; } @@ -349,7 +348,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex state.marks.pop(); repeat_ctx.count -= 1; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; /* cannot match more repeated items here. 
make sure the tail matches */ @@ -359,7 +358,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } Jump::MaxUntil3 => { if !popped_result { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; } break 'result popped_result; } @@ -369,20 +368,20 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } ctx.repeat_ctx_id = ctx.count as usize; let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; state.marks.pop(); // match more until tail matches if repeat_ctx.count as usize >= repeat_ctx.max_count && repeat_ctx.max_count != MAXREPEAT - || state.string_position == repeat_ctx.last_position + || state.cursor.position == repeat_ctx.last_position { repeat_ctx.count -= 1; break 'result false; } /* zero-width match protection */ - repeat_ctx.last_position = state.string_position; + repeat_ctx.last_position = state.cursor.position; break 'context ctx .next_at(repeat_ctx.code_position + 4, Jump::UntilBacktrace); @@ -394,17 +393,17 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex // Special case: Tail starts with a literal. Skip positions where // the rest of the pattern cannot possibly match. 
let c = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 2); - while ctx.at_end(req) || ctx.peek_char(req) != c { + while ctx.at_end(req) || ctx.peek_char::() != c { if ctx.count <= min_count { state.marks.pop_discard(); break 'result false; } - ctx.back_skip_char(req, 1); + ctx.back_advance_char::(); ctx.count -= 1; } } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; // General case: backtracking break 'context ctx.next_peek_from(1, req, Jump::RepeatOne2); } @@ -419,7 +418,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } - ctx.back_skip_char(req, 1); + ctx.back_advance_char::(); ctx.count -= 1; state.marks.pop_keep(); @@ -429,7 +428,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex Jump::MinRepeatOne1 => { let max_count = ctx.peek_code(req, 3) as usize; if max_count == MAXREPEAT || ctx.count as usize <= max_count { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'context ctx.next_peek_from(1, req, Jump::MinRepeatOne2); } else { state.marks.pop_discard(); @@ -441,7 +440,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result true; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let mut count_ctx = ctx; count_ctx.skip_code(4); @@ -450,7 +449,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } - ctx.skip_char(req, 1); + ctx.advance_char::(); ctx.count += 1; state.marks.pop_keep(); ctx.jump = Jump::MinRepeatOne1; @@ -459,11 +458,10 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex Jump::AtomicGroup1 => { if popped_result { ctx.skip_code_from(req, 1); - ctx.string_position = state.string_position; - ctx.string_offset = req.string.offset(0, state.string_position); + ctx.cursor = state.cursor; // dispatch opcode } else { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result false; } } @@ -473,7 
+471,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'context ctx.next_offset(4, Jump::PossessiveRepeat2); } // zero match protection - ctx.string_position = usize::MAX; + ctx.cursor.position = usize::MAX; ctx.jump = Jump::PossessiveRepeat3; continue 'context; } @@ -483,22 +481,20 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex ctx.jump = Jump::PossessiveRepeat1; continue 'context; } else { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result false; } } Jump::PossessiveRepeat3 => { let max_count = ctx.peek_code(req, 3) as usize; if ((ctx.count as usize) < max_count || max_count == MAXREPEAT) - && ctx.string_position != state.string_position + && ctx.cursor.position != state.cursor.position { state.marks.push(); - ctx.string_position = state.string_position; - ctx.string_offset = req.string.offset(0, state.string_position); + ctx.cursor = state.cursor; break 'context ctx.next_offset(4, Jump::PossessiveRepeat4); } - ctx.string_position = state.string_position; - ctx.string_offset = req.string.offset(0, state.string_position); + ctx.cursor = state.cursor; ctx.skip_code_from(req, 1); ctx.skip_code(1); } @@ -510,7 +506,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex continue 'context; } state.marks.pop(); - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; ctx.skip_code_from(req, 1); ctx.skip_code(1); } @@ -520,21 +516,22 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex loop { macro_rules! general_op_literal { ($f:expr) => {{ - if ctx.at_end(req) || !$f(ctx.peek_code(req, 1), ctx.peek_char(req)) { + if ctx.at_end(req) || !$f(ctx.peek_code(req, 1), ctx.peek_char::()) { break 'result false; } ctx.skip_code(2); - ctx.skip_char(req, 1); + ctx.advance_char::(); }}; } macro_rules! 
general_op_in { ($f:expr) => {{ - if ctx.at_end(req) || !$f(&ctx.pattern(req)[2..], ctx.peek_char(req)) { + if ctx.at_end(req) || !$f(&ctx.pattern(req)[2..], ctx.peek_char::()) + { break 'result false; } ctx.skip_code_from(req, 1); - ctx.skip_char(req, 1); + ctx.advance_char::(); }}; } @@ -552,19 +549,18 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex }; let mut gctx = MatchContext { - string_position: group_start, - string_offset: req.string.offset(0, group_start), + cursor: req.string.create_cursor(group_start), ..ctx }; for _ in group_start..group_end { if ctx.at_end(req) - || $f(ctx.peek_char(req)) != $f(gctx.peek_char(req)) + || $f(ctx.peek_char::()) != $f(gctx.peek_char::()) { break 'result false; } - ctx.skip_char(req, 1); - gctx.skip_char(req, 1); + ctx.advance_char::(); + gctx.advance_char::(); } ctx.skip_code(2); @@ -581,7 +577,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex SreOpcode::FAILURE => break 'result false, SreOpcode::SUCCESS => { if ctx.can_success(req) { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result true; } break 'result false; @@ -591,32 +587,32 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } ctx.skip_code(1); - ctx.skip_char(req, 1); + ctx.advance_char::(); } SreOpcode::ANY_ALL => { if ctx.at_end(req) { break 'result false; } ctx.skip_code(1); - ctx.skip_char(req, 1); + ctx.advance_char::(); } /* */ SreOpcode::ASSERT => { let back = ctx.peek_code(req, 2) as usize; - if ctx.string_position < back { + if ctx.cursor.position < back { break 'result false; } let mut next_ctx = ctx.next_offset(3, Jump::Assert1); next_ctx.toplevel = false; - next_ctx.back_skip_char(req, back); - state.string_position = next_ctx.string_position; + next_ctx.back_skip_char::(back); + state.cursor = next_ctx.cursor; break 'context next_ctx; } /* */ SreOpcode::ASSERT_NOT => { let back = ctx.peek_code(req, 2) as usize; - if ctx.string_position 
< back { + if ctx.cursor.position < back { ctx.skip_code_from(req, 1); continue; } @@ -624,8 +620,8 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex let mut next_ctx = ctx.next_offset(3, Jump::AssertNot1); next_ctx.toplevel = false; - next_ctx.back_skip_char(req, back); - state.string_position = next_ctx.string_position; + next_ctx.back_skip_char::(back); + state.cursor = next_ctx.cursor; break 'context next_ctx; } SreOpcode::AT => { @@ -645,11 +641,11 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } SreOpcode::CATEGORY => { let catcode = SreCatCode::try_from(ctx.peek_code(req, 1)).unwrap(); - if ctx.at_end(req) || !category(catcode, ctx.peek_char(req)) { + if ctx.at_end(req) || !category(catcode, ctx.peek_char::()) { break 'result false; } ctx.skip_code(2); - ctx.skip_char(req, 1); + ctx.advance_char::(); } SreOpcode::IN => general_op_in!(charset), SreOpcode::IN_IGNORE => { @@ -662,7 +658,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex SreOpcode::MARK => { state .marks - .set(ctx.peek_code(req, 1) as usize, ctx.string_position); + .set(ctx.peek_code(req, 1) as usize, ctx.cursor.position); ctx.skip_code(2); } SreOpcode::INFO | SreOpcode::JUMP => ctx.skip_code_from(req, 1), @@ -678,14 +674,14 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex }; state.repeat_stack.push(repeat_ctx); let repeat_ctx_id = state.repeat_stack.len() - 1; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let mut next_ctx = ctx.next_peek_from(1, req, Jump::Repeat1); next_ctx.repeat_ctx_id = repeat_ctx_id; break 'context next_ctx; } SreOpcode::MAX_UNTIL => { let repeat_ctx = &mut state.repeat_stack[ctx.repeat_ctx_id]; - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; repeat_ctx.count += 1; if (repeat_ctx.count as usize) < repeat_ctx.min_count { @@ -696,13 +692,13 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex if ((repeat_ctx.count as usize) < 
repeat_ctx.max_count || repeat_ctx.max_count == MAXREPEAT) - && state.string_position != repeat_ctx.last_position + && state.cursor.position != repeat_ctx.last_position { /* we may have enough matches, but if we can match another item, do so */ state.marks.push(); ctx.count = repeat_ctx.last_position as isize; - repeat_ctx.last_position = state.string_position; + repeat_ctx.last_position = state.cursor.position; break 'context ctx .next_at(repeat_ctx.code_position + 4, Jump::MaxUntil2); @@ -716,7 +712,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } SreOpcode::MIN_UNTIL => { let repeat_ctx = state.repeat_stack.last_mut().unwrap(); - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; repeat_ctx.count += 1; if (repeat_ctx.count as usize) < repeat_ctx.min_count { @@ -740,12 +736,12 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let mut next_ctx = ctx; next_ctx.skip_code(4); let count = _count(req, state, next_ctx, max_count); - ctx.skip_char(req, count); + ctx.skip_char::(count); if count < min_count { break 'result false; } @@ -753,7 +749,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. 
we're finished - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result true; } @@ -769,7 +765,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex break 'result false; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; ctx.count = if min_count == 0 { 0 } else { @@ -779,14 +775,14 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex if count < min_count { break 'result false; } - ctx.skip_char(req, count); + ctx.skip_char::(count); count as isize }; let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { // tail is empty. we're finished - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'result true; } @@ -830,13 +826,13 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } /* pattern tail */ SreOpcode::ATOMIC_GROUP => { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; break 'context ctx.next_offset(2, Jump::AtomicGroup1); } /* <1=min> <2=max> pattern tail */ SreOpcode::POSSESSIVE_REPEAT => { - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; ctx.count = 0; ctx.jump = Jump::PossessiveRepeat1; continue 'context; @@ -849,14 +845,14 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex if ctx.remaining_chars(req) < min_count { break 'result false; } - state.string_position = ctx.string_position; + state.cursor = ctx.cursor; let mut count_ctx = ctx; count_ctx.skip_code(4); let count = _count(req, state, count_ctx, max_count); if count < min_count { break 'result false; } - ctx.skip_char(req, count); + ctx.skip_char::(count); ctx.skip_code_from(req, 1); } SreOpcode::CHARSET @@ -907,16 +903,17 @@ fn search_info_literal( while !ctx.at_end(req) { // find the next matched literal - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); + while ctx.peek_char::() != c { + 
ctx.advance_char::(); if ctx.at_end(req) { return false; } } - req.start = ctx.string_position; - state.start = ctx.string_position; - state.string_position = ctx.string_position + skip; + req.start = ctx.cursor.position; + state.start = req.start; + state.cursor = ctx.cursor; + S::skip(&mut state.cursor, skip); // literal only if LITERAL { @@ -924,44 +921,46 @@ fn search_info_literal( } let mut next_ctx = ctx; - next_ctx.skip_char(req, skip); + next_ctx.skip_char::(skip); if _match(req, state, next_ctx) { return true; } - ctx.skip_char(req, 1); + ctx.advance_char::(); state.marks.clear(); } } else { while !ctx.at_end(req) { let c = prefix[0]; - while ctx.peek_char(req) != c { - ctx.skip_char(req, 1); + while ctx.peek_char::() != c { + ctx.advance_char::(); if ctx.at_end(req) { return false; } } - ctx.skip_char(req, 1); + ctx.advance_char::(); if ctx.at_end(req) { return false; } let mut i = 1; loop { - if ctx.peek_char(req) == prefix[i] { + if ctx.peek_char::() == prefix[i] { i += 1; if i != len { - ctx.skip_char(req, 1); + ctx.advance_char::(); if ctx.at_end(req) { return false; } continue; } - req.start = ctx.string_position - (len - 1); - state.start = req.start; - state.string_position = state.start + skip; + req.start = ctx.cursor.position - (len - 1); + state.reset(req, req.start); + S::skip(&mut state.cursor, skip); + // state.start = req.start; + // state.cursor = req.string.create_cursor(req.start + skip); // literal only if LITERAL { @@ -970,17 +969,16 @@ fn search_info_literal( let mut next_ctx = ctx; if skip != 0 { - next_ctx.skip_char(req, 1); + next_ctx.advance_char::(); } else { - next_ctx.string_position = state.string_position; - next_ctx.string_offset = req.string.offset(0, state.string_position); + next_ctx.cursor = state.cursor; } if _match(req, state, next_ctx) { return true; } - ctx.skip_char(req, 1); + ctx.advance_char::(); if ctx.at_end(req) { return false; } @@ -1009,22 +1007,22 @@ fn search_info_charset( req.must_advance = false; loop { - 
while !ctx.at_end(req) && !charset(set, ctx.peek_char(req)) { - ctx.skip_char(req, 1); + while !ctx.at_end(req) && !charset(set, ctx.peek_char::()) { + ctx.advance_char::(); } if ctx.at_end(req) { return false; } - req.start = ctx.string_position; - state.start = ctx.string_position; - state.string_position = ctx.string_position; + req.start = ctx.cursor.position; + state.start = ctx.cursor.position; + state.cursor = ctx.cursor; if _match(req, state, ctx) { return true; } - ctx.skip_char(req, 1); + ctx.advance_char::(); state.marks.clear(); } } @@ -1039,85 +1037,9 @@ struct RepeatContext { prev_id: usize, } -pub trait StrDrive: Copy { - fn offset(&self, offset: usize, skip: usize) -> usize; - fn count(&self) -> usize; - fn peek(&self, offset: usize) -> u32; - fn back_peek(&self, offset: usize) -> u32; - fn back_offset(&self, offset: usize, skip: usize) -> usize; -} - -impl StrDrive for &str { - fn offset(&self, offset: usize, skip: usize) -> usize { - self.get(offset..) - .and_then(|s| s.char_indices().nth(skip).map(|x| x.0 + offset)) - .unwrap_or(self.len()) - } - - fn count(&self) -> usize { - self.chars().count() - } - - fn peek(&self, offset: usize) -> u32 { - unsafe { self.get_unchecked(offset..) 
} - .chars() - .next() - .unwrap() as u32 - } - - fn back_peek(&self, offset: usize) -> u32 { - let bytes = self.as_bytes(); - let back_offset = utf8_back_peek_offset(bytes, offset); - match offset - back_offset { - 1 => u32::from_be_bytes([0, 0, 0, bytes[offset - 1]]), - 2 => u32::from_be_bytes([0, 0, bytes[offset - 2], bytes[offset - 1]]), - 3 => u32::from_be_bytes([0, bytes[offset - 3], bytes[offset - 2], bytes[offset - 1]]), - 4 => u32::from_be_bytes([ - bytes[offset - 4], - bytes[offset - 3], - bytes[offset - 2], - bytes[offset - 1], - ]), - _ => unreachable!(), - } - } - - fn back_offset(&self, offset: usize, skip: usize) -> usize { - let bytes = self.as_bytes(); - let mut back_offset = offset; - for _ in 0..skip { - back_offset = utf8_back_peek_offset(bytes, back_offset); - } - back_offset - } -} - -impl<'a> StrDrive for &'a [u8] { - fn offset(&self, offset: usize, skip: usize) -> usize { - offset + skip - } - - fn count(&self) -> usize { - self.len() - } - - fn peek(&self, offset: usize) -> u32 { - self[offset] as u32 - } - - fn back_peek(&self, offset: usize) -> u32 { - self[offset - 1] as u32 - } - - fn back_offset(&self, offset: usize, skip: usize) -> usize { - offset - skip - } -} - #[derive(Clone, Copy)] struct MatchContext { - string_position: usize, - string_offset: usize, + cursor: StringCursor, code_position: usize, toplevel: bool, jump: Jump, @@ -1135,25 +1057,31 @@ impl MatchContext { } fn remaining_chars(&self, req: &Request) -> usize { - req.end - self.string_position + req.end - self.cursor.position + } + + fn peek_char(&self) -> u32 { + S::peek(&self.cursor) + } + + fn skip_char(&mut self, skip: usize) { + S::skip(&mut self.cursor, skip); } - fn peek_char(&self, req: &Request) -> u32 { - req.string.peek(self.string_offset) + fn advance_char(&mut self) -> u32 { + S::advance(&mut self.cursor) } - fn skip_char(&mut self, req: &Request, skip: usize) { - self.string_position += skip; - self.string_offset = req.string.offset(self.string_offset, 
skip); + fn back_peek_char(&self) -> u32 { + S::back_peek(&self.cursor) } - fn back_peek_char(&self, req: &Request) -> u32 { - req.string.back_peek(self.string_offset) + fn back_skip_char(&mut self, skip: usize) { + S::back_skip(&mut self.cursor, skip); } - fn back_skip_char(&mut self, req: &Request, skip: usize) { - self.string_position -= skip; - self.string_offset = req.string.back_offset(self.string_offset, skip); + fn back_advance_char(&mut self) -> u32 { + S::back_advance(&mut self.cursor) } fn peek_code(&self, req: &Request, peek: usize) -> u32 { @@ -1177,15 +1105,15 @@ impl MatchContext { fn at_beginning(&self) -> bool { // self.ctx().string_position == self.state().start - self.string_position == 0 + self.cursor.position == 0 } fn at_end(&self, req: &Request) -> bool { - self.string_position == req.end + self.cursor.position == req.end } fn at_linebreak(&self, req: &Request) -> bool { - !self.at_end(req) && is_linebreak(self.peek_char(req)) + !self.at_end(req) && is_linebreak(self.peek_char::()) } fn at_boundary bool>( @@ -1196,8 +1124,8 @@ impl MatchContext { if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); - let this = !self.at_end(req) && word_checker(self.peek_char(req)); + let that = !self.at_beginning() && word_checker(self.back_peek_char::()); + let this = !self.at_end(req) && word_checker(self.peek_char::()); this != that } @@ -1209,8 +1137,8 @@ impl MatchContext { if self.at_beginning() && self.at_end(req) { return false; } - let that = !self.at_beginning() && word_checker(self.back_peek_char(req)); - let this = !self.at_end(req) && word_checker(self.peek_char(req)); + let that = !self.at_beginning() && word_checker(self.back_peek_char::()); + let this = !self.at_end(req) && word_checker(self.peek_char::()); this == that } @@ -1221,7 +1149,7 @@ impl MatchContext { if req.match_all && !self.at_end(req) { return false; } - if req.must_advance && 
self.string_position == req.start { + if req.must_advance && self.cursor.position == req.start { return false; } true @@ -1252,7 +1180,7 @@ impl MatchContext { fn at(req: &Request, ctx: &MatchContext, atcode: SreAtCode) -> bool { match atcode { SreAtCode::BEGINNING | SreAtCode::BEGINNING_STRING => ctx.at_beginning(), - SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char(req)), + SreAtCode::BEGINNING_LINE => ctx.at_beginning() || is_linebreak(ctx.back_peek_char::()), SreAtCode::BOUNDARY => ctx.at_boundary(req, is_word), SreAtCode::NON_BOUNDARY => ctx.at_non_boundary(req, is_word), SreAtCode::END => { @@ -1403,21 +1331,22 @@ fn _count( max_count: usize, ) -> usize { let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); - let end = ctx.string_position + max_count; + let end = ctx.cursor.position + max_count; let opcode = SreOpcode::try_from(ctx.peek_code(req, 0)).unwrap(); match opcode { SreOpcode::ANY => { - while ctx.string_position < end && !ctx.at_linebreak(req) { - ctx.skip_char(req, 1); + while ctx.cursor.position < end && !ctx.at_linebreak(req) { + ctx.advance_char::(); } } SreOpcode::ANY_ALL => { - ctx.skip_char(req, max_count); + ctx.skip_char::(max_count); } SreOpcode::IN => { - while ctx.string_position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char(req)) { - ctx.skip_char(req, 1); + while ctx.cursor.position < end && charset(&ctx.pattern(req)[2..], ctx.peek_char::()) + { + ctx.advance_char::(); } } SreOpcode::LITERAL => { @@ -1457,14 +1386,14 @@ fn _count( ..*state }; - while ctx.string_position < end && _match(req, &mut sub_state, ctx) { - ctx.skip_char(req, 1); + while ctx.cursor.position < end && _match(req, &mut sub_state, ctx) { + ctx.advance_char::(); } } } // TODO: return offset - ctx.string_position - state.string_position + ctx.cursor.position - state.cursor.position } fn general_count_literal bool>( @@ -1474,145 +1403,7 @@ fn general_count_literal bool>( mut f: F, ) { let ch = 
ctx.peek_code(req, 1); - while ctx.string_position < end && f(ch, ctx.peek_char(req)) { - ctx.skip_char(req, 1); - } -} - -fn is_word(ch: u32) -> bool { - ch == '_' as u32 - || u8::try_from(ch) - .map(|x| x.is_ascii_alphanumeric()) - .unwrap_or(false) -} -fn is_space(ch: u32) -> bool { - u8::try_from(ch) - .map(is_py_ascii_whitespace) - .unwrap_or(false) -} -fn is_digit(ch: u32) -> bool { - u8::try_from(ch) - .map(|x| x.is_ascii_digit()) - .unwrap_or(false) -} -fn is_loc_alnum(ch: u32) -> bool { - // FIXME: Ignore the locales - u8::try_from(ch) - .map(|x| x.is_ascii_alphanumeric()) - .unwrap_or(false) -} -fn is_loc_word(ch: u32) -> bool { - ch == '_' as u32 || is_loc_alnum(ch) -} -fn is_linebreak(ch: u32) -> bool { - ch == '\n' as u32 -} -pub fn lower_ascii(ch: u32) -> u32 { - u8::try_from(ch) - .map(|x| x.to_ascii_lowercase() as u32) - .unwrap_or(ch) -} -fn lower_locate(ch: u32) -> u32 { - // FIXME: Ignore the locales - lower_ascii(ch) -} -fn upper_locate(ch: u32) -> u32 { - // FIXME: Ignore the locales - u8::try_from(ch) - .map(|x| x.to_ascii_uppercase() as u32) - .unwrap_or(ch) -} -fn is_uni_digit(ch: u32) -> bool { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.is_ascii_digit()) - .unwrap_or(false) -} -fn is_uni_space(ch: u32) -> bool { - // TODO: check with cpython - is_space(ch) - || matches!( - ch, - 0x0009 - | 0x000A - | 0x000B - | 0x000C - | 0x000D - | 0x001C - | 0x001D - | 0x001E - | 0x001F - | 0x0020 - | 0x0085 - | 0x00A0 - | 0x1680 - | 0x2000 - | 0x2001 - | 0x2002 - | 0x2003 - | 0x2004 - | 0x2005 - | 0x2006 - | 0x2007 - | 0x2008 - | 0x2009 - | 0x200A - | 0x2028 - | 0x2029 - | 0x202F - | 0x205F - | 0x3000 - ) -} -fn is_uni_linebreak(ch: u32) -> bool { - matches!( - ch, - 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 - ) -} -fn is_uni_alnum(ch: u32) -> bool { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.is_alphanumeric()) - .unwrap_or(false) -} -fn is_uni_word(ch: u32) -> bool { 
- ch == '_' as u32 || is_uni_alnum(ch) -} -pub fn lower_unicode(ch: u32) -> u32 { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.to_lowercase().next().unwrap() as u32) - .unwrap_or(ch) -} -pub fn upper_unicode(ch: u32) -> u32 { - // TODO: check with cpython - char::try_from(ch) - .map(|x| x.to_uppercase().next().unwrap() as u32) - .unwrap_or(ch) -} - -fn is_utf8_first_byte(b: u8) -> bool { - // In UTF-8, there are three kinds of byte... - // 0xxxxxxx : ASCII - // 10xxxxxx : 2nd, 3rd or 4th byte of code - // 11xxxxxx : 1st byte of multibyte code - (b & 0b10000000 == 0) || (b & 0b11000000 == 0b11000000) -} - -fn utf8_back_peek_offset(bytes: &[u8], offset: usize) -> usize { - let mut offset = offset - 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - offset -= 1; - if !is_utf8_first_byte(bytes[offset]) { - panic!("not utf-8 code point"); - } - } - } + while ctx.cursor.position < end && f(ch, ctx.peek_char::()) { + ctx.advance_char::(); } - offset } diff --git a/src/lib.rs b/src/lib.rs index c23e807501..fd9f367dc6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,10 @@ pub mod constants; pub mod engine; +pub mod string; + +pub use constants::{SreAtCode, SreCatCode, SreFlag, SreInfo, SreOpcode, SRE_MAGIC}; +pub use engine::{Request, SearchIter, State}; +pub use string::{StrDrive, StringCursor}; pub const CODESIZE: usize = 4; diff --git a/src/string.rs b/src/string.rs new file mode 100644 index 0000000000..464901af83 --- /dev/null +++ b/src/string.rs @@ -0,0 +1,381 @@ +#[derive(Debug, Clone, Copy)] +pub struct StringCursor { + pub(crate) ptr: *const u8, + pub position: usize, +} + +impl Default for StringCursor { + fn default() -> Self { + Self { + ptr: std::ptr::null(), + position: 0, + } + } +} + +pub trait StrDrive: Copy { + fn count(&self) -> usize; + fn create_cursor(&self, n: usize) -> StringCursor; + fn advance(cursor: &mut 
StringCursor) -> u32; + fn peek(cursor: &StringCursor) -> u32; + fn skip(cursor: &mut StringCursor, n: usize); + fn back_advance(cursor: &mut StringCursor) -> u32; + fn back_peek(cursor: &StringCursor) -> u32; + fn back_skip(cursor: &mut StringCursor, n: usize); +} + +impl<'a> StrDrive for &'a [u8] { + #[inline] + fn count(&self) -> usize { + self.len() + } + + #[inline] + fn create_cursor(&self, n: usize) -> StringCursor { + StringCursor { + ptr: self[n..].as_ptr(), + position: n, + } + } + + #[inline] + fn advance(cursor: &mut StringCursor) -> u32 { + cursor.position += 1; + unsafe { cursor.ptr = cursor.ptr.add(1) }; + unsafe { *cursor.ptr as u32 } + } + + #[inline] + fn peek(cursor: &StringCursor) -> u32 { + unsafe { *cursor.ptr as u32 } + } + + #[inline] + fn skip(cursor: &mut StringCursor, n: usize) { + cursor.position += n; + unsafe { cursor.ptr = cursor.ptr.add(n) }; + } + + #[inline] + fn back_advance(cursor: &mut StringCursor) -> u32 { + cursor.position -= 1; + unsafe { cursor.ptr = cursor.ptr.sub(1) }; + unsafe { *cursor.ptr as u32 } + } + + #[inline] + fn back_peek(cursor: &StringCursor) -> u32 { + unsafe { *cursor.ptr.offset(-1) as u32 } + } + + #[inline] + fn back_skip(cursor: &mut StringCursor, n: usize) { + cursor.position -= n; + unsafe { cursor.ptr = cursor.ptr.sub(n) }; + } +} + +impl StrDrive for &str { + #[inline] + fn count(&self) -> usize { + self.chars().count() + } + + #[inline] + fn create_cursor(&self, n: usize) -> StringCursor { + let mut ptr = self.as_ptr(); + for _ in 0..n { + unsafe { next_code_point(&mut ptr) }; + } + StringCursor { ptr, position: n } + } + + #[inline] + fn advance(cursor: &mut StringCursor) -> u32 { + cursor.position += 1; + unsafe { next_code_point(&mut cursor.ptr) } + } + + #[inline] + fn peek(cursor: &StringCursor) -> u32 { + let mut ptr = cursor.ptr; + unsafe { next_code_point(&mut ptr) } + } + + #[inline] + fn skip(cursor: &mut StringCursor, n: usize) { + cursor.position += n; + for _ in 0..n { + unsafe { 
next_code_point(&mut cursor.ptr) }; + } + } + + #[inline] + fn back_advance(cursor: &mut StringCursor) -> u32 { + cursor.position -= 1; + unsafe { next_code_point_reverse(&mut cursor.ptr) } + } + + #[inline] + fn back_peek(cursor: &StringCursor) -> u32 { + let mut ptr = cursor.ptr; + unsafe { next_code_point_reverse(&mut ptr) } + } + + #[inline] + fn back_skip(cursor: &mut StringCursor, n: usize) { + cursor.position -= n; + for _ in 0..n { + unsafe { next_code_point_reverse(&mut cursor.ptr) }; + } + } +} + +/// Reads the next code point out of a byte iterator (assuming a +/// UTF-8-like encoding). +/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[inline] +unsafe fn next_code_point(ptr: &mut *const u8) -> u32 { + // Decode UTF-8 + let x = **ptr; + *ptr = ptr.offset(1); + + if x < 128 { + return x as u32; + } + + // Multibyte case follows + // Decode from a byte combination out of: [[[x y] z] w] + // NOTE: Performance is sensitive to the exact formulation here + let init = utf8_first_byte(x, 2); + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let y = **ptr; + *ptr = ptr.offset(1); + let mut ch = utf8_acc_cont_byte(init, y); + if x >= 0xE0 { + // [[x y z] w] case + // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let z = **ptr; + *ptr = ptr.offset(1); + let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); + ch = init << 12 | y_z; + if x >= 0xF0 { + // [x y z w] case + // use only the lower 3 bits of `init` + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + let w = **ptr; + *ptr = ptr.offset(1); + ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); + } + } + + ch +} + +/// Reads the last code point out of a byte iterator (assuming a +/// UTF-8-like encoding). 
+/// +/// # Safety +/// +/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string +#[inline] +unsafe fn next_code_point_reverse(ptr: &mut *const u8) -> u32 { + // Decode UTF-8 + *ptr = ptr.offset(-1); + let w = match **ptr { + next_byte if next_byte < 128 => return next_byte as u32, + back_byte => back_byte, + }; + + // Multibyte case follows + // Decode from a byte combination out of: [x [y [z w]]] + let mut ch; + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + *ptr = ptr.offset(-1); + let z = **ptr; + ch = utf8_first_byte(z, 2); + if utf8_is_cont_byte(z) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + *ptr = ptr.offset(-1); + let y = **ptr; + ch = utf8_first_byte(y, 3); + if utf8_is_cont_byte(y) { + // SAFETY: `bytes` produces an UTF-8-like string, + // so the iterator must produce a value here. + *ptr = ptr.offset(-1); + let x = **ptr; + ch = utf8_first_byte(x, 4); + ch = utf8_acc_cont_byte(ch, y); + } + ch = utf8_acc_cont_byte(ch, z); + } + ch = utf8_acc_cont_byte(ch, w); + + ch +} + +/// Returns the initial codepoint accumulator for the first byte. +/// The first byte is special, only want bottom 5 bits for width 2, 4 bits +/// for width 3, and 3 bits for width 4. +#[inline] +const fn utf8_first_byte(byte: u8, width: u32) -> u32 { + (byte & (0x7F >> width)) as u32 +} + +/// Returns the value of `ch` updated with continuation byte `byte`. +#[inline] +const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { + (ch << 6) | (byte & CONT_MASK) as u32 +} + +/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the +/// bits `10`). +#[inline] +const fn utf8_is_cont_byte(byte: u8) -> bool { + (byte as i8) < -64 +} + +/// Mask of the value bits of a continuation byte. 
+const CONT_MASK: u8 = 0b0011_1111; + +const fn is_py_ascii_whitespace(b: u8) -> bool { + matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') +} + +#[inline] +pub(crate) fn is_word(ch: u32) -> bool { + ch == '_' as u32 + || u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_space(ch: u32) -> bool { + u8::try_from(ch) + .map(is_py_ascii_whitespace) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_digit(ch: u32) -> bool { + u8::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_loc_alnum(ch: u32) -> bool { + // FIXME: Ignore the locales + u8::try_from(ch) + .map(|x| x.is_ascii_alphanumeric()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_loc_word(ch: u32) -> bool { + ch == '_' as u32 || is_loc_alnum(ch) +} +#[inline] +pub(crate) fn is_linebreak(ch: u32) -> bool { + ch == '\n' as u32 +} +#[inline] +pub fn lower_ascii(ch: u32) -> u32 { + u8::try_from(ch) + .map(|x| x.to_ascii_lowercase() as u32) + .unwrap_or(ch) +} +#[inline] +pub(crate) fn lower_locate(ch: u32) -> u32 { + // FIXME: Ignore the locales + lower_ascii(ch) +} +#[inline] +pub(crate) fn upper_locate(ch: u32) -> u32 { + // FIXME: Ignore the locales + u8::try_from(ch) + .map(|x| x.to_ascii_uppercase() as u32) + .unwrap_or(ch) +} +#[inline] +pub(crate) fn is_uni_digit(ch: u32) -> bool { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.is_ascii_digit()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_uni_space(ch: u32) -> bool { + // TODO: check with cpython + is_space(ch) + || matches!( + ch, + 0x0009 + | 0x000A + | 0x000B + | 0x000C + | 0x000D + | 0x001C + | 0x001D + | 0x001E + | 0x001F + | 0x0020 + | 0x0085 + | 0x00A0 + | 0x1680 + | 0x2000 + | 0x2001 + | 0x2002 + | 0x2003 + | 0x2004 + | 0x2005 + | 0x2006 + | 0x2007 + | 0x2008 + | 0x2009 + | 0x200A + | 0x2028 + | 0x2029 + | 0x202F + | 0x205F + | 0x3000 + ) +} +#[inline] +pub(crate) fn is_uni_linebreak(ch: u32) -> bool { + 
matches!( + ch, + 0x000A | 0x000B | 0x000C | 0x000D | 0x001C | 0x001D | 0x001E | 0x0085 | 0x2028 | 0x2029 + ) +} +#[inline] +pub(crate) fn is_uni_alnum(ch: u32) -> bool { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.is_alphanumeric()) + .unwrap_or(false) +} +#[inline] +pub(crate) fn is_uni_word(ch: u32) -> bool { + ch == '_' as u32 || is_uni_alnum(ch) +} +#[inline] +pub fn lower_unicode(ch: u32) -> u32 { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.to_lowercase().next().unwrap() as u32) + .unwrap_or(ch) +} +#[inline] +pub fn upper_unicode(ch: u32) -> u32 { + // TODO: check with cpython + char::try_from(ch) + .map(|x| x.to_uppercase().next().unwrap() as u32) + .unwrap_or(ch) +} diff --git a/tests/tests.rs b/tests/tests.rs index efeb2d2838..f589c62e6e 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1,16 +1,13 @@ -use sre_engine::engine; +use sre_engine::{Request, State, StrDrive}; struct Pattern { code: &'static [u32], } impl Pattern { - fn state<'a, S: engine::StrDrive>( - &self, - string: S, - ) -> (engine::Request<'a, S>, engine::State) { - let req = engine::Request::new(string, 0, usize::MAX, self.code, false); - let state = engine::State::default(); + fn state<'a, S: StrDrive>(&self, string: S) -> (Request<'a, S>, State) { + let req = Request::new(string, 0, usize::MAX, self.code, false); + let state = State::default(); (req, state) } } @@ -54,7 +51,7 @@ fn test_zerowidth() { let (mut req, mut state) = p.state("a:"); req.must_advance = true; assert!(state.search(req)); - assert_eq!(state.string_position, 1); + assert_eq!(state.cursor.position, 1); } #[test] @@ -66,7 +63,10 @@ fn test_repeat_context_panic() { // END GENERATED let (req, mut state) = p.state("axxzaz"); assert!(state.pymatch(&req)); - assert_eq!(*state.marks.raw(), vec![Optioned::some(1), Optioned::some(3)]); + assert_eq!( + *state.marks.raw(), + vec![Optioned::some(1), Optioned::some(3)] + ); } #[test] @@ -77,7 +77,7 @@ fn test_double_max_until() { // END 
GENERATED let (req, mut state) = p.state("1111"); assert!(state.pymatch(&req)); - assert_eq!(state.string_position, 4); + assert_eq!(state.cursor.position, 4); } #[test] @@ -89,7 +89,7 @@ fn test_info_single() { let (req, mut state) = p.state("baaaa"); assert!(state.search(req)); assert_eq!(state.start, 1); - assert_eq!(state.string_position, 5); + assert_eq!(state.cursor.position, 5); } #[test] @@ -161,7 +161,7 @@ fn test_bug_20998() { let (mut req, mut state) = p.state("ABC"); req.match_all = true; assert!(state.pymatch(&req)); - assert_eq!(state.string_position, 3); + assert_eq!(state.cursor.position, 3); } #[test] @@ -172,5 +172,10 @@ fn test_bigcharset() { // END GENERATED let (req, mut state) = p.state("x "); assert!(state.pymatch(&req)); - assert_eq!(state.string_position, 1); + assert_eq!(state.cursor.position, 1); +} + +#[test] +fn test_search_nonascii() { + // pattern p = re.compile('\xe0+') } From 10e51ba68909e9f09860c4a5d727c00a74cb0d7c Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 14 Jan 2024 10:03:01 +0200 Subject: [PATCH 092/705] improve: use adjust_cursor reduce double calc --- src/engine.rs | 17 +++++++---------- src/string.rs | 25 +++++++++++++++++++++---- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index 97489633d8..a854f8d898 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -7,7 +7,7 @@ use crate::string::{ use super::{SreAtCode, SreCatCode, SreInfo, SreOpcode, StrDrive, StringCursor, MAXREPEAT}; use optional::Optioned; -use std::convert::TryFrom; +use std::{convert::TryFrom, ptr::null}; #[derive(Debug, Clone, Copy)] pub struct Request<'a, S> { @@ -126,17 +126,12 @@ impl State { self.marks.clear(); self.repeat_stack.clear(); self.start = start; - if self.cursor.ptr.is_null() || self.cursor.position > self.start { - self.cursor = req.string.create_cursor(self.start); - } else if self.cursor.position < self.start { - let skip = self.start - self.cursor.position; - S::skip(&mut self.cursor, 
skip); - } + req.string.adjust_cursor(&mut self.cursor, start); } pub fn pymatch(&mut self, req: &Request) -> bool { self.start = req.start; - self.cursor = req.string.create_cursor(self.start); + req.string.adjust_cursor(&mut self.cursor, self.start); let ctx = MatchContext { cursor: self.cursor, @@ -151,7 +146,7 @@ impl State { pub fn search(&mut self, mut req: Request) -> bool { self.start = req.start; - self.cursor = req.string.create_cursor(self.start); + req.string.adjust_cursor(&mut self.cursor, self.start); if req.start > req.end { return false; @@ -215,7 +210,9 @@ impl State { || ctx.try_peek_code_as::(&req, 1).unwrap() == SreAtCode::BEGINNING_STRING) { - self.reset(&req, req.end); + self.cursor.position = req.end; + self.cursor.ptr = null(); + // self.reset(&req, req.end); return false; } diff --git a/src/string.rs b/src/string.rs index 464901af83..1340c37423 100644 --- a/src/string.rs +++ b/src/string.rs @@ -16,6 +16,7 @@ impl Default for StringCursor { pub trait StrDrive: Copy { fn count(&self) -> usize; fn create_cursor(&self, n: usize) -> StringCursor; + fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize); fn advance(cursor: &mut StringCursor) -> u32; fn peek(cursor: &StringCursor) -> u32; fn skip(cursor: &mut StringCursor, n: usize); @@ -38,6 +39,12 @@ impl<'a> StrDrive for &'a [u8] { } } + #[inline] + fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) { + cursor.position = n; + cursor.ptr = self[n..].as_ptr(); + } + #[inline] fn advance(cursor: &mut StringCursor) -> u32 { cursor.position += 1; @@ -83,11 +90,21 @@ impl StrDrive for &str { #[inline] fn create_cursor(&self, n: usize) -> StringCursor { - let mut ptr = self.as_ptr(); - for _ in 0..n { - unsafe { next_code_point(&mut ptr) }; + let mut cursor = StringCursor { + ptr: self.as_ptr(), + position: 0, + }; + Self::skip(&mut cursor, n); + cursor + } + + #[inline] + fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) { + if cursor.ptr.is_null() || cursor.position > n { 
+ *cursor = Self::create_cursor(&self, n); + } else if cursor.position < n { + Self::skip(cursor, n - cursor.position); } - StringCursor { ptr, position: n } } #[inline] From 21fc2059b70ebd5bf4a7c524c40e7d4347e065dc Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sun, 14 Jan 2024 16:02:05 +0200 Subject: [PATCH 093/705] improve: fix double count on _count --- src/engine.rs | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/engine.rs b/src/engine.rs index a854f8d898..34f00234e5 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -441,7 +441,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex let mut count_ctx = ctx; count_ctx.skip_code(4); - if _count(req, state, count_ctx, 1) == 0 { + if _count(req, state, &mut count_ctx, 1) == 0 { state.marks.pop_discard(); break 'result false; } @@ -735,13 +735,13 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex state.cursor = ctx.cursor; - let mut next_ctx = ctx; - next_ctx.skip_code(4); - let count = _count(req, state, next_ctx, max_count); - ctx.skip_char::(count); + let mut count_ctx = ctx; + count_ctx.skip_code(4); + let count = _count(req, state, &mut count_ctx, max_count); if count < min_count { break 'result false; } + ctx.cursor = count_ctx.cursor; let next_code = ctx.peek_code(req, ctx.peek_code(req, 1) as usize + 1); if next_code == SreOpcode::SUCCESS as u32 && ctx.can_success(req) { @@ -768,11 +768,11 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex } else { let mut count_ctx = ctx; count_ctx.skip_code(4); - let count = _count(req, state, count_ctx, min_count); + let count = _count(req, state, &mut count_ctx, min_count); if count < min_count { break 'result false; } - ctx.skip_char::(count); + ctx.cursor = count_ctx.cursor; count as isize }; @@ -845,11 +845,11 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex state.cursor = ctx.cursor; let mut count_ctx = ctx; count_ctx.skip_code(4); - 
let count = _count(req, state, count_ctx, max_count); + let count = _count(req, state, &mut count_ctx, max_count); if count < min_count { break 'result false; } - ctx.skip_char::(count); + ctx.cursor = count_ctx.cursor; ctx.skip_code_from(req, 1); } SreOpcode::CHARSET @@ -1324,7 +1324,7 @@ fn charset(set: &[u32], ch: u32) -> bool { fn _count( req: &Request, state: &mut State, - mut ctx: MatchContext, + ctx: &mut MatchContext, max_count: usize, ) -> usize { let max_count = std::cmp::min(max_count, ctx.remaining_chars(req)); @@ -1347,28 +1347,28 @@ fn _count( } } SreOpcode::LITERAL => { - general_count_literal(req, &mut ctx, end, |code, c| code == c); + general_count_literal(req, ctx, end, |code, c| code == c); } SreOpcode::NOT_LITERAL => { - general_count_literal(req, &mut ctx, end, |code, c| code != c); + general_count_literal(req, ctx, end, |code, c| code != c); } SreOpcode::LITERAL_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code == lower_ascii(c)); + general_count_literal(req, ctx, end, |code, c| code == lower_ascii(c)); } SreOpcode::NOT_LITERAL_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code != lower_ascii(c)); + general_count_literal(req, ctx, end, |code, c| code != lower_ascii(c)); } SreOpcode::LITERAL_LOC_IGNORE => { - general_count_literal(req, &mut ctx, end, char_loc_ignore); + general_count_literal(req, ctx, end, char_loc_ignore); } SreOpcode::NOT_LITERAL_LOC_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| !char_loc_ignore(code, c)); + general_count_literal(req, ctx, end, |code, c| !char_loc_ignore(code, c)); } SreOpcode::LITERAL_UNI_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code == lower_unicode(c)); + general_count_literal(req, ctx, end, |code, c| code == lower_unicode(c)); } SreOpcode::NOT_LITERAL_UNI_IGNORE => { - general_count_literal(req, &mut ctx, end, |code, c| code != lower_unicode(c)); + general_count_literal(req, ctx, end, |code, c| code != lower_unicode(c)); 
} _ => { /* General case */ @@ -1383,7 +1383,7 @@ fn _count( ..*state }; - while ctx.cursor.position < end && _match(req, &mut sub_state, ctx) { + while ctx.cursor.position < end && _match(req, &mut sub_state, *ctx) { ctx.advance_char::(); } } From 35229721ead1e96a9135e26d74ae9b0d36f7efa4 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Sat, 24 Feb 2024 12:32:16 -0500 Subject: [PATCH 094/705] Fix test_cmd_line.py The failing test was unsetting `PYTHONPATH`, but neglecting to unset `RUSTPYTHONPATH`, which obviously was not significant for the original CPython test. Including `RUSTPYTHONPATH` in the test fixes it. --- Lib/test/test_cmd_line.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index 02f060ba2c..88ff71726f 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -411,7 +411,8 @@ def test_empty_PYTHONPATH_issue16309(self): path = ":".join(sys.path) path = path.encode("ascii", "backslashreplace") sys.stdout.buffer.write(path)""" - rc1, out1, err1 = assert_python_ok('-c', code, PYTHONPATH="") + # TODO: RUSTPYTHON we must unset RUSTPYTHONPATH as well + rc1, out1, err1 = assert_python_ok('-c', code, PYTHONPATH="", RUSTPYTHONPATH="") rc2, out2, err2 = assert_python_ok('-c', code, __isolated=False) # regarding to Posix specification, outputs should be equal # for empty and unset PYTHONPATH From ead42beff6265d95c4dc18016735229daef1a56d Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Sat, 24 Feb 2024 13:25:44 -0500 Subject: [PATCH 095/705] Disable test_locale in test_format.py See https://github.com/RustPython/RustPython/issues/5181 --- Lib/test/test_format.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Lib/test/test_format.py b/Lib/test/test_format.py index f6c11a4aad..66e2b077bf 100644 --- a/Lib/test/test_format.py +++ b/Lib/test/test_format.py @@ -420,6 +420,9 @@ def test_non_ascii(self): self.assertEqual(format(1+2j, "\u2007^8"), "\u2007(1+2j)\u2007") 
self.assertEqual(format(0j, "\u2007^4"), "\u20070j\u2007") + # TODO: RUSTPYTHON formatting does not support locales + # See https://github.com/RustPython/RustPython/issues/5181 + @unittest.expectedFailure def test_locale(self): try: oldloc = locale.setlocale(locale.LC_ALL) From de7e4e49dab823358e37823ae547cd66b8e7bfaf Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Sat, 24 Feb 2024 13:46:22 -0500 Subject: [PATCH 096/705] Disable broken test_socket.py tests There are a substantial number of socket tests that are disabled due to `bind(): bad family` errors. It seems like RustPython only supports a small subset of the required connection families, so the failing tests are broken for the same reasons. --- Lib/test/test_socket.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/test_socket.py b/Lib/test/test_socket.py index 35f94a4e22..17e9dae8c0 100644 --- a/Lib/test/test_socket.py +++ b/Lib/test/test_socket.py @@ -2161,12 +2161,16 @@ def testCreateISOTPSocket(self): with socket.socket(socket.PF_CAN, socket.SOCK_DGRAM, socket.CAN_ISOTP) as s: pass + # TODO: RUSTPYTHON, OSError: bind(): bad family + @unittest.expectedFailure def testTooLongInterfaceName(self): # most systems limit IFNAMSIZ to 16, take 1024 to be sure with socket.socket(socket.PF_CAN, socket.SOCK_DGRAM, socket.CAN_ISOTP) as s: with self.assertRaisesRegex(OSError, 'interface name too long'): s.bind(('x' * 1024, 1, 2)) + # TODO: RUSTPYTHON, OSError: bind(): bad family + @unittest.expectedFailure def testBind(self): try: with socket.socket(socket.PF_CAN, socket.SOCK_DGRAM, socket.CAN_ISOTP) as s: From 9b974bda0d8792dfafcc25af5a09d6bd6386704a Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Sun, 10 Mar 2024 22:23:46 -0400 Subject: [PATCH 097/705] Re-enable test_format.test_locale Technically speaking, my system was misconfigured, leading me to disable the test in the first place. 
`test_locale` calls `locale.setlocale(locale.LC_ALL, '')`, which reads the value of the `LANG` environment variable and uses that to look up and reset all the locale settings. My system has `LANG=en_US.UTF-8`, which is apparently not what this test was expecting. If `LANG` is unset or set to `C`, the test passes, as it does in CI. --- Lib/test/test_format.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/Lib/test/test_format.py b/Lib/test/test_format.py index 66e2b077bf..f6c11a4aad 100644 --- a/Lib/test/test_format.py +++ b/Lib/test/test_format.py @@ -420,9 +420,6 @@ def test_non_ascii(self): self.assertEqual(format(1+2j, "\u2007^8"), "\u2007(1+2j)\u2007") self.assertEqual(format(0j, "\u2007^4"), "\u20070j\u2007") - # TODO: RUSTPYTHON formatting does not support locales - # See https://github.com/RustPython/RustPython/issues/5181 - @unittest.expectedFailure def test_locale(self): try: oldloc = locale.setlocale(locale.LC_ALL) From 23ebbd021b7f65883e05c6430aa80108d5cb3461 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Sun, 10 Mar 2024 22:53:39 -0400 Subject: [PATCH 098/705] Skip test_format.test_locale I had previously `test_locale` as expected to fail, as it did indeed fail on my system due to unimplemented functionality. As it happens, it passes in CI because the locale settings used there (`C`, I believe) just happen to format integers the same with "%d" as "%n". I mistakenly un-marked it because I thought I misunderstood the problem. 
--- Lib/test/test_format.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Lib/test/test_format.py b/Lib/test/test_format.py index f6c11a4aad..187270d5b6 100644 --- a/Lib/test/test_format.py +++ b/Lib/test/test_format.py @@ -420,6 +420,9 @@ def test_non_ascii(self): self.assertEqual(format(1+2j, "\u2007^8"), "\u2007(1+2j)\u2007") self.assertEqual(format(0j, "\u2007^4"), "\u20070j\u2007") + # TODO: RUSTPYTHON formatting does not support locales + # See https://github.com/RustPython/RustPython/issues/5181 + @unittest.skip("formatting does not support locales") def test_locale(self): try: oldloc = locale.setlocale(locale.LC_ALL) From 2fde8e91e5dbc070b06060c5b356a60f7e85fb3d Mon Sep 17 00:00:00 2001 From: wellweek <148746285+wellweek@users.noreply.github.com> Date: Mon, 11 Mar 2024 14:01:37 +0800 Subject: [PATCH 099/705] fix some typos (#5187) Signed-off-by: wellweek --- Lib/test/test_unpack.py | 2 +- architecture/architecture.md | 2 +- benches/benchmarks/pystone.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_unpack.py b/Lib/test/test_unpack.py index f5ca1d455b..515ec128a0 100644 --- a/Lib/test/test_unpack.py +++ b/Lib/test/test_unpack.py @@ -162,7 +162,7 @@ def test_extended_oparg_not_ignored(self): ns = {} exec(code, ns) unpack_400 = ns["unpack_400"] - # Warm up the the function for quickening (PEP 659) + # Warm up the function for quickening (PEP 659) for _ in range(30): y = unpack_400(range(400)) self.assertEqual(y, 399) diff --git a/architecture/architecture.md b/architecture/architecture.md index 5b1ae9cc68..a59b6498bf 100644 --- a/architecture/architecture.md +++ b/architecture/architecture.md @@ -101,7 +101,7 @@ Part of the Python standard library that's implemented in Rust. The modules that ### Lib -Python side of the standard libary, copied over (with care) from CPython sourcecode. +Python side of the standard library, copied over (with care) from CPython sourcecode. 
#### Lib/test diff --git a/benches/benchmarks/pystone.py b/benches/benchmarks/pystone.py index 3faf675ae7..755b4ba85c 100644 --- a/benches/benchmarks/pystone.py +++ b/benches/benchmarks/pystone.py @@ -16,7 +16,7 @@ Version History: - Inofficial version 1.1.1 by Chris Arndt: + Unofficial version 1.1.1 by Chris Arndt: - Make it run under Python 2 and 3 by using "from __future__ import print_function". From 7f02324dcec46bf97ccf0e9cc0b94a8ad5057abb Mon Sep 17 00:00:00 2001 From: Kirill Podoprigora Date: Mon, 11 Mar 2024 15:04:35 +0200 Subject: [PATCH 100/705] Update Lib/test/test_hmac.py to 3.12 version (#5188) --- Lib/test/test_hmac.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Lib/test/test_hmac.py b/Lib/test/test_hmac.py index bc2e02528d..8e1a4a204c 100644 --- a/Lib/test/test_hmac.py +++ b/Lib/test/test_hmac.py @@ -389,6 +389,18 @@ def test_with_digestmod_no_default(self): with self.assertRaisesRegex(TypeError, r'required.*digestmod'): hmac.HMAC(key, msg=data, digestmod='') + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_with_fallback(self): + cache = getattr(hashlib, '__builtin_constructor_cache') + try: + cache['foo'] = hashlib.sha256 + hexdigest = hmac.digest(b'key', b'message', 'foo').hex() + expected = '6e9ef29b75fffc5b7abae527d58fdadb2fe42e7219011976917343065f58ed4a' + self.assertEqual(hexdigest, expected) + finally: + cache.pop('foo') + class ConstructorTestCase(unittest.TestCase): From 83d1ad8a2cfaa6cb96d7d1646923ed58c030b8df Mon Sep 17 00:00:00 2001 From: Kirill Podoprigora Date: Tue, 12 Mar 2024 15:35:21 +0200 Subject: [PATCH 101/705] Update test_operator.py to 3.12 (#5194) --- Lib/test/test_operator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Lib/test/test_operator.py b/Lib/test/test_operator.py index b7e38c2334..1db738d228 100644 --- a/Lib/test/test_operator.py +++ b/Lib/test/test_operator.py @@ -208,6 +208,9 @@ def test_indexOf(self): nan = float("nan") self.assertEqual(operator.indexOf([nan, nan, 21], nan), 
0) self.assertEqual(operator.indexOf([{}, 1, {}, 2], {}), 0) + it = iter('leave the iterator at exactly the position after the match') + self.assertEqual(operator.indexOf(it, 'a'), 2) + self.assertEqual(next(it), 'v') def test_invert(self): operator = self.module From 4e7b3bc8f247bf8a31ba1bd791844e83307dde09 Mon Sep 17 00:00:00 2001 From: Kirill Podoprigora Date: Tue, 12 Mar 2024 15:36:10 +0200 Subject: [PATCH 102/705] Update pprint.py and test_pprint.py to 3.12 (#5195) --- Lib/pprint.py | 16 ---------------- Lib/test/test_pprint.py | 2 +- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/Lib/pprint.py b/Lib/pprint.py index 575688d8eb..34ed12637e 100644 --- a/Lib/pprint.py +++ b/Lib/pprint.py @@ -637,19 +637,6 @@ def _recursion(object): % (type(object).__name__, id(object))) -def _perfcheck(object=None): - import time - if object is None: - object = [("string", (1, 2), [3, 4], {5: 6, 7: 8})] * 100000 - p = PrettyPrinter() - t1 = time.perf_counter() - p._safe_repr(object, {}, None, 0, True) - t2 = time.perf_counter() - p.pformat(object) - t3 = time.perf_counter() - print("_safe_repr:", t2 - t1) - print("pformat:", t3 - t2) - def _wrap_bytes_repr(object, width, allowance): current = b'' last = len(object) // 4 * 4 @@ -666,6 +653,3 @@ def _wrap_bytes_repr(object, width, allowance): current = candidate if current: yield repr(current) - -if __name__ == "__main__": - _perfcheck() diff --git a/Lib/test/test_pprint.py b/Lib/test/test_pprint.py index c7b9893943..6ea7e7db2c 100644 --- a/Lib/test/test_pprint.py +++ b/Lib/test/test_pprint.py @@ -203,7 +203,7 @@ def test_knotted(self): def test_unreadable(self): # Not recursive but not readable anyway pp = pprint.PrettyPrinter() - for unreadable in type(3), pprint, pprint.isrecursive: + for unreadable in object(), int, pprint, pprint.isrecursive: # module-level convenience functions self.assertFalse(pprint.isrecursive(unreadable), "expected not isrecursive for %r" % (unreadable,)) From 
855fa1411fc20de7c7cfd76c807bcb435e6873f7 Mon Sep 17 00:00:00 2001 From: Kirill Podoprigora Date: Wed, 13 Mar 2024 00:35:16 +0200 Subject: [PATCH 103/705] Update ftplib and test_ftplib to 3.12 (#5196) --- Lib/ftplib.py | 36 +++++++----------------------------- Lib/test/test_ftplib.py | 22 ++++++++-------------- 2 files changed, 15 insertions(+), 43 deletions(-) diff --git a/Lib/ftplib.py b/Lib/ftplib.py index 7c5a50715f..a56e0c3085 100644 --- a/Lib/ftplib.py +++ b/Lib/ftplib.py @@ -434,10 +434,7 @@ def retrbinary(self, cmd, callback, blocksize=8192, rest=None): """ self.voidcmd('TYPE I') with self.transfercmd(cmd, rest) as conn: - while 1: - data = conn.recv(blocksize) - if not data: - break + while data := conn.recv(blocksize): callback(data) # shutdown ssl layer if _SSLSocket is not None and isinstance(conn, _SSLSocket): @@ -496,10 +493,7 @@ def storbinary(self, cmd, fp, blocksize=8192, callback=None, rest=None): """ self.voidcmd('TYPE I') with self.transfercmd(cmd, rest) as conn: - while 1: - buf = fp.read(blocksize) - if not buf: - break + while buf := fp.read(blocksize): conn.sendall(buf) if callback: callback(buf) @@ -561,7 +555,7 @@ def dir(self, *args): LIST command. (This *should* only be used for a pathname.)''' cmd = 'LIST' func = None - if args[-1:] and type(args[-1]) != type(''): + if args[-1:] and not isinstance(args[-1], str): args, func = args[:-1], args[-1] for arg in args: if arg: @@ -713,28 +707,12 @@ class FTP_TLS(FTP): '221 Goodbye.' 
>>> ''' - ssl_version = ssl.PROTOCOL_TLS_CLIENT def __init__(self, host='', user='', passwd='', acct='', - keyfile=None, certfile=None, context=None, - timeout=_GLOBAL_DEFAULT_TIMEOUT, source_address=None, *, - encoding='utf-8'): - if context is not None and keyfile is not None: - raise ValueError("context and keyfile arguments are mutually " - "exclusive") - if context is not None and certfile is not None: - raise ValueError("context and certfile arguments are mutually " - "exclusive") - if keyfile is not None or certfile is not None: - import warnings - warnings.warn("keyfile and certfile are deprecated, use a " - "custom context instead", DeprecationWarning, 2) - self.keyfile = keyfile - self.certfile = certfile + *, context=None, timeout=_GLOBAL_DEFAULT_TIMEOUT, + source_address=None, encoding='utf-8'): if context is None: - context = ssl._create_stdlib_context(self.ssl_version, - certfile=certfile, - keyfile=keyfile) + context = ssl._create_stdlib_context() self.context = context self._prot_p = False super().__init__(host, user, passwd, acct, @@ -749,7 +727,7 @@ def auth(self): '''Set up secure control connection by using TLS/SSL.''' if isinstance(self.sock, ssl.SSLSocket): raise ValueError("Already using TLS") - if self.ssl_version >= ssl.PROTOCOL_TLS: + if self.context.protocol >= ssl.PROTOCOL_TLS: resp = self.voidcmd('AUTH TLS') else: resp = self.voidcmd('AUTH SSL') diff --git a/Lib/test/test_ftplib.py b/Lib/test/test_ftplib.py index e8c126ddc4..7e632efa4c 100644 --- a/Lib/test/test_ftplib.py +++ b/Lib/test/test_ftplib.py @@ -21,6 +21,8 @@ from test.support import threading_helper from test.support import socket_helper from test.support import warnings_helper +from test.support import asynchat +from test.support import asyncore from test.support.socket_helper import HOST, HOSTv6 import sys @@ -992,11 +994,11 @@ def test_context(self): ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) ctx.check_hostname = False ctx.verify_mode = ssl.CERT_NONE - 
self.assertRaises(ValueError, ftplib.FTP_TLS, keyfile=CERTFILE, + self.assertRaises(TypeError, ftplib.FTP_TLS, keyfile=CERTFILE, context=ctx) - self.assertRaises(ValueError, ftplib.FTP_TLS, certfile=CERTFILE, + self.assertRaises(TypeError, ftplib.FTP_TLS, certfile=CERTFILE, context=ctx) - self.assertRaises(ValueError, ftplib.FTP_TLS, certfile=CERTFILE, + self.assertRaises(TypeError, ftplib.FTP_TLS, certfile=CERTFILE, keyfile=CERTFILE, context=ctx) self.client = ftplib.FTP_TLS(context=ctx, timeout=TIMEOUT) @@ -1160,18 +1162,10 @@ def test__all__(self): support.check__all__(self, ftplib, not_exported=not_exported) -def test_main(): - tests = [TestFTPClass, TestTimeouts, - TestIPv6Environment, - TestTLS_FTPClassMixin, TestTLS_FTPClass, - MiscTestCase] - +def setUpModule(): thread_info = threading_helper.threading_setup() - try: - support.run_unittest(*tests) - finally: - threading_helper.threading_cleanup(*thread_info) + unittest.addModuleCleanup(threading_helper.threading_cleanup, *thread_info) if __name__ == '__main__': - test_main() + unittest.main() From d8f2bd04ace5c0f07e3855f633c4bfdb0564e713 Mon Sep 17 00:00:00 2001 From: Kirill Podoprigora Date: Wed, 13 Mar 2024 08:22:24 +0200 Subject: [PATCH 104/705] Update cgitb.py to 3.12 (#5197) --- Lib/cgitb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/cgitb.py b/Lib/cgitb.py index 8ce0e833a9..f6b97f25c5 100644 --- a/Lib/cgitb.py +++ b/Lib/cgitb.py @@ -74,7 +74,7 @@ def lookup(name, frame, locals): return 'global', frame.f_globals[name] if '__builtins__' in frame.f_globals: builtins = frame.f_globals['__builtins__'] - if type(builtins) is type({}): + if isinstance(builtins, dict): if name in builtins: return 'builtin', builtins[name] else: From 92c8b371ae5db0d95bd8199bc42b08af115bb88a Mon Sep 17 00:00:00 2001 From: Kirill Podoprigora Date: Wed, 13 Mar 2024 08:22:57 +0200 Subject: [PATCH 105/705] Update colorsys.py and test_colorsys.py to 3.12 (#5198) --- Lib/colorsys.py | 2 +- 
Lib/test/test_colorsys.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Lib/colorsys.py b/Lib/colorsys.py index 9bdc83e377..bc897bd0f9 100644 --- a/Lib/colorsys.py +++ b/Lib/colorsys.py @@ -83,7 +83,7 @@ def rgb_to_hls(r, g, b): if l <= 0.5: s = rangec / sumc else: - s = rangec / (2.0-sumc) + s = rangec / (2.0-maxc-minc) # Not always 2.0-sumc: gh-106498. rc = (maxc-r) / rangec gc = (maxc-g) / rangec bc = (maxc-b) / rangec diff --git a/Lib/test/test_colorsys.py b/Lib/test/test_colorsys.py index a24e3adcb4..74d76294b0 100644 --- a/Lib/test/test_colorsys.py +++ b/Lib/test/test_colorsys.py @@ -69,6 +69,16 @@ def test_hls_values(self): self.assertTripleEqual(hls, colorsys.rgb_to_hls(*rgb)) self.assertTripleEqual(rgb, colorsys.hls_to_rgb(*hls)) + def test_hls_nearwhite(self): # gh-106498 + values = ( + # rgb, hls: these do not work in reverse + ((0.9999999999999999, 1, 1), (0.5, 1.0, 1.0)), + ((1, 0.9999999999999999, 0.9999999999999999), (0.0, 1.0, 1.0)), + ) + for rgb, hls in values: + self.assertTripleEqual(hls, colorsys.rgb_to_hls(*rgb)) + self.assertTripleEqual((1.0, 1.0, 1.0), colorsys.hls_to_rgb(*hls)) + def test_yiq_roundtrip(self): for r in frange(0.0, 1.0, 0.2): for g in frange(0.0, 1.0, 0.2): From 426e582ba039492b789b114d26b29f3dfa86c56e Mon Sep 17 00:00:00 2001 From: Nikita Sobolev Date: Fri, 15 Mar 2024 16:15:45 +0300 Subject: [PATCH 106/705] Remove incorrect `@expectedFailure`s from `test_cmd_line` (#5201) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After your suggestion in https://github.com/python/cpython/issues/116504#issuecomment-1999239012 I went to take a look at `test_cmd_line` in RustPython (it has been so long since I contributed to this amazing project, so many things have changed!), and I've noticed this.
This is a problem, here's the simplest demo: ```python import unittest class TestMe(unittest.TestCase): @unittest.expectedFailure def test_me(self): def run(): raise ValueError with self.subTest(run=run): run() if __name__ == '__main__': unittest.main() ``` This works as expected: ``` » ./python.exe ex.py x ---------------------------------------------------------------------- Ran 1 test in 0.001s OK (expected failures=1) ``` This does not: ```python import unittest class TestMe(unittest.TestCase): def test_me(self): @unittest.expectedFailure def run(): raise ValueError with self.subTest(run=run): run() if __name__ == '__main__': unittest.main() ``` Produces: ``` » ./python.exe ex.py E ====================================================================== ERROR: test_me (__main__.TestMe.test_me) (run=<function TestMe.test_me.<locals>.run at 0x1057a2150>) ---------------------------------------------------------------------- Traceback (most recent call last): File "/Users/sobolev/Desktop/cpython2/ex.py", line 10, in test_me run() ~~~^^ File "/Users/sobolev/Desktop/cpython2/ex.py", line 7, in run raise ValueError ValueError ---------------------------------------------------------------------- Ran 1 test in 0.001s FAILED (errors=1) ``` So, I propose to remove these decorators; let's only keep `TODO` comments to indicate separate failures.
--- Lib/test/test_cmd_line.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index 88ff71726f..6644a3cd5c 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -278,13 +278,11 @@ def test_invalid_utf8_arg(self): code = 'import sys, os; s=os.fsencode(sys.argv[1]); print(ascii(s))' # TODO: RUSTPYTHON - @unittest.expectedFailure def run_default(arg): cmd = [sys.executable, '-c', code, arg] return subprocess.run(cmd, stdout=subprocess.PIPE, text=True) # TODO: RUSTPYTHON - @unittest.expectedFailure def run_c_locale(arg): cmd = [sys.executable, '-c', code, arg] env = dict(os.environ) @@ -293,7 +291,6 @@ def run_c_locale(arg): text=True, env=env) # TODO: RUSTPYTHON - @unittest.expectedFailure def run_utf8_mode(arg): cmd = [sys.executable, '-X', 'utf8', '-c', code, arg] return subprocess.run(cmd, stdout=subprocess.PIPE, text=True) From 12601d0b44c719183d559fa73d76ab6561255ed9 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Mon, 18 Mar 2024 16:57:28 +0900 Subject: [PATCH 107/705] integrate sre_engine crate to workspace --- Cargo.lock | 38 ++++++++++++++++++++++++++++++++---- Cargo.toml | 5 +++-- vm/sre_engine/Cargo.toml | 8 ++++---- vm/sre_engine/tests/tests.rs | 2 +- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c8d0342708..52afbb053f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1428,7 +1428,16 @@ version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d829733185c1ca374f17e52b762f24f535ec625d2cc1f070e34c8a9068f341b" dependencies = [ - "num_enum_derive", + "num_enum_derive 0.5.9", +] + +[[package]] +name = "num_enum" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" +dependencies = [ + "num_enum_derive 0.7.2", ] [[package]] @@ -1443,6 +1452,18 @@ dependencies = [ "syn 1.0.107", ] +[[package]] 
+name = "num_enum_derive" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "681030a937600a36906c185595136d26abfebb4aa9c65701cefcaf8578bb982b" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.32", +] + [[package]] name = "once_cell" version = "1.18.0" @@ -2165,6 +2186,15 @@ dependencies = [ "rustpython-derive", ] +[[package]] +name = "rustpython-sre_engine" +version = "0.6.0" +dependencies = [ + "bitflags 2.4.0", + "num_enum 0.7.2", + "optional", +] + [[package]] name = "rustpython-stdlib" version = "0.3.0" @@ -2200,7 +2230,7 @@ dependencies = [ "num-complex", "num-integer", "num-traits", - "num_enum", + "num_enum 0.7.2", "once_cell", "openssl", "openssl-probe", @@ -2270,7 +2300,7 @@ dependencies = [ "num-integer", "num-traits", "num_cpus", - "num_enum", + "num_enum 0.7.2", "once_cell", "optional", "parking_lot", @@ -2527,7 +2557,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a490c5c46c35dba9a6f5e7ee8e4d67e775eb2d2da0f115750b8d10e1c1ac2d28" dependencies = [ "bitflags 1.3.2", - "num_enum", + "num_enum 0.5.9", "optional", ] diff --git a/Cargo.toml b/Cargo.toml index bfc882fdc5..0f4fb49dc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ include = ["LICENSE", "Cargo.toml", "src/**/*.rs"] resolver = "2" members = [ "compiler", "compiler/core", "compiler/codegen", - ".", "common", "derive", "jit", "vm", "pylib", "stdlib", "wasm/lib", "derive-impl", + ".", "common", "derive", "jit", "vm", "vm/sre_engine", "pylib", "stdlib", "wasm/lib", "derive-impl", ] [workspace.dependencies] @@ -27,6 +27,7 @@ rustpython-jit = { path = "jit", version = "0.3.0" } rustpython-vm = { path = "vm", default-features = false, version = "0.3.0" } rustpython-pylib = { path = "pylib", version = "0.3.0" } rustpython-stdlib = { path = "stdlib", default-features = false, version = "0.3.0" } +rustpython-sre_engine = { path = "vm/sre_engine", version = "0.6.0" } rustpython-doc = { 
git = "https://github.com/RustPython/__doc__", tag = "0.3.0", version = "0.3.0" } rustpython-literal = { git = "https://github.com/RustPython/Parser.git", rev = "29c4728dbedc7e69cc2560b9b34058bbba9b1303" } @@ -64,7 +65,7 @@ malachite-base = "0.4.4" num-complex = "0.4.0" num-integer = "0.1.44" num-traits = "0.2" -num_enum = "0.5.7" +num_enum = "0.7" once_cell = "1.18" parking_lot = "0.12.1" paste = "1.0.7" diff --git a/vm/sre_engine/Cargo.toml b/vm/sre_engine/Cargo.toml index e54f124ac0..2caa8b73e5 100644 --- a/vm/sre_engine/Cargo.toml +++ b/vm/sre_engine/Cargo.toml @@ -1,15 +1,15 @@ [package] -name = "sre-engine" +name = "rustpython-sre_engine" version = "0.6.0" authors = ["Kangzhi Shi ", "RustPython Team"] description = "A low-level implementation of Python's SRE regex engine" -repository = "https://github.com/RustPython/sre-engine" +repository = "https://github.com/RustPython/RustPython" license = "MIT" edition = "2021" keywords = ["regex"] include = ["LICENSE", "src/**/*.rs"] [dependencies] -num_enum = "0.7" -bitflags = "2" +num_enum = { workspace = true } +bitflags = { workspace = true } optional = "0.5" diff --git a/vm/sre_engine/tests/tests.rs b/vm/sre_engine/tests/tests.rs index f589c62e6e..53494c5e3d 100644 --- a/vm/sre_engine/tests/tests.rs +++ b/vm/sre_engine/tests/tests.rs @@ -1,4 +1,4 @@ -use sre_engine::{Request, State, StrDrive}; +use rustpython_sre_engine::{Request, State, StrDrive}; struct Pattern { code: &'static [u32], From ac7851704487ba7f37479946ef8957ead2e68097 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Thu, 21 Mar 2024 01:44:03 -0400 Subject: [PATCH 108/705] Skip TestScander.test_uninstantiable (#5204) This test was marked as an expected failure. Because the garbage collector is missing, that meant that the `os.scandir` object went unclosed. This object was squatting on the file descriptors of all the files contained in the test directory, which was breaking test_zipfile. 
--- Lib/test/test_os.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index 097124b7b5..c880a9b902 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -4249,7 +4249,8 @@ def assert_stat_equal(self, stat1, stat2, skip_fields): self.assertEqual(stat1, stat2) # TODO: RUSTPPYTHON (AssertionError: TypeError not raised by ScandirIter) - @unittest.expectedFailure + # TODO: See https://github.com/RustPython/RustPython/issues/5190 for skip rationale + @unittest.skip("skipping to avoid the unclosed scandir from squatting on file descriptors") def test_uninstantiable(self): scandir_iter = os.scandir(self.path) self.assertRaises(TypeError, type(scandir_iter)) From 3737f2a0918d94312b9e40b974f3939ce0f44013 Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" <69878+youknowone@users.noreply.github.com> Date: Thu, 21 Mar 2024 21:48:29 +0900 Subject: [PATCH 109/705] make adding a single module simpler for interpreter users (#4792) --- src/interpreter.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/interpreter.rs b/src/interpreter.rs index b84f167ae4..6b0d2b5bde 100644 --- a/src/interpreter.rs +++ b/src/interpreter.rs @@ -1,4 +1,4 @@ -use rustpython_vm::{Interpreter, Settings, VirtualMachine}; +use rustpython_vm::{builtins::PyModule, Interpreter, PyRef, Settings, VirtualMachine}; pub type InitHook = Box; @@ -63,6 +63,15 @@ impl InterpreterConfig { self.init_hooks.push(hook); self } + pub fn add_native_module( + self, + name: String, + make_module: fn(&VirtualMachine) -> PyRef, + ) -> Self { + self.init_hook(Box::new(move |vm| { + vm.add_native_module(name, Box::new(make_module)) + })) + } #[cfg(feature = "stdlib")] pub fn init_stdlib(self) -> Self { self.init_hook(Box::new(init_stdlib)) From 5ee5531f327d7505739276686607c3b32eaeff63 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Thu, 21 Mar 2024 08:51:57 -0400 Subject: [PATCH 110/705] Properly unload modules between tests 
(#5192) There seems to have been a bug in the libregrtest code which unloaded modules between tests. The previous state was calculated using `sys.modules.keys()`, which is actually a mutable object that is updated as the underlying `sys.modules` is updated. The result was that modules were not unloaded between tests, which is the root cause for `test_unittest` failing when run after `test_import` and `test_importlib`. This code is copied from 3.12. Ideally all of `libregrtest` should probably be updated as it seems wildly out of date, but that's a lot more work. --- Lib/test/libregrtest/main.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/Lib/test/libregrtest/main.py b/Lib/test/libregrtest/main.py index fba24e4f32..e1d19e1e4a 100644 --- a/Lib/test/libregrtest/main.py +++ b/Lib/test/libregrtest/main.py @@ -373,7 +373,7 @@ def run_tests_sequential(self): import trace self.tracer = trace.Trace(trace=False, count=True) - save_modules = sys.modules.keys() + save_modules = set(sys.modules) print("Run tests sequentially") @@ -409,10 +409,18 @@ def run_tests_sequential(self): # be quiet: say nothing if the test passed shortly previous_test = None - # Unload the newly imported modules (best effort finalization) - for module in sys.modules.keys(): - if module not in save_modules and module.startswith("test."): - import_helper.unload(module) + # Unload the newly imported test modules (best effort finalization) + new_modules = [module for module in sys.modules + if module not in save_modules and + module.startswith(("test.", "test_"))] + for module in new_modules: + sys.modules.pop(module, None) + # Remove the attribute of the parent module. 
+ parent, _, name = module.rpartition('.') + try: + delattr(sys.modules[parent], name) + except (KeyError, AttributeError): + pass if previous_test: print(previous_test) From 85c427b8423bc8620d9b39bf22743da4ea05cb03 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Thu, 21 Mar 2024 11:12:01 -0400 Subject: [PATCH 111/705] Reset exception in WithCleanupFinish (#5203) Context managers have an `__exit__` function that returns a boolean-like object. If the object is truthy, then exceptions are suppressed. If an exception was thrown while resolving that boolean, it would leak and live on in the error stack, getting tacked on to all future exceptions. This caused several mysterious test failures which would only trigger after this very specific event was tested in `test_with`. The solution is to move a call to `vm.set_exception()` before attempting the `try_to_bool()` which threw the error. Minimal example to reproduce the bug: ```py import sys import traceback class cm(object): def __init__(self): pass def __enter__(self): return 3 def __exit__(self, a, b, c): class Bool: def __bool__(self): 1 // 0 return Bool() try: with cm(): raise Exception("Should NOT see this") except ZeroDivisionError: print("exception caught, as expected") print("There should now be no exception") traceback.print_exc() print(sys.exc_info()) ``` --- vm/src/frame.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vm/src/frame.rs b/vm/src/frame.rs index a0af23336f..56e0f433da 100644 --- a/vm/src/frame.rs +++ b/vm/src/frame.rs @@ -939,10 +939,10 @@ impl ExecutingFrame<'_> { _ => self.fatal("WithCleanupFinish expects a FinallyHandler block on stack"), }; - let suppress_exception = self.pop_value().try_to_bool(vm)?; - vm.set_exception(prev_exc); + let suppress_exception = self.pop_value().try_to_bool(vm)?; + if suppress_exception { Ok(None) } else if let Some(reason) = reason { From e3150776300045da5ddf74afbd97abe89d7481bb Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: 
Thu, 21 Mar 2024 11:28:33 -0400 Subject: [PATCH 112/705] Add TODO: RUSTPYTHON to skip reason --- Lib/test/test_os.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index c880a9b902..21ce9bc329 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -4250,7 +4250,7 @@ def assert_stat_equal(self, stat1, stat2, skip_fields): # TODO: RUSTPPYTHON (AssertionError: TypeError not raised by ScandirIter) # TODO: See https://github.com/RustPython/RustPython/issues/5190 for skip rationale - @unittest.skip("skipping to avoid the unclosed scandir from squatting on file descriptors") + @unittest.skip("TODO: RUSTPYTHON, avoid the unclosed scandir from squatting on file descriptors") def test_uninstantiable(self): scandir_iter = os.scandir(self.path) self.assertRaises(TypeError, type(scandir_iter)) From e6c73883eadb1c3feb6c8422826107a98efa0ecb Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Thu, 21 Mar 2024 13:36:28 -0400 Subject: [PATCH 113/705] Revert test skip --- Lib/test/test_os.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index 21ce9bc329..e9ab681357 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -4248,9 +4248,8 @@ def assert_stat_equal(self, stat1, stat2, skip_fields): else: self.assertEqual(stat1, stat2) - # TODO: RUSTPPYTHON (AssertionError: TypeError not raised by ScandirIter) - # TODO: See https://github.com/RustPython/RustPython/issues/5190 for skip rationale - @unittest.skip("TODO: RUSTPYTHON, avoid the unclosed scandir from squatting on file descriptors") + # TODO: RUSTPYTHON (AssertionError: TypeError not raised by ScandirIter) + @unittest.expectedFailure def test_uninstantiable(self): scandir_iter = os.scandir(self.path) self.assertRaises(TypeError, type(scandir_iter)) From 0a24e106baa25db64783ff25534f4e53a29005f9 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Thu, 21 Mar 2024 13:37:46 -0400 Subject: 
[PATCH 114/705] Increase threshold for zipfile test_many_opens It turns out that there are many other tests that can impact test_many_opens by leaving unclosed file handles. Rather than fix them all, it is easier to simply increase the threshold for the problematic test. --- Lib/test/test_zipfile.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index fd4a3918e6..43178ca26b 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -2561,17 +2561,22 @@ def test_write_after_read(self): self.assertEqual(data1, self.data1) self.assertEqual(data2, self.data2) + # TODO: RUSTPYTHON other tests can impact the file descriptor incrementor + # by leaving file handles unclosed. If there are more than 100 files in + # TESTFN and references to them are left unclosed and ungarbage collected + # in another test, then fileno() will always be too high for this test to + # pass. The solution is to increase the number of files from 100 to 200 def test_many_opens(self): # Verify that read() and open() promptly close the file descriptor, # and don't rely on the garbage collector to free resources. self.make_test_archive(TESTFN2) with zipfile.ZipFile(TESTFN2, mode="r") as zipf: - for x in range(100): + for x in range(200): zipf.read('ones') with zipf.open('ones') as zopen1: pass with open(os.devnull, "rb") as f: - self.assertLess(f.fileno(), 100) + self.assertLess(f.fileno(), 200) def test_write_while_reading(self): with zipfile.ZipFile(TESTFN2, 'w', zipfile.ZIP_DEFLATED) as zipf: From 90724b32ec1283c941f992132baf398fa87a1a2b Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Thu, 21 Mar 2024 21:25:53 -0400 Subject: [PATCH 115/705] Implement new clippy lints (#5208) * Implement new clippy lints clippy was just updated and has a few minor issues with the code base. 
* Forgotten lint hidden behind feature --- common/src/static_cell.rs | 2 +- common/src/str.rs | 2 +- vm/src/builtins/memory.rs | 2 +- vm/src/stdlib/os.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common/src/static_cell.rs b/common/src/static_cell.rs index 01a54db29c..7f16dad399 100644 --- a/common/src/static_cell.rs +++ b/common/src/static_cell.rs @@ -46,7 +46,7 @@ mod non_threading { F: FnOnce() -> Result, { self.inner - .with(|x| x.get_or_try_init(|| f().map(leak)).map(|&x| x)) + .with(|x| x.get_or_try_init(|| f().map(leak)).copied()) } } diff --git a/common/src/str.rs b/common/src/str.rs index cdee03f14f..48fdb0f95a 100644 --- a/common/src/str.rs +++ b/common/src/str.rs @@ -250,7 +250,7 @@ pub mod levenshtein { pub fn levenshtein_distance(a: &str, b: &str, max_cost: usize) -> usize { thread_local! { - static BUFFER: RefCell<[usize; MAX_STRING_SIZE]> = RefCell::new([0usize; MAX_STRING_SIZE]); + static BUFFER: RefCell<[usize; MAX_STRING_SIZE]> = const { RefCell::new([0usize; MAX_STRING_SIZE]) }; } if a == b { diff --git a/vm/src/builtins/memory.rs b/vm/src/builtins/memory.rs index 2c436ca316..aca2114bf0 100644 --- a/vm/src/builtins/memory.rs +++ b/vm/src/builtins/memory.rs @@ -1047,7 +1047,7 @@ impl Hashable for PyMemoryView { } Ok(zelf.contiguous_or_collect(|bytes| vm.state.hash_secret.hash_bytes(bytes))) }) - .map(|&x| x) + .copied() } } diff --git a/vm/src/stdlib/os.rs b/vm/src/stdlib/os.rs index 376c18fb3a..bd76c8ed95 100644 --- a/vm/src/stdlib/os.rs +++ b/vm/src/stdlib/os.rs @@ -698,7 +698,7 @@ pub(super) mod _os { if self.is_symlink(vm)? { do_stat(true) } else { - lstat().map(Clone::clone) + lstat().cloned() } })? 
} else { From df363c0ba7613e54588f1f4b4b60981eb49d516f Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Thu, 21 Mar 2024 21:26:40 -0400 Subject: [PATCH 116/705] Skip typing test which causes other failures (#5207) --- Lib/test/test_typing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/test_typing.py b/Lib/test/test_typing.py index 95fd3748e6..b6a167f998 100644 --- a/Lib/test/test_typing.py +++ b/Lib/test/test_typing.py @@ -1466,6 +1466,10 @@ def __new__(cls, *args): with self.assertRaises(TypeError): C[int](a=42) + # TODO: RUSTPYTHON the last line breaks any tests that use unittest.mock + # See https://github.com/RustPython/RustPython/issues/5190#issuecomment-2010535802 + # It's possible that updating typing to 3.12 will resolve this + @unittest.skip("TODO: RUSTPYTHON this test breaks other tests that use unittest.mock") def test_protocols_bad_subscripts(self): T = TypeVar('T') S = TypeVar('S') From 1dd9a2fbe45f7f34f5c505d5e9774603bd34d6bb Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Fri, 22 Mar 2024 11:28:49 +0900 Subject: [PATCH 117/705] suppress clippy warnings --- vm/sre_engine/src/engine.rs | 5 +++++ vm/sre_engine/src/string.rs | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vm/sre_engine/src/engine.rs b/vm/sre_engine/src/engine.rs index 34f00234e5..fb7d766e29 100644 --- a/vm/sre_engine/src/engine.rs +++ b/vm/sre_engine/src/engine.rs @@ -283,6 +283,8 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex let mut context_stack = vec![]; let mut popped_result = false; + // NOTE: 'result loop is not an actual loop but break label + #[allow(clippy::never_loop)] 'coro: loop { popped_result = 'result: loop { let yielded = 'context: loop { @@ -513,6 +515,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex loop { macro_rules! 
general_op_literal { ($f:expr) => {{ + #[allow(clippy::redundant_closure_call)] if ctx.at_end(req) || !$f(ctx.peek_code(req, 1), ctx.peek_char::()) { break 'result false; } @@ -523,6 +526,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex macro_rules! general_op_in { ($f:expr) => {{ + #[allow(clippy::redundant_closure_call)] if ctx.at_end(req) || !$f(&ctx.pattern(req)[2..], ctx.peek_char::()) { break 'result false; @@ -551,6 +555,7 @@ fn _match(req: &Request, state: &mut State, mut ctx: MatchContex }; for _ in group_start..group_end { + #[allow(clippy::redundant_closure_call)] if ctx.at_end(req) || $f(ctx.peek_char::()) != $f(gctx.peek_char::()) { diff --git a/vm/sre_engine/src/string.rs b/vm/sre_engine/src/string.rs index 1340c37423..e3f14ef019 100644 --- a/vm/sre_engine/src/string.rs +++ b/vm/sre_engine/src/string.rs @@ -101,7 +101,7 @@ impl StrDrive for &str { #[inline] fn adjust_cursor(&self, cursor: &mut StringCursor, n: usize) { if cursor.ptr.is_null() || cursor.position > n { - *cursor = Self::create_cursor(&self, n); + *cursor = Self::create_cursor(self, n); } else if cursor.position < n { Self::skip(cursor, n - cursor.position); } From 280337a305b66d7bb9da13efa48f33dba9980766 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 18 Nov 2023 15:29:58 +0200 Subject: [PATCH 118/705] Add Lib/re/* from CPython 3.12 --- Lib/{re.py => re/__init__.py} | 221 ++++--- Lib/re/_casefix.py | 106 ++++ Lib/re/_compiler.py | 763 +++++++++++++++++++++++ Lib/re/_constants.py | 219 +++++++ Lib/re/_parser.py | 1081 +++++++++++++++++++++++++++++++++ 5 files changed, 2301 insertions(+), 89 deletions(-) rename Lib/{re.py => re/__init__.py} (69%) create mode 100644 Lib/re/_casefix.py create mode 100644 Lib/re/_compiler.py create mode 100644 Lib/re/_constants.py create mode 100644 Lib/re/_parser.py diff --git a/Lib/re.py b/Lib/re/__init__.py similarity index 69% rename from Lib/re.py rename to Lib/re/__init__.py index bfb7b1ccd9..428d1b0d5f 100644 --- a/Lib/re.py 
+++ b/Lib/re/__init__.py @@ -122,65 +122,40 @@ """ import enum -import sre_compile -import sre_parse +from . import _compiler, _parser import functools -try: - import _locale -except ImportError: - _locale = None +import _sre # public symbols __all__ = [ "match", "fullmatch", "search", "sub", "subn", "split", - "findall", "finditer", "compile", "purge", "template", "escape", + "findall", "finditer", "compile", "purge", "escape", "error", "Pattern", "Match", "A", "I", "L", "M", "S", "X", "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", - "UNICODE", + "UNICODE", "NOFLAG", "RegexFlag", ] __version__ = "2.2.1" -class RegexFlag(enum.IntFlag): - ASCII = A = sre_compile.SRE_FLAG_ASCII # assume ascii "locale" - IGNORECASE = I = sre_compile.SRE_FLAG_IGNORECASE # ignore case - LOCALE = L = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale - UNICODE = U = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale" - MULTILINE = M = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline - DOTALL = S = sre_compile.SRE_FLAG_DOTALL # make dot match newline - VERBOSE = X = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments +@enum.global_enum +@enum._simple_enum(enum.IntFlag, boundary=enum.KEEP) +class RegexFlag: + NOFLAG = 0 + ASCII = A = _compiler.SRE_FLAG_ASCII # assume ascii "locale" + IGNORECASE = I = _compiler.SRE_FLAG_IGNORECASE # ignore case + LOCALE = L = _compiler.SRE_FLAG_LOCALE # assume current 8-bit locale + UNICODE = U = _compiler.SRE_FLAG_UNICODE # assume unicode "locale" + MULTILINE = M = _compiler.SRE_FLAG_MULTILINE # make anchors look for newline + DOTALL = S = _compiler.SRE_FLAG_DOTALL # make dot match newline + VERBOSE = X = _compiler.SRE_FLAG_VERBOSE # ignore whitespace and comments # sre extensions (experimental, don't rely on these) - TEMPLATE = T = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking - DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation - - def __repr__(self): - if self._name_ is 
not None: - return f're.{self._name_}' - value = self._value_ - members = [] - negative = value < 0 - if negative: - value = ~value - for m in self.__class__: - if value & m._value_: - value &= ~m._value_ - members.append(f're.{m._name_}') - if value: - members.append(hex(value)) - res = '|'.join(members) - if negative: - if len(members) > 1: - res = f'~({res})' - else: - res = f'~{res}' - return res + DEBUG = _compiler.SRE_FLAG_DEBUG # dump pattern after compilation __str__ = object.__str__ - -globals().update(RegexFlag.__members__) + _numeric_repr_ = hex # sre exception -error = sre_compile.error +error = _compiler.error # -------------------------------------------------------------------- # public interface @@ -200,16 +175,39 @@ def search(pattern, string, flags=0): a Match object, or None if no match was found.""" return _compile(pattern, flags).search(string) -def sub(pattern, repl, string, count=0, flags=0): +class _ZeroSentinel(int): + pass +_zero_sentinel = _ZeroSentinel() + +def sub(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel): """Return the string obtained by replacing the leftmost non-overlapping occurrences of the pattern in string by the replacement repl. repl can be either a string or a callable; if a string, backslash escapes in it are processed. 
If it is a callable, it's passed the Match object and must return a replacement string to be used.""" + if args: + if count is not _zero_sentinel: + raise TypeError("sub() got multiple values for argument 'count'") + count, *args = args + if args: + if flags is not _zero_sentinel: + raise TypeError("sub() got multiple values for argument 'flags'") + flags, *args = args + if args: + raise TypeError("sub() takes from 3 to 5 positional arguments " + "but %d were given" % (5 + len(args))) + + import warnings + warnings.warn( + "'count' is passed as positional argument", + DeprecationWarning, stacklevel=2 + ) + return _compile(pattern, flags).sub(repl, string, count) +sub.__text_signature__ = '(pattern, repl, string, count=0, flags=0)' -def subn(pattern, repl, string, count=0, flags=0): +def subn(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel): """Return a 2-tuple containing (new_string, number). new_string is the string obtained by replacing the leftmost non-overlapping occurrences of the pattern in the source @@ -218,9 +216,28 @@ def subn(pattern, repl, string, count=0, flags=0): callable; if a string, backslash escapes in it are processed. 
If it is a callable, it's passed the Match object and must return a replacement string to be used.""" + if args: + if count is not _zero_sentinel: + raise TypeError("subn() got multiple values for argument 'count'") + count, *args = args + if args: + if flags is not _zero_sentinel: + raise TypeError("subn() got multiple values for argument 'flags'") + flags, *args = args + if args: + raise TypeError("subn() takes from 3 to 5 positional arguments " + "but %d were given" % (5 + len(args))) + + import warnings + warnings.warn( + "'count' is passed as positional argument", + DeprecationWarning, stacklevel=2 + ) + return _compile(pattern, flags).subn(repl, string, count) +subn.__text_signature__ = '(pattern, repl, string, count=0, flags=0)' -def split(pattern, string, maxsplit=0, flags=0): +def split(pattern, string, *args, maxsplit=_zero_sentinel, flags=_zero_sentinel): """Split the source string by the occurrences of the pattern, returning a list containing the resulting substrings. If capturing parentheses are used in pattern, then the text of all @@ -228,7 +245,26 @@ def split(pattern, string, maxsplit=0, flags=0): list. 
If maxsplit is nonzero, at most maxsplit splits occur, and the remainder of the string is returned as the final element of the list.""" + if args: + if maxsplit is not _zero_sentinel: + raise TypeError("split() got multiple values for argument 'maxsplit'") + maxsplit, *args = args + if args: + if flags is not _zero_sentinel: + raise TypeError("split() got multiple values for argument 'flags'") + flags, *args = args + if args: + raise TypeError("split() takes from 2 to 4 positional arguments " + "but %d were given" % (4 + len(args))) + + import warnings + warnings.warn( + "'maxsplit' is passed as positional argument", + DeprecationWarning, stacklevel=2 + ) + return _compile(pattern, flags).split(string, maxsplit) +split.__text_signature__ = '(pattern, string, maxsplit=0, flags=0)' def findall(pattern, string, flags=0): """Return a list of all non-overlapping matches in the string. @@ -254,11 +290,9 @@ def compile(pattern, flags=0): def purge(): "Clear the regular expression caches" _cache.clear() - _compile_repl.cache_clear() + _cache2.clear() + _compile_template.cache_clear() -def template(pattern, flags=0): - "Compile a template pattern, returning a Pattern object" - return _compile(pattern, flags|T) # SPECIAL_CHARS # closing ')', '}' and ']' @@ -277,60 +311,69 @@ def escape(pattern): pattern = str(pattern, 'latin1') return pattern.translate(_special_chars_map).encode('latin1') -Pattern = type(sre_compile.compile('', 0)) -Match = type(sre_compile.compile('', 0).match('')) +Pattern = type(_compiler.compile('', 0)) +Match = type(_compiler.compile('', 0).match('')) # -------------------------------------------------------------------- # internals -_cache = {} # ordered! - +# Use the fact that dict keeps the insertion order. +# _cache2 uses the simple FIFO policy which has better latency. +# _cache uses the LRU policy which has better hit rate. 
+_cache = {} # LRU +_cache2 = {} # FIFO _MAXCACHE = 512 +_MAXCACHE2 = 256 +assert _MAXCACHE2 < _MAXCACHE + def _compile(pattern, flags): # internal: compile pattern if isinstance(flags, RegexFlag): flags = flags.value try: - return _cache[type(pattern), pattern, flags] + return _cache2[type(pattern), pattern, flags] except KeyError: pass - if isinstance(pattern, Pattern): - if flags: - raise ValueError( - "cannot process flags argument with a compiled pattern") - return pattern - if not sre_compile.isstring(pattern): - raise TypeError("first argument must be string or compiled pattern") - p = sre_compile.compile(pattern, flags) - if not (flags & DEBUG): + + key = (type(pattern), pattern, flags) + # Item in _cache should be moved to the end if found. + p = _cache.pop(key, None) + if p is None: + if isinstance(pattern, Pattern): + if flags: + raise ValueError( + "cannot process flags argument with a compiled pattern") + return pattern + if not _compiler.isstring(pattern): + raise TypeError("first argument must be string or compiled pattern") + p = _compiler.compile(pattern, flags) + if flags & DEBUG: + return p if len(_cache) >= _MAXCACHE: - # Drop the oldest item + # Drop the least recently used item. + # next(iter(_cache)) is known to have linear amortized time, + # but it is used here to avoid a dependency from using OrderedDict. + # For the small _MAXCACHE value it doesn't make much of a difference. try: del _cache[next(iter(_cache))] except (StopIteration, RuntimeError, KeyError): pass - _cache[type(pattern), pattern, flags] = p + # Append to the end. + _cache[key] = p + + if len(_cache2) >= _MAXCACHE2: + # Drop the oldest item. 
+ try: + del _cache2[next(iter(_cache2))] + except (StopIteration, RuntimeError, KeyError): + pass + _cache2[key] = p return p @functools.lru_cache(_MAXCACHE) -def _compile_repl(repl, pattern): +def _compile_template(pattern, repl): # internal: compile replacement pattern - return sre_parse.parse_template(repl, pattern) - -def _expand(pattern, match, template): - # internal: Match.expand implementation hook - template = sre_parse.parse_template(template, pattern) - return sre_parse.expand_template(template, match) - -def _subx(pattern, template): - # internal: Pattern.sub/subn implementation helper - template = _compile_repl(template, pattern) - if not template[0] and len(template[1]) == 1: - # literal replacement - return template[1][0] - def filter(match, template=template): - return sre_parse.expand_template(template, match) - return filter + return _sre.template(pattern, _parser.parse_template(repl, pattern)) # register myself for pickling @@ -346,22 +389,22 @@ def _pickle(p): class Scanner: def __init__(self, lexicon, flags=0): - from sre_constants import BRANCH, SUBPATTERN + from ._constants import BRANCH, SUBPATTERN if isinstance(flags, RegexFlag): flags = flags.value self.lexicon = lexicon # combine phrases into a compound pattern p = [] - s = sre_parse.State() + s = _parser.State() s.flags = flags for phrase, action in lexicon: gid = s.opengroup() - p.append(sre_parse.SubPattern(s, [ - (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))), + p.append(_parser.SubPattern(s, [ + (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))), ])) s.closegroup(gid, p[-1]) - p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) - self.scanner = sre_compile.compile(p) + p = _parser.SubPattern(s, [(BRANCH, (None, p))]) + self.scanner = _compiler.compile(p) def scan(self, string): result = [] append = result.append diff --git a/Lib/re/_casefix.py b/Lib/re/_casefix.py new file mode 100644 index 0000000000..06507d08be --- /dev/null +++ b/Lib/re/_casefix.py @@ -0,0 +1,106 
@@ +# Auto-generated by Tools/scripts/generate_re_casefix.py. + +# Maps the code of lowercased character to codes of different lowercased +# characters which have the same uppercase. +_EXTRA_CASES = { + # LATIN SMALL LETTER I: LATIN SMALL LETTER DOTLESS I + 0x0069: (0x0131,), # 'i': 'ı' + # LATIN SMALL LETTER S: LATIN SMALL LETTER LONG S + 0x0073: (0x017f,), # 's': 'ſ' + # MICRO SIGN: GREEK SMALL LETTER MU + 0x00b5: (0x03bc,), # 'µ': 'μ' + # LATIN SMALL LETTER DOTLESS I: LATIN SMALL LETTER I + 0x0131: (0x0069,), # 'ı': 'i' + # LATIN SMALL LETTER LONG S: LATIN SMALL LETTER S + 0x017f: (0x0073,), # 'ſ': 's' + # COMBINING GREEK YPOGEGRAMMENI: GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI + 0x0345: (0x03b9, 0x1fbe), # '\u0345': 'ιι' + # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA + 0x0390: (0x1fd3,), # 'ΐ': 'ΐ' + # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA + 0x03b0: (0x1fe3,), # 'ΰ': 'ΰ' + # GREEK SMALL LETTER BETA: GREEK BETA SYMBOL + 0x03b2: (0x03d0,), # 'β': 'ϐ' + # GREEK SMALL LETTER EPSILON: GREEK LUNATE EPSILON SYMBOL + 0x03b5: (0x03f5,), # 'ε': 'ϵ' + # GREEK SMALL LETTER THETA: GREEK THETA SYMBOL + 0x03b8: (0x03d1,), # 'θ': 'ϑ' + # GREEK SMALL LETTER IOTA: COMBINING GREEK YPOGEGRAMMENI, GREEK PROSGEGRAMMENI + 0x03b9: (0x0345, 0x1fbe), # 'ι': '\u0345ι' + # GREEK SMALL LETTER KAPPA: GREEK KAPPA SYMBOL + 0x03ba: (0x03f0,), # 'κ': 'ϰ' + # GREEK SMALL LETTER MU: MICRO SIGN + 0x03bc: (0x00b5,), # 'μ': 'µ' + # GREEK SMALL LETTER PI: GREEK PI SYMBOL + 0x03c0: (0x03d6,), # 'π': 'ϖ' + # GREEK SMALL LETTER RHO: GREEK RHO SYMBOL + 0x03c1: (0x03f1,), # 'ρ': 'ϱ' + # GREEK SMALL LETTER FINAL SIGMA: GREEK SMALL LETTER SIGMA + 0x03c2: (0x03c3,), # 'ς': 'σ' + # GREEK SMALL LETTER SIGMA: GREEK SMALL LETTER FINAL SIGMA + 0x03c3: (0x03c2,), # 'σ': 'ς' + # GREEK SMALL LETTER PHI: GREEK PHI SYMBOL + 0x03c6: (0x03d5,), # 'φ': 'ϕ' + # GREEK BETA SYMBOL: GREEK SMALL LETTER 
BETA + 0x03d0: (0x03b2,), # 'ϐ': 'β' + # GREEK THETA SYMBOL: GREEK SMALL LETTER THETA + 0x03d1: (0x03b8,), # 'ϑ': 'θ' + # GREEK PHI SYMBOL: GREEK SMALL LETTER PHI + 0x03d5: (0x03c6,), # 'ϕ': 'φ' + # GREEK PI SYMBOL: GREEK SMALL LETTER PI + 0x03d6: (0x03c0,), # 'ϖ': 'π' + # GREEK KAPPA SYMBOL: GREEK SMALL LETTER KAPPA + 0x03f0: (0x03ba,), # 'ϰ': 'κ' + # GREEK RHO SYMBOL: GREEK SMALL LETTER RHO + 0x03f1: (0x03c1,), # 'ϱ': 'ρ' + # GREEK LUNATE EPSILON SYMBOL: GREEK SMALL LETTER EPSILON + 0x03f5: (0x03b5,), # 'ϵ': 'ε' + # CYRILLIC SMALL LETTER VE: CYRILLIC SMALL LETTER ROUNDED VE + 0x0432: (0x1c80,), # 'в': 'ᲀ' + # CYRILLIC SMALL LETTER DE: CYRILLIC SMALL LETTER LONG-LEGGED DE + 0x0434: (0x1c81,), # 'д': 'ᲁ' + # CYRILLIC SMALL LETTER O: CYRILLIC SMALL LETTER NARROW O + 0x043e: (0x1c82,), # 'о': 'ᲂ' + # CYRILLIC SMALL LETTER ES: CYRILLIC SMALL LETTER WIDE ES + 0x0441: (0x1c83,), # 'с': 'ᲃ' + # CYRILLIC SMALL LETTER TE: CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE + 0x0442: (0x1c84, 0x1c85), # 'т': 'ᲄᲅ' + # CYRILLIC SMALL LETTER HARD SIGN: CYRILLIC SMALL LETTER TALL HARD SIGN + 0x044a: (0x1c86,), # 'ъ': 'ᲆ' + # CYRILLIC SMALL LETTER YAT: CYRILLIC SMALL LETTER TALL YAT + 0x0463: (0x1c87,), # 'ѣ': 'ᲇ' + # CYRILLIC SMALL LETTER ROUNDED VE: CYRILLIC SMALL LETTER VE + 0x1c80: (0x0432,), # 'ᲀ': 'в' + # CYRILLIC SMALL LETTER LONG-LEGGED DE: CYRILLIC SMALL LETTER DE + 0x1c81: (0x0434,), # 'ᲁ': 'д' + # CYRILLIC SMALL LETTER NARROW O: CYRILLIC SMALL LETTER O + 0x1c82: (0x043e,), # 'ᲂ': 'о' + # CYRILLIC SMALL LETTER WIDE ES: CYRILLIC SMALL LETTER ES + 0x1c83: (0x0441,), # 'ᲃ': 'с' + # CYRILLIC SMALL LETTER TALL TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER THREE-LEGGED TE + 0x1c84: (0x0442, 0x1c85), # 'ᲄ': 'тᲅ' + # CYRILLIC SMALL LETTER THREE-LEGGED TE: CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE + 0x1c85: (0x0442, 0x1c84), # 'ᲅ': 'тᲄ' + # CYRILLIC SMALL LETTER TALL HARD SIGN: CYRILLIC SMALL LETTER HARD SIGN + 0x1c86: (0x044a,), # 'ᲆ': 'ъ' 
+ # CYRILLIC SMALL LETTER TALL YAT: CYRILLIC SMALL LETTER YAT + 0x1c87: (0x0463,), # 'ᲇ': 'ѣ' + # CYRILLIC SMALL LETTER UNBLENDED UK: CYRILLIC SMALL LETTER MONOGRAPH UK + 0x1c88: (0xa64b,), # 'ᲈ': 'ꙋ' + # LATIN SMALL LETTER S WITH DOT ABOVE: LATIN SMALL LETTER LONG S WITH DOT ABOVE + 0x1e61: (0x1e9b,), # 'ṡ': 'ẛ' + # LATIN SMALL LETTER LONG S WITH DOT ABOVE: LATIN SMALL LETTER S WITH DOT ABOVE + 0x1e9b: (0x1e61,), # 'ẛ': 'ṡ' + # GREEK PROSGEGRAMMENI: COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA + 0x1fbe: (0x0345, 0x03b9), # 'ι': '\u0345ι' + # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + 0x1fd3: (0x0390,), # 'ΐ': 'ΐ' + # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA: GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + 0x1fe3: (0x03b0,), # 'ΰ': 'ΰ' + # CYRILLIC SMALL LETTER MONOGRAPH UK: CYRILLIC SMALL LETTER UNBLENDED UK + 0xa64b: (0x1c88,), # 'ꙋ': 'ᲈ' + # LATIN SMALL LIGATURE LONG S T: LATIN SMALL LIGATURE ST + 0xfb05: (0xfb06,), # 'ſt': 'st' + # LATIN SMALL LIGATURE ST: LATIN SMALL LIGATURE LONG S T + 0xfb06: (0xfb05,), # 'st': 'ſt' +} diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py new file mode 100644 index 0000000000..f87712d6d6 --- /dev/null +++ b/Lib/re/_compiler.py @@ -0,0 +1,763 @@ +# +# Secret Labs' Regular Expression Engine +# +# convert template to internal format +# +# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. +# +# See the __init__.py file for information on usage and redistribution. +# + +"""Internal support module for sre""" + +import _sre +from . 
import _parser +from ._constants import * +from ._casefix import _EXTRA_CASES + +assert _sre.MAGIC == MAGIC, "SRE module mismatch" + +_LITERAL_CODES = {LITERAL, NOT_LITERAL} +_SUCCESS_CODES = {SUCCESS, FAILURE} +_ASSERT_CODES = {ASSERT, ASSERT_NOT} +_UNIT_CODES = _LITERAL_CODES | {ANY, IN} + +_REPEATING_CODES = { + MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), + MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE), + POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE), +} + +def _combine_flags(flags, add_flags, del_flags, + TYPE_FLAGS=_parser.TYPE_FLAGS): + if add_flags & TYPE_FLAGS: + flags &= ~TYPE_FLAGS + return (flags | add_flags) & ~del_flags + +def _compile(code, pattern, flags): + # internal: compile a (sub)pattern + emit = code.append + _len = len + LITERAL_CODES = _LITERAL_CODES + REPEATING_CODES = _REPEATING_CODES + SUCCESS_CODES = _SUCCESS_CODES + ASSERT_CODES = _ASSERT_CODES + iscased = None + tolower = None + fixes = None + if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: + if flags & SRE_FLAG_UNICODE: + iscased = _sre.unicode_iscased + tolower = _sre.unicode_tolower + fixes = _EXTRA_CASES + else: + iscased = _sre.ascii_iscased + tolower = _sre.ascii_tolower + for op, av in pattern: + if op in LITERAL_CODES: + if not flags & SRE_FLAG_IGNORECASE: + emit(op) + emit(av) + elif flags & SRE_FLAG_LOCALE: + emit(OP_LOCALE_IGNORE[op]) + emit(av) + elif not iscased(av): + emit(op) + emit(av) + else: + lo = tolower(av) + if not fixes: # ascii + emit(OP_IGNORE[op]) + emit(lo) + elif lo not in fixes: + emit(OP_UNICODE_IGNORE[op]) + emit(lo) + else: + emit(IN_UNI_IGNORE) + skip = _len(code); emit(0) + if op is NOT_LITERAL: + emit(NEGATE) + for k in (lo,) + fixes[lo]: + emit(LITERAL) + emit(k) + emit(FAILURE) + code[skip] = _len(code) - skip + elif op is IN: + charset, hascased = _optimize_charset(av, iscased, tolower, fixes) + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + emit(IN_LOC_IGNORE) + elif not hascased: + 
emit(IN) + elif not fixes: # ascii + emit(IN_IGNORE) + else: + emit(IN_UNI_IGNORE) + skip = _len(code); emit(0) + _compile_charset(charset, flags, code) + code[skip] = _len(code) - skip + elif op is ANY: + if flags & SRE_FLAG_DOTALL: + emit(ANY_ALL) + else: + emit(ANY) + elif op in REPEATING_CODES: + if _simple(av[2]): + emit(REPEATING_CODES[op][2]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + else: + emit(REPEATING_CODES[op][0]) + skip = _len(code); emit(0) + emit(av[0]) + emit(av[1]) + _compile(code, av[2], flags) + code[skip] = _len(code) - skip + emit(REPEATING_CODES[op][1]) + elif op is SUBPATTERN: + group, add_flags, del_flags, p = av + if group: + emit(MARK) + emit((group-1)*2) + # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) + _compile(code, p, _combine_flags(flags, add_flags, del_flags)) + if group: + emit(MARK) + emit((group-1)*2+1) + elif op is ATOMIC_GROUP: + # Atomic Groups are handled by starting with an Atomic + # Group op code, then putting in the atomic group pattern + # and finally a success op code to tell any repeat + # operations within the Atomic Group to stop eating and + # pop their stack if they reach it + emit(ATOMIC_GROUP) + skip = _len(code); emit(0) + _compile(code, av, flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + elif op in SUCCESS_CODES: + emit(op) + elif op in ASSERT_CODES: + emit(op) + skip = _len(code); emit(0) + if av[0] >= 0: + emit(0) # look ahead + else: + lo, hi = av[1].getwidth() + if lo > MAXCODE: + raise error("looks too much behind") + if lo != hi: + raise error("look-behind requires fixed-width pattern") + emit(lo) # look behind + _compile(code, av[1], flags) + emit(SUCCESS) + code[skip] = _len(code) - skip + elif op is AT: + emit(op) + if flags & SRE_FLAG_MULTILINE: + av = AT_MULTILINE.get(av, av) + if flags & SRE_FLAG_LOCALE: + av = AT_LOCALE.get(av, av) + elif flags & SRE_FLAG_UNICODE: + av 
= AT_UNICODE.get(av, av) + emit(av) + elif op is BRANCH: + emit(op) + tail = [] + tailappend = tail.append + for av in av[1]: + skip = _len(code); emit(0) + # _compile_info(code, av, flags) + _compile(code, av, flags) + emit(JUMP) + tailappend(_len(code)); emit(0) + code[skip] = _len(code) - skip + emit(FAILURE) # end of branch + for tail in tail: + code[tail] = _len(code) - tail + elif op is CATEGORY: + emit(op) + if flags & SRE_FLAG_LOCALE: + av = CH_LOCALE[av] + elif flags & SRE_FLAG_UNICODE: + av = CH_UNICODE[av] + emit(av) + elif op is GROUPREF: + if not flags & SRE_FLAG_IGNORECASE: + emit(op) + elif flags & SRE_FLAG_LOCALE: + emit(GROUPREF_LOC_IGNORE) + elif not fixes: # ascii + emit(GROUPREF_IGNORE) + else: + emit(GROUPREF_UNI_IGNORE) + emit(av-1) + elif op is GROUPREF_EXISTS: + emit(op) + emit(av[0]-1) + skipyes = _len(code); emit(0) + _compile(code, av[1], flags) + if av[2]: + emit(JUMP) + skipno = _len(code); emit(0) + code[skipyes] = _len(code) - skipyes + 1 + _compile(code, av[2], flags) + code[skipno] = _len(code) - skipno + else: + code[skipyes] = _len(code) - skipyes + 1 + else: + raise error("internal: unsupported operand type %r" % (op,)) + +def _compile_charset(charset, flags, code): + # compile charset subprogram + emit = code.append + for op, av in charset: + emit(op) + if op is NEGATE: + pass + elif op is LITERAL: + emit(av) + elif op is RANGE or op is RANGE_UNI_IGNORE: + emit(av[0]) + emit(av[1]) + elif op is CHARSET: + code.extend(av) + elif op is BIGCHARSET: + code.extend(av) + elif op is CATEGORY: + if flags & SRE_FLAG_LOCALE: + emit(CH_LOCALE[av]) + elif flags & SRE_FLAG_UNICODE: + emit(CH_UNICODE[av]) + else: + emit(av) + else: + raise error("internal: unsupported set operator %r" % (op,)) + emit(FAILURE) + +def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): + # internal: optimize character set + out = [] + tail = [] + charmap = bytearray(256) + hascased = False + for op, av in charset: + while True: + try: + if op is 
LITERAL: + if fixup: + lo = fixup(av) + charmap[lo] = 1 + if fixes and lo in fixes: + for k in fixes[lo]: + charmap[k] = 1 + if not hascased and iscased(av): + hascased = True + else: + charmap[av] = 1 + elif op is RANGE: + r = range(av[0], av[1]+1) + if fixup: + if fixes: + for i in map(fixup, r): + charmap[i] = 1 + if i in fixes: + for k in fixes[i]: + charmap[k] = 1 + else: + for i in map(fixup, r): + charmap[i] = 1 + if not hascased: + hascased = any(map(iscased, r)) + else: + for i in r: + charmap[i] = 1 + elif op is NEGATE: + out.append((op, av)) + else: + tail.append((op, av)) + except IndexError: + if len(charmap) == 256: + # character set contains non-UCS1 character codes + charmap += b'\0' * 0xff00 + continue + # Character set contains non-BMP character codes. + # For range, all BMP characters in the range are already + # proceeded. + if fixup: + hascased = True + # For now, IN_UNI_IGNORE+LITERAL and + # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP + # characters, because two characters (at least one of + # which is not in the BMP) match case-insensitively + # if and only if: + # 1) c1.lower() == c2.lower() + # 2) c1.lower() == c2 or c1.lower().upper() == c2 + # Also, both c.lower() and c.lower().upper() are single + # characters for every non-BMP character. 
+ if op is RANGE: + op = RANGE_UNI_IGNORE + tail.append((op, av)) + break + + # compress character map + runs = [] + q = 0 + while True: + p = charmap.find(1, q) + if p < 0: + break + if len(runs) >= 2: + runs = None + break + q = charmap.find(0, p) + if q < 0: + runs.append((p, len(charmap))) + break + runs.append((p, q)) + if runs is not None: + # use literal/range + for p, q in runs: + if q - p == 1: + out.append((LITERAL, p)) + else: + out.append((RANGE, (p, q - 1))) + out += tail + # if the case was changed or new representation is more compact + if hascased or len(out) < len(charset): + return out, hascased + # else original character set is good enough + return charset, hascased + + # use bitmap + if len(charmap) == 256: + data = _mk_bitmap(charmap) + out.append((CHARSET, data)) + out += tail + return out, hascased + + # To represent a big charset, first a bitmap of all characters in the + # set is constructed. Then, this bitmap is sliced into chunks of 256 + # characters, duplicate chunks are eliminated, and each chunk is + # given a number. In the compiled expression, the charset is + # represented by a 32-bit word sequence, consisting of one word for + # the number of different chunks, a sequence of 256 bytes (64 words) + # of chunk numbers indexed by their original chunk position, and a + # sequence of 256-bit chunks (8 words each). + + # Compression is normally good: in a typical charset, large ranges of + # Unicode will be either completely excluded (e.g. if only cyrillic + # letters are to be matched), or completely included (e.g. if large + # subranges of Kanji match). These ranges will be represented by + # chunks of all one-bits or all zero-bits. + + # Matching can be also done efficiently: the more significant byte of + # the Unicode character is an index into the chunk number, and the + # less significant byte is a bit index in the chunk (just like the + # CHARSET matching). 
+ + charmap = bytes(charmap) # should be hashable + comps = {} + mapping = bytearray(256) + block = 0 + data = bytearray() + for i in range(0, 65536, 256): + chunk = charmap[i: i + 256] + if chunk in comps: + mapping[i // 256] = comps[chunk] + else: + mapping[i // 256] = comps[chunk] = block + block += 1 + data += chunk + data = _mk_bitmap(data) + data[0:0] = [block] + _bytes_to_codes(mapping) + out.append((BIGCHARSET, data)) + out += tail + return out, hascased + +_CODEBITS = _sre.CODESIZE * 8 +MAXCODE = (1 << _CODEBITS) - 1 +_BITS_TRANS = b'0' + b'1' * 255 +def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): + s = bits.translate(_BITS_TRANS)[::-1] + return [_int(s[i - _CODEBITS: i], 2) + for i in range(len(s), 0, -_CODEBITS)] + +def _bytes_to_codes(b): + # Convert block indices to word array + a = memoryview(b).cast('I') + assert a.itemsize == _sre.CODESIZE + assert len(a) * a.itemsize == len(b) + return a.tolist() + +def _simple(p): + # check if this subpattern is a "simple" operator + if len(p) != 1: + return False + op, av = p[0] + if op is SUBPATTERN: + return av[0] is None and _simple(av[-1]) + return op in _UNIT_CODES + +def _generate_overlap_table(prefix): + """ + Generate an overlap table for the following prefix. + An overlap table is a table of the same size as the prefix which + informs about the potential self-overlap for each index in the prefix: + - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...] 
+ - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with + prefix[0:k] + """ + table = [0] * len(prefix) + for i in range(1, len(prefix)): + idx = table[i - 1] + while prefix[i] != prefix[idx]: + if idx == 0: + table[i] = 0 + break + idx = table[idx - 1] + else: + table[i] = idx + 1 + return table + +def _get_iscased(flags): + if not flags & SRE_FLAG_IGNORECASE: + return None + elif flags & SRE_FLAG_UNICODE: + return _sre.unicode_iscased + else: + return _sre.ascii_iscased + +def _get_literal_prefix(pattern, flags): + # look for literal prefix + prefix = [] + prefixappend = prefix.append + prefix_skip = None + iscased = _get_iscased(flags) + for op, av in pattern.data: + if op is LITERAL: + if iscased and iscased(av): + break + prefixappend(av) + elif op is SUBPATTERN: + group, add_flags, del_flags, p = av + flags1 = _combine_flags(flags, add_flags, del_flags) + if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: + break + prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) + if prefix_skip is None: + if group is not None: + prefix_skip = len(prefix) + elif prefix_skip1 is not None: + prefix_skip = len(prefix) + prefix_skip1 + prefix.extend(prefix1) + if not got_all: + break + else: + break + else: + return prefix, prefix_skip, True + return prefix, prefix_skip, False + +def _get_charset_prefix(pattern, flags): + while True: + if not pattern.data: + return None + op, av = pattern.data[0] + if op is not SUBPATTERN: + break + group, add_flags, del_flags, pattern = av + flags = _combine_flags(flags, add_flags, del_flags) + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + return None + + iscased = _get_iscased(flags) + if op is LITERAL: + if iscased and iscased(av): + return None + return [(op, av)] + elif op is BRANCH: + charset = [] + charsetappend = charset.append + for p in av[1]: + if not p: + return None + op, av = p[0] + if op is LITERAL and not (iscased and iscased(av)): + charsetappend((op, av)) + else: + 
return None + return charset + elif op is IN: + charset = av + if iscased: + for op, av in charset: + if op is LITERAL: + if iscased(av): + return None + elif op is RANGE: + if av[1] > 0xffff: + return None + if any(map(iscased, range(av[0], av[1]+1))): + return None + return charset + return None + +def _compile_info(code, pattern, flags): + # internal: compile an info block. in the current version, + # this contains min/max pattern width, and an optional literal + # prefix or a character map + lo, hi = pattern.getwidth() + if hi > MAXCODE: + hi = MAXCODE + if lo == 0: + code.extend([INFO, 4, 0, lo, hi]) + return + # look for a literal prefix + prefix = [] + prefix_skip = 0 + charset = [] # not used + if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): + # look for literal prefix + prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) + # if no prefix, look for charset prefix + if not prefix: + charset = _get_charset_prefix(pattern, flags) +## if prefix: +## print("*** PREFIX", prefix, prefix_skip) +## if charset: +## print("*** CHARSET", charset) + # add an info block + emit = code.append + emit(INFO) + skip = len(code); emit(0) + # literal flag + mask = 0 + if prefix: + mask = SRE_INFO_PREFIX + if prefix_skip is None and got_all: + mask = mask | SRE_INFO_LITERAL + elif charset: + mask = mask | SRE_INFO_CHARSET + emit(mask) + # pattern length + if lo < MAXCODE: + emit(lo) + else: + emit(MAXCODE) + prefix = prefix[:MAXCODE] + emit(hi) + # add literal prefix + if prefix: + emit(len(prefix)) # length + if prefix_skip is None: + prefix_skip = len(prefix) + emit(prefix_skip) # skip + code.extend(prefix) + # generate overlap table + code.extend(_generate_overlap_table(prefix)) + elif charset: + charset, hascased = _optimize_charset(charset) + assert not hascased + _compile_charset(charset, flags, code) + code[skip] = len(code) - skip + +def isstring(obj): + return isinstance(obj, (str, bytes)) + +def _code(p, flags): + + flags = p.state.flags 
| flags + code = [] + + # compile info block + _compile_info(code, p, flags) + + # compile the pattern + _compile(code, p.data, flags) + + code.append(SUCCESS) + + return code + +def _hex_code(code): + return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) + +def dis(code): + import sys + + labels = set() + level = 0 + offset_width = len(str(len(code) - 1)) + + def dis_(start, end): + def print_(*args, to=None): + if to is not None: + labels.add(to) + args += ('(to %d)' % (to,),) + print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'), + end=' '*(level-1)) + print(*args) + + def print_2(*args): + print(end=' '*(offset_width + 2*level)) + print(*args) + + nonlocal level + level += 1 + i = start + while i < end: + start = i + op = code[i] + i += 1 + op = OPCODES[op] + if op in (SUCCESS, FAILURE, ANY, ANY_ALL, + MAX_UNTIL, MIN_UNTIL, NEGATE): + print_(op) + elif op in (LITERAL, NOT_LITERAL, + LITERAL_IGNORE, NOT_LITERAL_IGNORE, + LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE, + LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE): + arg = code[i] + i += 1 + print_(op, '%#02x (%r)' % (arg, chr(arg))) + elif op is AT: + arg = code[i] + i += 1 + arg = str(ATCODES[arg]) + assert arg[:3] == 'AT_' + print_(op, arg[3:]) + elif op is CATEGORY: + arg = code[i] + i += 1 + arg = str(CHCODES[arg]) + assert arg[:9] == 'CATEGORY_' + print_(op, arg[9:]) + elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE): + skip = code[i] + print_(op, skip, to=i+skip) + dis_(i+1, i+skip) + i += skip + elif op in (RANGE, RANGE_UNI_IGNORE): + lo, hi = code[i: i+2] + i += 2 + print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) + elif op is CHARSET: + print_(op, _hex_code(code[i: i + 256//_CODEBITS])) + i += 256//_CODEBITS + elif op is BIGCHARSET: + arg = code[i] + i += 1 + mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder) + for x in code[i: i + 256//_sre.CODESIZE])) + print_(op, arg, mapping) + i += 256//_sre.CODESIZE + level += 1 + for j in 
range(arg): + print_2(_hex_code(code[i: i + 256//_CODEBITS])) + i += 256//_CODEBITS + level -= 1 + elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, + GROUPREF_LOC_IGNORE): + arg = code[i] + i += 1 + print_(op, arg) + elif op is JUMP: + skip = code[i] + print_(op, skip, to=i+skip) + i += 1 + elif op is BRANCH: + skip = code[i] + print_(op, skip, to=i+skip) + while skip: + dis_(i+1, i+skip) + i += skip + start = i + skip = code[i] + if skip: + print_('branch', skip, to=i+skip) + else: + print_(FAILURE) + i += 1 + elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, + POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): + skip, min, max = code[i: i+3] + if max == MAXREPEAT: + max = 'MAXREPEAT' + print_(op, skip, min, max, to=i+skip) + dis_(i+3, i+skip) + i += skip + elif op is GROUPREF_EXISTS: + arg, skip = code[i: i+2] + print_(op, arg, skip, to=i+skip) + i += 2 + elif op in (ASSERT, ASSERT_NOT): + skip, arg = code[i: i+2] + print_(op, skip, arg, to=i+skip) + dis_(i+2, i+skip) + i += skip + elif op is ATOMIC_GROUP: + skip = code[i] + print_(op, skip, to=i+skip) + dis_(i+1, i+skip) + i += skip + elif op is INFO: + skip, flags, min, max = code[i: i+4] + if max == MAXREPEAT: + max = 'MAXREPEAT' + print_(op, skip, bin(flags), min, max, to=i+skip) + start = i+4 + if flags & SRE_INFO_PREFIX: + prefix_len, prefix_skip = code[i+4: i+6] + print_2(' prefix_skip', prefix_skip) + start = i + 6 + prefix = code[start: start+prefix_len] + print_2(' prefix', + '[%s]' % ', '.join('%#02x' % x for x in prefix), + '(%r)' % ''.join(map(chr, prefix))) + start += prefix_len + print_2(' overlap', code[start: start+prefix_len]) + start += prefix_len + if flags & SRE_INFO_CHARSET: + level += 1 + print_2('in') + dis_(start, i+skip) + level -= 1 + i += skip + else: + raise ValueError(op) + + level -= 1 + + dis_(0, len(code)) + + +def compile(p, flags=0): + # internal: convert pattern list to internal format + + if isstring(p): + pattern = p + p = _parser.parse(p, flags) + else: + pattern = 
None + + code = _code(p, flags) + + if flags & SRE_FLAG_DEBUG: + print() + dis(code) + + # map in either direction + groupindex = p.state.groupdict + indexgroup = [None] * p.state.groups + for k, i in groupindex.items(): + indexgroup[i] = k + + return _sre.compile( + pattern, flags | p.state.flags, code, + p.state.groups-1, + groupindex, tuple(indexgroup) + ) diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py new file mode 100644 index 0000000000..d8e483ac4f --- /dev/null +++ b/Lib/re/_constants.py @@ -0,0 +1,219 @@ +# +# Secret Labs' Regular Expression Engine +# +# various symbols used by the regular expression engine. +# run this script to update the _sre include files! +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. +# +# See the __init__.py file for information on usage and redistribution. +# + +"""Internal support module for sre""" + +# update when constants are added or removed + +MAGIC = 20230612 + +from _sre import MAXREPEAT, MAXGROUPS + +# SRE standard exception (access as sre.error) +# should this really be here? + +class error(Exception): + """Exception raised for invalid regular expressions. 
+ + Attributes: + + msg: The unformatted error message + pattern: The regular expression pattern + pos: The index in the pattern where compilation failed (may be None) + lineno: The line corresponding to pos (may be None) + colno: The column corresponding to pos (may be None) + """ + + __module__ = 're' + + def __init__(self, msg, pattern=None, pos=None): + self.msg = msg + self.pattern = pattern + self.pos = pos + if pattern is not None and pos is not None: + msg = '%s at position %d' % (msg, pos) + if isinstance(pattern, str): + newline = '\n' + else: + newline = b'\n' + self.lineno = pattern.count(newline, 0, pos) + 1 + self.colno = pos - pattern.rfind(newline, 0, pos) + if newline in pattern: + msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno) + else: + self.lineno = self.colno = None + super().__init__(msg) + + +class _NamedIntConstant(int): + def __new__(cls, value, name): + self = super(_NamedIntConstant, cls).__new__(cls, value) + self.name = name + return self + + def __repr__(self): + return self.name + + __reduce__ = None + +MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT') + +def _makecodes(*names): + items = [_NamedIntConstant(i, name) for i, name in enumerate(names)] + globals().update({item.name: item for item in items}) + return items + +# operators +OPCODES = _makecodes( + # failure=0 success=1 (just because it looks better that way :-) + 'FAILURE', 'SUCCESS', + + 'ANY', 'ANY_ALL', + 'ASSERT', 'ASSERT_NOT', + 'AT', + 'BRANCH', + 'CATEGORY', + 'CHARSET', 'BIGCHARSET', + 'GROUPREF', 'GROUPREF_EXISTS', + 'IN', + 'INFO', + 'JUMP', + 'LITERAL', + 'MARK', + 'MAX_UNTIL', + 'MIN_UNTIL', + 'NOT_LITERAL', + 'NEGATE', + 'RANGE', + 'REPEAT', + 'REPEAT_ONE', + 'SUBPATTERN', + 'MIN_REPEAT_ONE', + 'ATOMIC_GROUP', + 'POSSESSIVE_REPEAT', + 'POSSESSIVE_REPEAT_ONE', + + 'GROUPREF_IGNORE', + 'IN_IGNORE', + 'LITERAL_IGNORE', + 'NOT_LITERAL_IGNORE', + + 'GROUPREF_LOC_IGNORE', + 'IN_LOC_IGNORE', + 'LITERAL_LOC_IGNORE', + 'NOT_LITERAL_LOC_IGNORE', + + 
'GROUPREF_UNI_IGNORE', + 'IN_UNI_IGNORE', + 'LITERAL_UNI_IGNORE', + 'NOT_LITERAL_UNI_IGNORE', + 'RANGE_UNI_IGNORE', + + # The following opcodes are only occurred in the parser output, + # but not in the compiled code. + 'MIN_REPEAT', 'MAX_REPEAT', +) +del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT + +# positions +ATCODES = _makecodes( + 'AT_BEGINNING', 'AT_BEGINNING_LINE', 'AT_BEGINNING_STRING', + 'AT_BOUNDARY', 'AT_NON_BOUNDARY', + 'AT_END', 'AT_END_LINE', 'AT_END_STRING', + + 'AT_LOC_BOUNDARY', 'AT_LOC_NON_BOUNDARY', + + 'AT_UNI_BOUNDARY', 'AT_UNI_NON_BOUNDARY', +) + +# categories +CHCODES = _makecodes( + 'CATEGORY_DIGIT', 'CATEGORY_NOT_DIGIT', + 'CATEGORY_SPACE', 'CATEGORY_NOT_SPACE', + 'CATEGORY_WORD', 'CATEGORY_NOT_WORD', + 'CATEGORY_LINEBREAK', 'CATEGORY_NOT_LINEBREAK', + + 'CATEGORY_LOC_WORD', 'CATEGORY_LOC_NOT_WORD', + + 'CATEGORY_UNI_DIGIT', 'CATEGORY_UNI_NOT_DIGIT', + 'CATEGORY_UNI_SPACE', 'CATEGORY_UNI_NOT_SPACE', + 'CATEGORY_UNI_WORD', 'CATEGORY_UNI_NOT_WORD', + 'CATEGORY_UNI_LINEBREAK', 'CATEGORY_UNI_NOT_LINEBREAK', +) + + +# replacement operations for "ignore case" mode +OP_IGNORE = { + LITERAL: LITERAL_IGNORE, + NOT_LITERAL: NOT_LITERAL_IGNORE, +} + +OP_LOCALE_IGNORE = { + LITERAL: LITERAL_LOC_IGNORE, + NOT_LITERAL: NOT_LITERAL_LOC_IGNORE, +} + +OP_UNICODE_IGNORE = { + LITERAL: LITERAL_UNI_IGNORE, + NOT_LITERAL: NOT_LITERAL_UNI_IGNORE, +} + +AT_MULTILINE = { + AT_BEGINNING: AT_BEGINNING_LINE, + AT_END: AT_END_LINE +} + +AT_LOCALE = { + AT_BOUNDARY: AT_LOC_BOUNDARY, + AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY +} + +AT_UNICODE = { + AT_BOUNDARY: AT_UNI_BOUNDARY, + AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY +} + +CH_LOCALE = { + CATEGORY_DIGIT: CATEGORY_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, + CATEGORY_WORD: CATEGORY_LOC_WORD, + CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, + CATEGORY_LINEBREAK: CATEGORY_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK +} + 
+CH_UNICODE = { + CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_UNI_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, + CATEGORY_WORD: CATEGORY_UNI_WORD, + CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, + CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK +} + +# flags +SRE_FLAG_IGNORECASE = 2 # case insensitive +SRE_FLAG_LOCALE = 4 # honour system locale +SRE_FLAG_MULTILINE = 8 # treat target as multiline string +SRE_FLAG_DOTALL = 16 # treat target as a single string +SRE_FLAG_UNICODE = 32 # use unicode "locale" +SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments +SRE_FLAG_DEBUG = 128 # debugging +SRE_FLAG_ASCII = 256 # use ascii "locale" + +# flags for INFO primitive +SRE_INFO_PREFIX = 1 # has prefix +SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) +SRE_INFO_CHARSET = 4 # pattern starts with character from given set diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py new file mode 100644 index 0000000000..f3c779340f --- /dev/null +++ b/Lib/re/_parser.py @@ -0,0 +1,1081 @@ +# +# Secret Labs' Regular Expression Engine +# +# convert re-style regular expression to sre pattern +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. +# +# See the __init__.py file for information on usage and redistribution. 
+# + +"""Internal support module for sre""" + +# XXX: show string offset and offending character for all errors + +from ._constants import * + +SPECIAL_CHARS = ".\\[{()*+?^$|" +REPEAT_CHARS = "*+?{" + +DIGITS = frozenset("0123456789") + +OCTDIGITS = frozenset("01234567") +HEXDIGITS = frozenset("0123456789abcdefABCDEF") +ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") + +WHITESPACE = frozenset(" \t\n\r\v\f") + +_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) +_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) + +ESCAPES = { + r"\a": (LITERAL, ord("\a")), + r"\b": (LITERAL, ord("\b")), + r"\f": (LITERAL, ord("\f")), + r"\n": (LITERAL, ord("\n")), + r"\r": (LITERAL, ord("\r")), + r"\t": (LITERAL, ord("\t")), + r"\v": (LITERAL, ord("\v")), + r"\\": (LITERAL, ord("\\")) +} + +CATEGORIES = { + r"\A": (AT, AT_BEGINNING_STRING), # start of string + r"\b": (AT, AT_BOUNDARY), + r"\B": (AT, AT_NON_BOUNDARY), + r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), + r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), + r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), + r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), + r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), + r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), + r"\Z": (AT, AT_END_STRING), # end of string +} + +FLAGS = { + # standard flags + "i": SRE_FLAG_IGNORECASE, + "L": SRE_FLAG_LOCALE, + "m": SRE_FLAG_MULTILINE, + "s": SRE_FLAG_DOTALL, + "x": SRE_FLAG_VERBOSE, + # extensions + "a": SRE_FLAG_ASCII, + "u": SRE_FLAG_UNICODE, +} + +TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE +GLOBAL_FLAGS = SRE_FLAG_DEBUG + +# Maximal value returned by SubPattern.getwidth(). +# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize. 
+MAXWIDTH = 1 << 64 + +class State: + # keeps track of state for parsing + def __init__(self): + self.flags = 0 + self.groupdict = {} + self.groupwidths = [None] # group 0 + self.lookbehindgroups = None + self.grouprefpos = {} + @property + def groups(self): + return len(self.groupwidths) + def opengroup(self, name=None): + gid = self.groups + self.groupwidths.append(None) + if self.groups > MAXGROUPS: + raise error("too many groups") + if name is not None: + ogid = self.groupdict.get(name, None) + if ogid is not None: + raise error("redefinition of group name %r as group %d; " + "was group %d" % (name, gid, ogid)) + self.groupdict[name] = gid + return gid + def closegroup(self, gid, p): + self.groupwidths[gid] = p.getwidth() + def checkgroup(self, gid): + return gid < self.groups and self.groupwidths[gid] is not None + + def checklookbehindgroup(self, gid, source): + if self.lookbehindgroups is not None: + if not self.checkgroup(gid): + raise source.error('cannot refer to an open group') + if gid >= self.lookbehindgroups: + raise source.error('cannot refer to group defined in the same ' + 'lookbehind subpattern') + +class SubPattern: + # a subpattern, in intermediate form + def __init__(self, state, data=None): + self.state = state + if data is None: + data = [] + self.data = data + self.width = None + + def dump(self, level=0): + seqtypes = (tuple, list) + for op, av in self.data: + print(level*" " + str(op), end='') + if op is IN: + # member sublanguage + print() + for op, a in av: + print((level+1)*" " + str(op), a) + elif op is BRANCH: + print() + for i, a in enumerate(av[1]): + if i: + print(level*" " + "OR") + a.dump(level+1) + elif op is GROUPREF_EXISTS: + condgroup, item_yes, item_no = av + print('', condgroup) + item_yes.dump(level+1) + if item_no: + print(level*" " + "ELSE") + item_no.dump(level+1) + elif isinstance(av, SubPattern): + print() + av.dump(level+1) + elif isinstance(av, seqtypes): + nl = False + for a in av: + if isinstance(a, SubPattern): + 
if not nl: + print() + a.dump(level+1) + nl = True + else: + if not nl: + print(' ', end='') + print(a, end='') + nl = False + if not nl: + print() + else: + print('', av) + def __repr__(self): + return repr(self.data) + def __len__(self): + return len(self.data) + def __delitem__(self, index): + del self.data[index] + def __getitem__(self, index): + if isinstance(index, slice): + return SubPattern(self.state, self.data[index]) + return self.data[index] + def __setitem__(self, index, code): + self.data[index] = code + def insert(self, index, code): + self.data.insert(index, code) + def append(self, code): + self.data.append(code) + def getwidth(self): + # determine the width (min, max) for this subpattern + if self.width is not None: + return self.width + lo = hi = 0 + for op, av in self.data: + if op is BRANCH: + i = MAXWIDTH + j = 0 + for av in av[1]: + l, h = av.getwidth() + i = min(i, l) + j = max(j, h) + lo = lo + i + hi = hi + j + elif op is ATOMIC_GROUP: + i, j = av.getwidth() + lo = lo + i + hi = hi + j + elif op is SUBPATTERN: + i, j = av[-1].getwidth() + lo = lo + i + hi = hi + j + elif op in _REPEATCODES: + i, j = av[2].getwidth() + lo = lo + i * av[0] + if av[1] == MAXREPEAT and j: + hi = MAXWIDTH + else: + hi = hi + j * av[1] + elif op in _UNITCODES: + lo = lo + 1 + hi = hi + 1 + elif op is GROUPREF: + i, j = self.state.groupwidths[av] + lo = lo + i + hi = hi + j + elif op is GROUPREF_EXISTS: + i, j = av[1].getwidth() + if av[2] is not None: + l, h = av[2].getwidth() + i = min(i, l) + j = max(j, h) + else: + i = 0 + lo = lo + i + hi = hi + j + elif op is SUCCESS: + break + self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH) + return self.width + +class Tokenizer: + def __init__(self, string): + self.istext = isinstance(string, str) + self.string = string + if not self.istext: + string = str(string, 'latin1') + self.decoded_string = string + self.index = 0 + self.next = None + self.__next() + def __next(self): + index = self.index + try: + char = 
self.decoded_string[index] + except IndexError: + self.next = None + return + if char == "\\": + index += 1 + try: + char += self.decoded_string[index] + except IndexError: + raise error("bad escape (end of pattern)", + self.string, len(self.string) - 1) from None + self.index = index + 1 + self.next = char + def match(self, char): + if char == self.next: + self.__next() + return True + return False + def get(self): + this = self.next + self.__next() + return this + def getwhile(self, n, charset): + result = '' + for _ in range(n): + c = self.next + if c not in charset: + break + result += c + self.__next() + return result + def getuntil(self, terminator, name): + result = '' + while True: + c = self.next + self.__next() + if c is None: + if not result: + raise self.error("missing " + name) + raise self.error("missing %s, unterminated name" % terminator, + len(result)) + if c == terminator: + if not result: + raise self.error("missing " + name, 1) + break + result += c + return result + @property + def pos(self): + return self.index - len(self.next or '') + def tell(self): + return self.index - len(self.next or '') + def seek(self, index): + self.index = index + self.__next() + + def error(self, msg, offset=0): + if not self.istext: + msg = msg.encode('ascii', 'backslashreplace').decode('ascii') + return error(msg, self.string, self.tell() - offset) + + def checkgroupname(self, name, offset): + if not (self.istext or name.isascii()): + msg = "bad character in group name %a" % name + raise self.error(msg, len(name) + offset) + if not name.isidentifier(): + msg = "bad character in group name %r" % name + raise self.error(msg, len(name) + offset) + +def _class_escape(source, escape): + # handle escape code inside character class + code = ESCAPES.get(escape) + if code: + return code + code = CATEGORIES.get(escape) + if code and code[0] is IN: + return code + try: + c = escape[1:2] + if c == "x": + # hexadecimal escape (exactly two digits) + escape += source.getwhile(2, 
HEXDIGITS) + if len(escape) != 4: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise source.error("incomplete escape %s" % escape, len(escape)) + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c + elif c == "N" and source.istext: + import unicodedata + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except (KeyError, TypeError): + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) from None + return LITERAL, c + elif c in OCTDIGITS: + # octal escape (up to three digits) + escape += source.getwhile(2, OCTDIGITS) + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %s outside of ' + 'range 0-0o377' % escape, len(escape)) + return LITERAL, c + elif c in DIGITS: + raise ValueError + if len(escape) == 2: + if c in ASCIILETTERS: + raise source.error('bad escape %s' % escape, len(escape)) + return LITERAL, ord(escape[1]) + except ValueError: + pass + raise source.error("bad escape %s" % escape, len(escape)) + +def _escape(source, escape, state): + # handle escape code in expression + code = CATEGORIES.get(escape) + if code: + return code + code = ESCAPES.get(escape) + if code: + return code + try: + c = escape[1:2] + if c == "x": + # hexadecimal escape + escape += source.getwhile(2, HEXDIGITS) + if len(escape) != 4: + raise source.error("incomplete escape %s" % escape, 
len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise source.error("incomplete escape %s" % escape, len(escape)) + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c + elif c == "N" and source.istext: + import unicodedata + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except (KeyError, TypeError): + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) from None + return LITERAL, c + elif c == "0": + # octal escape + escape += source.getwhile(2, OCTDIGITS) + return LITERAL, int(escape[1:], 8) + elif c in DIGITS: + # octal escape *or* decimal group reference (sigh) + if source.next in DIGITS: + escape += source.get() + if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and + source.next in OCTDIGITS): + # got three octal digits; this is an octal escape + escape += source.get() + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %s outside of ' + 'range 0-0o377' % escape, + len(escape)) + return LITERAL, c + # not an octal escape, so this is a group reference + group = int(escape[1:]) + if group < state.groups: + if not state.checkgroup(group): + raise source.error("cannot refer to an open group", + len(escape)) + state.checklookbehindgroup(group, source) + return GROUPREF, group + raise source.error("invalid group reference %d" % group, len(escape) - 1) + if len(escape) == 2: + if c in ASCIILETTERS: + raise 
source.error("bad escape %s" % escape, len(escape)) + return LITERAL, ord(escape[1]) + except ValueError: + pass + raise source.error("bad escape %s" % escape, len(escape)) + +def _uniq(items): + return list(dict.fromkeys(items)) + +def _parse_sub(source, state, verbose, nested): + # parse an alternation: a|b|c + + items = [] + itemsappend = items.append + sourcematch = source.match + start = source.tell() + while True: + itemsappend(_parse(source, state, verbose, nested + 1, + not nested and not items)) + if not sourcematch("|"): + break + if not nested: + verbose = state.flags & SRE_FLAG_VERBOSE + + if len(items) == 1: + return items[0] + + subpattern = SubPattern(state) + + # check if all items share a common prefix + while True: + prefix = None + for item in items: + if not item: + break + if prefix is None: + prefix = item[0] + elif item[0] != prefix: + break + else: + # all subitems start with a common "prefix". + # move it out of the branch + for item in items: + del item[0] + subpattern.append(prefix) + continue # check next one + break + + # check if the branch can be replaced by a character set + set = [] + for item in items: + if len(item) != 1: + break + op, av = item[0] + if op is LITERAL: + set.append((op, av)) + elif op is IN and av[0][0] is not NEGATE: + set.extend(av) + else: + break + else: + # we can store this as a character set instead of a + # branch (the compiler may optimize this even more) + subpattern.append((IN, _uniq(set))) + return subpattern + + subpattern.append((BRANCH, (None, items))) + return subpattern + +def _parse(source, state, verbose, nested, first=False): + # parse a simple pattern + subpattern = SubPattern(state) + + # precompute constants into local variables + subpatternappend = subpattern.append + sourceget = source.get + sourcematch = source.match + _len = len + _ord = ord + + while True: + + this = source.next + if this is None: + break # end of pattern + if this in "|)": + break # end of subpattern + sourceget() + + 
if verbose: + # skip whitespace and comments + if this in WHITESPACE: + continue + if this == "#": + while True: + this = sourceget() + if this is None or this == "\n": + break + continue + + if this[0] == "\\": + code = _escape(source, this, state) + subpatternappend(code) + + elif this not in SPECIAL_CHARS: + subpatternappend((LITERAL, _ord(this))) + + elif this == "[": + here = source.tell() - 1 + # character set + set = [] + setappend = set.append +## if sourcematch(":"): +## pass # handle character classes + if source.next == '[': + import warnings + warnings.warn( + 'Possible nested set at position %d' % source.tell(), + FutureWarning, stacklevel=nested + 6 + ) + negate = sourcematch("^") + # check remaining characters + while True: + this = sourceget() + if this is None: + raise source.error("unterminated character set", + source.tell() - here) + if this == "]" and set: + break + elif this[0] == "\\": + code1 = _class_escape(source, this) + else: + if set and this in '-&~|' and source.next == this: + import warnings + warnings.warn( + 'Possible set %s at position %d' % ( + 'difference' if this == '-' else + 'intersection' if this == '&' else + 'symmetric difference' if this == '~' else + 'union', + source.tell() - 1), + FutureWarning, stacklevel=nested + 6 + ) + code1 = LITERAL, _ord(this) + if sourcematch("-"): + # potential range + that = sourceget() + if that is None: + raise source.error("unterminated character set", + source.tell() - here) + if that == "]": + if code1[0] is IN: + code1 = code1[1][0] + setappend(code1) + setappend((LITERAL, _ord("-"))) + break + if that[0] == "\\": + code2 = _class_escape(source, that) + else: + if that == '-': + import warnings + warnings.warn( + 'Possible set difference at position %d' % ( + source.tell() - 2), + FutureWarning, stacklevel=nested + 6 + ) + code2 = LITERAL, _ord(that) + if code1[0] != LITERAL or code2[0] != LITERAL: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 
1 + len(that)) + lo = code1[1] + hi = code2[1] + if hi < lo: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 1 + len(that)) + setappend((RANGE, (lo, hi))) + else: + if code1[0] is IN: + code1 = code1[1][0] + setappend(code1) + + set = _uniq(set) + # XXX: should move set optimization to compiler! + if _len(set) == 1 and set[0][0] is LITERAL: + # optimization + if negate: + subpatternappend((NOT_LITERAL, set[0][1])) + else: + subpatternappend(set[0]) + else: + if negate: + set.insert(0, (NEGATE, None)) + # charmap optimization can't be added here because + # global flags still are not known + subpatternappend((IN, set)) + + elif this in REPEAT_CHARS: + # repeat previous item + here = source.tell() + if this == "?": + min, max = 0, 1 + elif this == "*": + min, max = 0, MAXREPEAT + + elif this == "+": + min, max = 1, MAXREPEAT + elif this == "{": + if source.next == "}": + subpatternappend((LITERAL, _ord(this))) + continue + + min, max = 0, MAXREPEAT + lo = hi = "" + while source.next in DIGITS: + lo += sourceget() + if sourcematch(","): + while source.next in DIGITS: + hi += sourceget() + else: + hi = lo + if not sourcematch("}"): + subpatternappend((LITERAL, _ord(this))) + source.seek(here) + continue + + if lo: + min = int(lo) + if min >= MAXREPEAT: + raise OverflowError("the repetition number is too large") + if hi: + max = int(hi) + if max >= MAXREPEAT: + raise OverflowError("the repetition number is too large") + if max < min: + raise source.error("min repeat greater than max repeat", + source.tell() - here) + else: + raise AssertionError("unsupported quantifier %r" % (char,)) + # figure out which item to repeat + if subpattern: + item = subpattern[-1:] + else: + item = None + if not item or item[0][0] is AT: + raise source.error("nothing to repeat", + source.tell() - here + len(this)) + if item[0][0] in _REPEATCODES: + raise source.error("multiple repeat", + source.tell() - here + len(this)) + if item[0][0] is SUBPATTERN: 
+ group, add_flags, del_flags, p = item[0][1] + if group is None and not add_flags and not del_flags: + item = p + if sourcematch("?"): + # Non-Greedy Match + subpattern[-1] = (MIN_REPEAT, (min, max, item)) + elif sourcematch("+"): + # Possessive Match (Always Greedy) + subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) + else: + # Greedy Match + subpattern[-1] = (MAX_REPEAT, (min, max, item)) + + elif this == ".": + subpatternappend((ANY, None)) + + elif this == "(": + start = source.tell() - 1 + capture = True + atomic = False + name = None + add_flags = 0 + del_flags = 0 + if sourcematch("?"): + # options + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + if char == "P": + # python extensions + if sourcematch("<"): + # named group: skip forward to end of name + name = source.getuntil(">", "group name") + source.checkgroupname(name, 1) + elif sourcematch("="): + # named backreference + name = source.getuntil(")", "group name") + source.checkgroupname(name, 1) + gid = state.groupdict.get(name) + if gid is None: + msg = "unknown group name %r" % name + raise source.error(msg, len(name) + 1) + if not state.checkgroup(gid): + raise source.error("cannot refer to an open group", + len(name) + 1) + state.checklookbehindgroup(gid, source) + subpatternappend((GROUPREF, gid)) + continue + + else: + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + raise source.error("unknown extension ?P" + char, + len(char) + 2) + elif char == ":": + # non-capturing group + capture = False + elif char == "#": + # comment + while True: + if source.next is None: + raise source.error("missing ), unterminated comment", + source.tell() - start) + if sourceget() == ")": + break + continue + + elif char in "=!<": + # lookahead assertions + dir = 1 + if char == "<": + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + if char not in "=!": + raise source.error("unknown 
extension ?<" + char, + len(char) + 2) + dir = -1 # lookbehind + lookbehindgroups = state.lookbehindgroups + if lookbehindgroups is None: + state.lookbehindgroups = state.groups + p = _parse_sub(source, state, verbose, nested + 1) + if dir < 0: + if lookbehindgroups is None: + state.lookbehindgroups = None + if not sourcematch(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + if char == "=": + subpatternappend((ASSERT, (dir, p))) + elif p: + subpatternappend((ASSERT_NOT, (dir, p))) + else: + subpatternappend((FAILURE, ())) + continue + + elif char == "(": + # conditional backreference group + condname = source.getuntil(")", "group name") + if not (condname.isdecimal() and condname.isascii()): + source.checkgroupname(condname, 1) + condgroup = state.groupdict.get(condname) + if condgroup is None: + msg = "unknown group name %r" % condname + raise source.error(msg, len(condname) + 1) + else: + condgroup = int(condname) + if not condgroup: + raise source.error("bad group number", + len(condname) + 1) + if condgroup >= MAXGROUPS: + msg = "invalid group reference %d" % condgroup + raise source.error(msg, len(condname) + 1) + if condgroup not in state.grouprefpos: + state.grouprefpos[condgroup] = ( + source.tell() - len(condname) - 1 + ) + if not (condname.isdecimal() and condname.isascii()): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(condname) if source.istext else ascii(condname), + source.tell() - len(condname) - 1), + DeprecationWarning, stacklevel=nested + 6 + ) + state.checklookbehindgroup(condgroup, source) + item_yes = _parse(source, state, verbose, nested + 1) + if source.match("|"): + item_no = _parse(source, state, verbose, nested + 1) + if source.next == "|": + raise source.error("conditional backref with more than two branches") + else: + item_no = None + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + 
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) + continue + + elif char == ">": + # non-capturing, atomic group + capture = False + atomic = True + elif char in FLAGS or char == "-": + # flags + flags = _parse_flags(source, state, char) + if flags is None: # global flags + if not first or subpattern: + raise source.error('global flags not at the start ' + 'of the expression', + source.tell() - start) + verbose = state.flags & SRE_FLAG_VERBOSE + continue + + add_flags, del_flags = flags + capture = False + else: + raise source.error("unknown extension ?" + char, + len(char) + 1) + + # parse group contents + if capture: + try: + group = state.opengroup(name) + except error as err: + raise source.error(err.msg, len(name) + 1) from None + else: + group = None + sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and + not (del_flags & SRE_FLAG_VERBOSE)) + p = _parse_sub(source, state, sub_verbose, nested + 1) + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + if group is not None: + state.closegroup(group, p) + if atomic: + assert group is None + subpatternappend((ATOMIC_GROUP, p)) + else: + subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) + + elif this == "^": + subpatternappend((AT, AT_BEGINNING)) + + elif this == "$": + subpatternappend((AT, AT_END)) + + else: + raise AssertionError("unsupported special character %r" % (char,)) + + # unpack non-capturing groups + for i in range(len(subpattern))[::-1]: + op, av = subpattern[i] + if op is SUBPATTERN: + group, add_flags, del_flags, p = av + if group is None and not add_flags and not del_flags: + subpattern[i: i+1] = p + + return subpattern + +def _parse_flags(source, state, char): + sourceget = source.get + add_flags = 0 + del_flags = 0 + if char != "-": + while True: + flag = FLAGS[char] + if source.istext: + if char == 'L': + msg = "bad inline flags: cannot use 'L' flag with a str pattern" + raise 
source.error(msg) + else: + if char == 'u': + msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" + raise source.error(msg) + add_flags |= flag + if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: + msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" + raise source.error(msg) + char = sourceget() + if char is None: + raise source.error("missing -, : or )") + if char in ")-:": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing -, : or )" + raise source.error(msg, len(char)) + if char == ")": + state.flags |= add_flags + return None + if add_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn on global flag", 1) + if char == "-": + char = sourceget() + if char is None: + raise source.error("missing flag") + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing flag" + raise source.error(msg, len(char)) + while True: + flag = FLAGS[char] + if flag & TYPE_FLAGS: + msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" + raise source.error(msg) + del_flags |= flag + char = sourceget() + if char is None: + raise source.error("missing :") + if char == ":": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing :" + raise source.error(msg, len(char)) + assert char == ":" + if del_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn off global flag", 1) + if add_flags & del_flags: + raise source.error("bad inline flags: flag turned on and off", 1) + return add_flags, del_flags + +def fix_flags(src, flags): + # Check and fix flags according to the type of pattern (str or bytes) + if isinstance(src, str): + if flags & SRE_FLAG_LOCALE: + raise ValueError("cannot use LOCALE flag with a str pattern") + if not flags & SRE_FLAG_ASCII: + flags |= SRE_FLAG_UNICODE + elif flags & SRE_FLAG_UNICODE: + raise ValueError("ASCII and UNICODE flags are incompatible") + else: + if flags & SRE_FLAG_UNICODE: + 
raise ValueError("cannot use UNICODE flag with a bytes pattern") + if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: + raise ValueError("ASCII and LOCALE flags are incompatible") + return flags + +def parse(str, flags=0, state=None): + # parse 're' pattern into list of (opcode, argument) tuples + + source = Tokenizer(str) + + if state is None: + state = State() + state.flags = flags + state.str = str + + p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0) + p.state.flags = fix_flags(str, p.state.flags) + + if source.next is not None: + assert source.next == ")" + raise source.error("unbalanced parenthesis") + + for g in p.state.grouprefpos: + if g >= p.state.groups: + msg = "invalid group reference %d" % g + raise error(msg, str, p.state.grouprefpos[g]) + + if flags & SRE_FLAG_DEBUG: + p.dump() + + return p + +def parse_template(source, pattern): + # parse 're' replacement string into list of literals and + # group references + s = Tokenizer(source) + sget = s.get + result = [] + literal = [] + lappend = literal.append + def addliteral(): + if s.istext: + result.append(''.join(literal)) + else: + # The tokenizer implicitly decodes bytes objects as latin-1, we must + # therefore re-encode the final representation. 
+ result.append(''.join(literal).encode('latin-1')) + del literal[:] + def addgroup(index, pos): + if index > pattern.groups: + raise s.error("invalid group reference %d" % index, pos) + addliteral() + result.append(index) + groupindex = pattern.groupindex + while True: + this = sget() + if this is None: + break # end of replacement string + if this[0] == "\\": + # group + c = this[1] + if c == "g": + if not s.match("<"): + raise s.error("missing <") + name = s.getuntil(">", "group name") + if not (name.isdecimal() and name.isascii()): + s.checkgroupname(name, 1) + try: + index = groupindex[name] + except KeyError: + raise IndexError("unknown group name %r" % name) from None + else: + index = int(name) + if index >= MAXGROUPS: + raise s.error("invalid group reference %d" % index, + len(name) + 1) + if not (name.isdecimal() and name.isascii()): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(name) if s.istext else ascii(name), + s.tell() - len(name) - 1), + DeprecationWarning, stacklevel=5 + ) + addgroup(index, len(name) + 1) + elif c == "0": + if s.next in OCTDIGITS: + this += sget() + if s.next in OCTDIGITS: + this += sget() + lappend(chr(int(this[1:], 8) & 0xff)) + elif c in DIGITS: + isoctal = False + if s.next in DIGITS: + this += sget() + if (c in OCTDIGITS and this[2] in OCTDIGITS and + s.next in OCTDIGITS): + this += sget() + isoctal = True + c = int(this[1:], 8) + if c > 0o377: + raise s.error('octal escape value %s outside of ' + 'range 0-0o377' % this, len(this)) + lappend(chr(c)) + if not isoctal: + addgroup(int(this[1:]), len(this) - 1) + else: + try: + this = chr(ESCAPES[this][1]) + except KeyError: + if c in ASCIILETTERS: + raise s.error('bad escape %s' % this, len(this)) from None + lappend(this) + else: + lappend(this) + addliteral() + return result From ebe555203a09cf55c981177c9b1cbd6489076a19 Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 18 Nov 2023 20:47:42 +0200 Subject: [PATCH 119/705] 
Replace Lib/sre_* from CPython --- Lib/re/_compiler.py | 3 + Lib/re/_constants.py | 4 +- Lib/re/_parser.py | 7 +- Lib/sre_compile.py | 789 +------------------------------ Lib/sre_constants.py | 220 +-------- Lib/sre_parse.py | 1069 +----------------------------------------- 6 files changed, 27 insertions(+), 2065 deletions(-) diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py index f87712d6d6..861bbdb130 100644 --- a/Lib/re/_compiler.py +++ b/Lib/re/_compiler.py @@ -101,6 +101,8 @@ def _compile(code, pattern, flags): else: emit(ANY) elif op in REPEATING_CODES: + if flags & SRE_FLAG_TEMPLATE: + raise error("internal: unsupported template operator %r" % (op,)) if _simple(av[2]): emit(REPEATING_CODES[op][2]) skip = _len(code); emit(0) @@ -761,3 +763,4 @@ def compile(p, flags=0): p.state.groups-1, groupindex, tuple(indexgroup) ) + diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py index d8e483ac4f..92494e385c 100644 --- a/Lib/re/_constants.py +++ b/Lib/re/_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20230612 +MAGIC = 20221023 from _sre import MAXREPEAT, MAXGROUPS @@ -204,6 +204,7 @@ def _makecodes(*names): } # flags +SRE_FLAG_TEMPLATE = 1 # template mode (unknown purpose, deprecated) SRE_FLAG_IGNORECASE = 2 # case insensitive SRE_FLAG_LOCALE = 4 # honour system locale SRE_FLAG_MULTILINE = 8 # treat target as multiline string @@ -217,3 +218,4 @@ def _makecodes(*names): SRE_INFO_PREFIX = 1 # has prefix SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) SRE_INFO_CHARSET = 4 # pattern starts with character from given set +RE_INFO_CHARSET = 4 # pattern starts with character from given set diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index f3c779340f..4a492b79e8 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -61,11 +61,12 @@ "x": SRE_FLAG_VERBOSE, # extensions "a": SRE_FLAG_ASCII, + "t": SRE_FLAG_TEMPLATE, "u": SRE_FLAG_UNICODE, } TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | 
SRE_FLAG_UNICODE -GLOBAL_FLAGS = SRE_FLAG_DEBUG +GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE # Maximal value returned by SubPattern.getwidth(). # Must be larger than MAXREPEAT, MAXCODE and sys.maxsize. @@ -780,10 +781,8 @@ def _parse(source, state, verbose, nested, first=False): source.tell() - start) if char == "=": subpatternappend((ASSERT, (dir, p))) - elif p: - subpatternappend((ASSERT_NOT, (dir, p))) else: - subpatternappend((FAILURE, ())) + subpatternappend((ASSERT_NOT, (dir, p))) continue elif char == "(": diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index c6398bfb83..f9da61e648 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -1,784 +1,7 @@ -# -# Secret Labs' Regular Expression Engine -# -# convert template to internal format -# -# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. -# -# See the sre.py file for information on usage and redistribution. -# +import warnings +warnings.warn(f"module {__name__!r} is deprecated", + DeprecationWarning, + stacklevel=2) -"""Internal support module for sre""" - -import _sre -import sre_parse -from sre_constants import * - -assert _sre.MAGIC == MAGIC, "SRE module mismatch" - -_LITERAL_CODES = {LITERAL, NOT_LITERAL} -_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} -_SUCCESS_CODES = {SUCCESS, FAILURE} -_ASSERT_CODES = {ASSERT, ASSERT_NOT} -_UNIT_CODES = _LITERAL_CODES | {ANY, IN} - -# Sets of lowercase characters which have the same uppercase. 
-_equivalences = ( - # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I - (0x69, 0x131), # iı - # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S - (0x73, 0x17f), # sſ - # MICRO SIGN, GREEK SMALL LETTER MU - (0xb5, 0x3bc), # µμ - # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI - (0x345, 0x3b9, 0x1fbe), # \u0345ιι - # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA - (0x390, 0x1fd3), # ΐΐ - # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA - (0x3b0, 0x1fe3), # ΰΰ - # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL - (0x3b2, 0x3d0), # βϐ - # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL - (0x3b5, 0x3f5), # εϵ - # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL - (0x3b8, 0x3d1), # θϑ - # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL - (0x3ba, 0x3f0), # κϰ - # GREEK SMALL LETTER PI, GREEK PI SYMBOL - (0x3c0, 0x3d6), # πϖ - # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL - (0x3c1, 0x3f1), # ρϱ - # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA - (0x3c2, 0x3c3), # ςσ - # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL - (0x3c6, 0x3d5), # φϕ - # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE - (0x1e61, 0x1e9b), # ṡẛ - # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST - (0xfb05, 0xfb06), # ſtst -) - -# Maps the lowercase code to lowercase codes which have the same uppercase. 
-_ignorecase_fixes = {i: tuple(j for j in t if i != j) - for t in _equivalences for i in t} - -def _combine_flags(flags, add_flags, del_flags, - TYPE_FLAGS=sre_parse.TYPE_FLAGS): - if add_flags & TYPE_FLAGS: - flags &= ~TYPE_FLAGS - return (flags | add_flags) & ~del_flags - -def _compile(code, pattern, flags): - # internal: compile a (sub)pattern - emit = code.append - _len = len - LITERAL_CODES = _LITERAL_CODES - REPEATING_CODES = _REPEATING_CODES - SUCCESS_CODES = _SUCCESS_CODES - ASSERT_CODES = _ASSERT_CODES - iscased = None - tolower = None - fixes = None - if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: - if flags & SRE_FLAG_UNICODE: - iscased = _sre.unicode_iscased - tolower = _sre.unicode_tolower - fixes = _ignorecase_fixes - else: - iscased = _sre.ascii_iscased - tolower = _sre.ascii_tolower - for op, av in pattern: - if op in LITERAL_CODES: - if not flags & SRE_FLAG_IGNORECASE: - emit(op) - emit(av) - elif flags & SRE_FLAG_LOCALE: - emit(OP_LOCALE_IGNORE[op]) - emit(av) - elif not iscased(av): - emit(op) - emit(av) - else: - lo = tolower(av) - if not fixes: # ascii - emit(OP_IGNORE[op]) - emit(lo) - elif lo not in fixes: - emit(OP_UNICODE_IGNORE[op]) - emit(lo) - else: - emit(IN_UNI_IGNORE) - skip = _len(code); emit(0) - if op is NOT_LITERAL: - emit(NEGATE) - for k in (lo,) + fixes[lo]: - emit(LITERAL) - emit(k) - emit(FAILURE) - code[skip] = _len(code) - skip - elif op is IN: - charset, hascased = _optimize_charset(av, iscased, tolower, fixes) - if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: - emit(IN_LOC_IGNORE) - elif not hascased: - emit(IN) - elif not fixes: # ascii - emit(IN_IGNORE) - else: - emit(IN_UNI_IGNORE) - skip = _len(code); emit(0) - _compile_charset(charset, flags, code) - code[skip] = _len(code) - skip - elif op is ANY: - if flags & SRE_FLAG_DOTALL: - emit(ANY_ALL) - else: - emit(ANY) - elif op in REPEATING_CODES: - if flags & SRE_FLAG_TEMPLATE: - raise error("internal: unsupported template operator %r" % 
(op,)) - if _simple(av[2]): - if op is MAX_REPEAT: - emit(REPEAT_ONE) - else: - emit(MIN_REPEAT_ONE) - skip = _len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - emit(SUCCESS) - code[skip] = _len(code) - skip - else: - emit(REPEAT) - skip = _len(code); emit(0) - emit(av[0]) - emit(av[1]) - _compile(code, av[2], flags) - code[skip] = _len(code) - skip - if op is MAX_REPEAT: - emit(MAX_UNTIL) - else: - emit(MIN_UNTIL) - elif op is SUBPATTERN: - group, add_flags, del_flags, p = av - if group: - emit(MARK) - emit((group-1)*2) - # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) - _compile(code, p, _combine_flags(flags, add_flags, del_flags)) - if group: - emit(MARK) - emit((group-1)*2+1) - elif op in SUCCESS_CODES: - emit(op) - elif op in ASSERT_CODES: - emit(op) - skip = _len(code); emit(0) - if av[0] >= 0: - emit(0) # look ahead - else: - lo, hi = av[1].getwidth() - if lo != hi: - raise error("look-behind requires fixed-width pattern") - emit(lo) # look behind - _compile(code, av[1], flags) - emit(SUCCESS) - code[skip] = _len(code) - skip - elif op is CALL: - emit(op) - skip = _len(code); emit(0) - _compile(code, av, flags) - emit(SUCCESS) - code[skip] = _len(code) - skip - elif op is AT: - emit(op) - if flags & SRE_FLAG_MULTILINE: - av = AT_MULTILINE.get(av, av) - if flags & SRE_FLAG_LOCALE: - av = AT_LOCALE.get(av, av) - elif flags & SRE_FLAG_UNICODE: - av = AT_UNICODE.get(av, av) - emit(av) - elif op is BRANCH: - emit(op) - tail = [] - tailappend = tail.append - for av in av[1]: - skip = _len(code); emit(0) - # _compile_info(code, av, flags) - _compile(code, av, flags) - emit(JUMP) - tailappend(_len(code)); emit(0) - code[skip] = _len(code) - skip - emit(FAILURE) # end of branch - for tail in tail: - code[tail] = _len(code) - tail - elif op is CATEGORY: - emit(op) - if flags & SRE_FLAG_LOCALE: - av = CH_LOCALE[av] - elif flags & SRE_FLAG_UNICODE: - av = CH_UNICODE[av] - emit(av) - elif op is GROUPREF: - if not flags 
& SRE_FLAG_IGNORECASE: - emit(op) - elif flags & SRE_FLAG_LOCALE: - emit(GROUPREF_LOC_IGNORE) - elif not fixes: # ascii - emit(GROUPREF_IGNORE) - else: - emit(GROUPREF_UNI_IGNORE) - emit(av-1) - elif op is GROUPREF_EXISTS: - emit(op) - emit(av[0]-1) - skipyes = _len(code); emit(0) - _compile(code, av[1], flags) - if av[2]: - emit(JUMP) - skipno = _len(code); emit(0) - code[skipyes] = _len(code) - skipyes + 1 - _compile(code, av[2], flags) - code[skipno] = _len(code) - skipno - else: - code[skipyes] = _len(code) - skipyes + 1 - else: - raise error("internal: unsupported operand type %r" % (op,)) - -def _compile_charset(charset, flags, code): - # compile charset subprogram - emit = code.append - for op, av in charset: - emit(op) - if op is NEGATE: - pass - elif op is LITERAL: - emit(av) - elif op is RANGE or op is RANGE_UNI_IGNORE: - emit(av[0]) - emit(av[1]) - elif op is CHARSET: - code.extend(av) - elif op is BIGCHARSET: - code.extend(av) - elif op is CATEGORY: - if flags & SRE_FLAG_LOCALE: - emit(CH_LOCALE[av]) - elif flags & SRE_FLAG_UNICODE: - emit(CH_UNICODE[av]) - else: - emit(av) - else: - raise error("internal: unsupported set operator %r" % (op,)) - emit(FAILURE) - -def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): - # internal: optimize character set - out = [] - tail = [] - charmap = bytearray(256) - hascased = False - for op, av in charset: - while True: - try: - if op is LITERAL: - if fixup: - lo = fixup(av) - charmap[lo] = 1 - if fixes and lo in fixes: - for k in fixes[lo]: - charmap[k] = 1 - if not hascased and iscased(av): - hascased = True - else: - charmap[av] = 1 - elif op is RANGE: - r = range(av[0], av[1]+1) - if fixup: - if fixes: - for i in map(fixup, r): - charmap[i] = 1 - if i in fixes: - for k in fixes[i]: - charmap[k] = 1 - else: - for i in map(fixup, r): - charmap[i] = 1 - if not hascased: - hascased = any(map(iscased, r)) - else: - for i in r: - charmap[i] = 1 - elif op is NEGATE: - out.append((op, av)) - else: - 
tail.append((op, av)) - except IndexError: - if len(charmap) == 256: - # character set contains non-UCS1 character codes - charmap += b'\0' * 0xff00 - continue - # Character set contains non-BMP character codes. - if fixup: - hascased = True - # There are only two ranges of cased non-BMP characters: - # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), - # and for both ranges RANGE_UNI_IGNORE works. - if op is RANGE: - op = RANGE_UNI_IGNORE - tail.append((op, av)) - break - - # compress character map - runs = [] - q = 0 - while True: - p = charmap.find(1, q) - if p < 0: - break - if len(runs) >= 2: - runs = None - break - q = charmap.find(0, p) - if q < 0: - runs.append((p, len(charmap))) - break - runs.append((p, q)) - if runs is not None: - # use literal/range - for p, q in runs: - if q - p == 1: - out.append((LITERAL, p)) - else: - out.append((RANGE, (p, q - 1))) - out += tail - # if the case was changed or new representation is more compact - if hascased or len(out) < len(charset): - return out, hascased - # else original character set is good enough - return charset, hascased - - # use bitmap - if len(charmap) == 256: - data = _mk_bitmap(charmap) - out.append((CHARSET, data)) - out += tail - return out, hascased - - # To represent a big charset, first a bitmap of all characters in the - # set is constructed. Then, this bitmap is sliced into chunks of 256 - # characters, duplicate chunks are eliminated, and each chunk is - # given a number. In the compiled expression, the charset is - # represented by a 32-bit word sequence, consisting of one word for - # the number of different chunks, a sequence of 256 bytes (64 words) - # of chunk numbers indexed by their original chunk position, and a - # sequence of 256-bit chunks (8 words each). - - # Compression is normally good: in a typical charset, large ranges of - # Unicode will be either completely excluded (e.g. if only cyrillic - # letters are to be matched), or completely included (e.g. 
if large - # subranges of Kanji match). These ranges will be represented by - # chunks of all one-bits or all zero-bits. - - # Matching can be also done efficiently: the more significant byte of - # the Unicode character is an index into the chunk number, and the - # less significant byte is a bit index in the chunk (just like the - # CHARSET matching). - - charmap = bytes(charmap) # should be hashable - comps = {} - mapping = bytearray(256) - block = 0 - data = bytearray() - for i in range(0, 65536, 256): - chunk = charmap[i: i + 256] - if chunk in comps: - mapping[i // 256] = comps[chunk] - else: - mapping[i // 256] = comps[chunk] = block - block += 1 - data += chunk - data = _mk_bitmap(data) - data[0:0] = [block] + _bytes_to_codes(mapping) - out.append((BIGCHARSET, data)) - out += tail - return out, hascased - -_CODEBITS = _sre.CODESIZE * 8 -MAXCODE = (1 << _CODEBITS) - 1 -_BITS_TRANS = b'0' + b'1' * 255 -def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): - s = bits.translate(_BITS_TRANS)[::-1] - return [_int(s[i - _CODEBITS: i], 2) - for i in range(len(s), 0, -_CODEBITS)] - -def _bytes_to_codes(b): - # Convert block indices to word array - a = memoryview(b).cast('I') - assert a.itemsize == _sre.CODESIZE - assert len(a) * a.itemsize == len(b) - return a.tolist() - -def _simple(p): - # check if this subpattern is a "simple" operator - if len(p) != 1: - return False - op, av = p[0] - if op is SUBPATTERN: - return av[0] is None and _simple(av[-1]) - return op in _UNIT_CODES - -def _generate_overlap_table(prefix): - """ - Generate an overlap table for the following prefix. - An overlap table is a table of the same size as the prefix which - informs about the potential self-overlap for each index in the prefix: - - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...] 
- - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with - prefix[0:k] - """ - table = [0] * len(prefix) - for i in range(1, len(prefix)): - idx = table[i - 1] - while prefix[i] != prefix[idx]: - if idx == 0: - table[i] = 0 - break - idx = table[idx - 1] - else: - table[i] = idx + 1 - return table - -def _get_iscased(flags): - if not flags & SRE_FLAG_IGNORECASE: - return None - elif flags & SRE_FLAG_UNICODE: - return _sre.unicode_iscased - else: - return _sre.ascii_iscased - -def _get_literal_prefix(pattern, flags): - # look for literal prefix - prefix = [] - prefixappend = prefix.append - prefix_skip = None - iscased = _get_iscased(flags) - for op, av in pattern.data: - if op is LITERAL: - if iscased and iscased(av): - break - prefixappend(av) - elif op is SUBPATTERN: - group, add_flags, del_flags, p = av - flags1 = _combine_flags(flags, add_flags, del_flags) - if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: - break - prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) - if prefix_skip is None: - if group is not None: - prefix_skip = len(prefix) - elif prefix_skip1 is not None: - prefix_skip = len(prefix) + prefix_skip1 - prefix.extend(prefix1) - if not got_all: - break - else: - break - else: - return prefix, prefix_skip, True - return prefix, prefix_skip, False - -def _get_charset_prefix(pattern, flags): - while True: - if not pattern.data: - return None - op, av = pattern.data[0] - if op is not SUBPATTERN: - break - group, add_flags, del_flags, pattern = av - flags = _combine_flags(flags, add_flags, del_flags) - if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: - return None - - iscased = _get_iscased(flags) - if op is LITERAL: - if iscased and iscased(av): - return None - return [(op, av)] - elif op is BRANCH: - charset = [] - charsetappend = charset.append - for p in av[1]: - if not p: - return None - op, av = p[0] - if op is LITERAL and not (iscased and iscased(av)): - charsetappend((op, av)) - else: - 
return None - return charset - elif op is IN: - charset = av - if iscased: - for op, av in charset: - if op is LITERAL: - if iscased(av): - return None - elif op is RANGE: - if av[1] > 0xffff: - return None - if any(map(iscased, range(av[0], av[1]+1))): - return None - return charset - return None - -def _compile_info(code, pattern, flags): - # internal: compile an info block. in the current version, - # this contains min/max pattern width, and an optional literal - # prefix or a character map - lo, hi = pattern.getwidth() - if hi > MAXCODE: - hi = MAXCODE - if lo == 0: - code.extend([INFO, 4, 0, lo, hi]) - return - # look for a literal prefix - prefix = [] - prefix_skip = 0 - charset = [] # not used - if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): - # look for literal prefix - prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) - # if no prefix, look for charset prefix - if not prefix: - charset = _get_charset_prefix(pattern, flags) -## if prefix: -## print("*** PREFIX", prefix, prefix_skip) -## if charset: -## print("*** CHARSET", charset) - # add an info block - emit = code.append - emit(INFO) - skip = len(code); emit(0) - # literal flag - mask = 0 - if prefix: - mask = SRE_INFO_PREFIX - if prefix_skip is None and got_all: - mask = mask | SRE_INFO_LITERAL - elif charset: - mask = mask | SRE_INFO_CHARSET - emit(mask) - # pattern length - if lo < MAXCODE: - emit(lo) - else: - emit(MAXCODE) - prefix = prefix[:MAXCODE] - emit(min(hi, MAXCODE)) - # add literal prefix - if prefix: - emit(len(prefix)) # length - if prefix_skip is None: - prefix_skip = len(prefix) - emit(prefix_skip) # skip - code.extend(prefix) - # generate overlap table - code.extend(_generate_overlap_table(prefix)) - elif charset: - charset, hascased = _optimize_charset(charset) - assert not hascased - _compile_charset(charset, flags, code) - code[skip] = len(code) - skip - -def isstring(obj): - return isinstance(obj, (str, bytes)) - -def _code(p, flags): - - flags = 
p.state.flags | flags - code = [] - - # compile info block - _compile_info(code, p, flags) - - # compile the pattern - _compile(code, p.data, flags) - - code.append(SUCCESS) - - return code - -def _hex_code(code): - return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code) - -def dis(code): - import sys - - labels = set() - level = 0 - offset_width = len(str(len(code) - 1)) - - def dis_(start, end): - def print_(*args, to=None): - if to is not None: - labels.add(to) - args += ('(to %d)' % (to,),) - print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'), - end=' '*(level-1)) - print(*args) - - def print_2(*args): - print(end=' '*(offset_width + 2*level)) - print(*args) - - nonlocal level - level += 1 - i = start - while i < end: - start = i - op = code[i] - i += 1 - op = OPCODES[op] - if op in (SUCCESS, FAILURE, ANY, ANY_ALL, - MAX_UNTIL, MIN_UNTIL, NEGATE): - print_(op) - elif op in (LITERAL, NOT_LITERAL, - LITERAL_IGNORE, NOT_LITERAL_IGNORE, - LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE, - LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE): - arg = code[i] - i += 1 - print_(op, '%#02x (%r)' % (arg, chr(arg))) - elif op is AT: - arg = code[i] - i += 1 - arg = str(ATCODES[arg]) - assert arg[:3] == 'AT_' - print_(op, arg[3:]) - elif op is CATEGORY: - arg = code[i] - i += 1 - arg = str(CHCODES[arg]) - assert arg[:9] == 'CATEGORY_' - print_(op, arg[9:]) - elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE): - skip = code[i] - print_(op, skip, to=i+skip) - dis_(i+1, i+skip) - i += skip - elif op in (RANGE, RANGE_UNI_IGNORE): - lo, hi = code[i: i+2] - i += 2 - print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) - elif op is CHARSET: - print_(op, _hex_code(code[i: i + 256//_CODEBITS])) - i += 256//_CODEBITS - elif op is BIGCHARSET: - arg = code[i] - i += 1 - mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder) - for x in code[i: i + 256//_sre.CODESIZE])) - print_(op, arg, mapping) - i += 256//_sre.CODESIZE - level += 1 
- for j in range(arg): - print_2(_hex_code(code[i: i + 256//_CODEBITS])) - i += 256//_CODEBITS - level -= 1 - elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, - GROUPREF_LOC_IGNORE): - arg = code[i] - i += 1 - print_(op, arg) - elif op is JUMP: - skip = code[i] - print_(op, skip, to=i+skip) - i += 1 - elif op is BRANCH: - skip = code[i] - print_(op, skip, to=i+skip) - while skip: - dis_(i+1, i+skip) - i += skip - start = i - skip = code[i] - if skip: - print_('branch', skip, to=i+skip) - else: - print_(FAILURE) - i += 1 - elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE): - skip, min, max = code[i: i+3] - if max == MAXREPEAT: - max = 'MAXREPEAT' - print_(op, skip, min, max, to=i+skip) - dis_(i+3, i+skip) - i += skip - elif op is GROUPREF_EXISTS: - arg, skip = code[i: i+2] - print_(op, arg, skip, to=i+skip) - i += 2 - elif op in (ASSERT, ASSERT_NOT): - skip, arg = code[i: i+2] - print_(op, skip, arg, to=i+skip) - dis_(i+2, i+skip) - i += skip - elif op is INFO: - skip, flags, min, max = code[i: i+4] - if max == MAXREPEAT: - max = 'MAXREPEAT' - print_(op, skip, bin(flags), min, max, to=i+skip) - start = i+4 - if flags & SRE_INFO_PREFIX: - prefix_len, prefix_skip = code[i+4: i+6] - print_2(' prefix_skip', prefix_skip) - start = i + 6 - prefix = code[start: start+prefix_len] - print_2(' prefix', - '[%s]' % ', '.join('%#02x' % x for x in prefix), - '(%r)' % ''.join(map(chr, prefix))) - start += prefix_len - print_2(' overlap', code[start: start+prefix_len]) - start += prefix_len - if flags & SRE_INFO_CHARSET: - level += 1 - print_2('in') - dis_(start, i+skip) - level -= 1 - i += skip - else: - raise ValueError(op) - - level -= 1 - - dis_(0, len(code)) - - -def compile(p, flags=0): - # internal: convert pattern list to internal format - - if isstring(p): - pattern = p - p = sre_parse.parse(p, flags) - else: - pattern = None - - code = _code(p, flags) - - if flags & SRE_FLAG_DEBUG: - print() - dis(code) - - # map in either direction - groupindex = 
p.state.groupdict - indexgroup = [None] * p.state.groups - for k, i in groupindex.items(): - indexgroup[i] = k - - return _sre.compile( - pattern, flags | p.state.flags, code, - p.state.groups-1, - groupindex, tuple(indexgroup) - ) +from re import _compiler as _ +globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'}) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 8360acb695..8543e2bc8c 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -1,218 +1,10 @@ -# -# Secret Labs' Regular Expression Engine -# -# various symbols used by the regular expression engine. -# run this script to update the _sre include files! -# -# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. -# -# See the sre.py file for information on usage and redistribution. -# +import warnings +warnings.warn(f"module {__name__!r} is deprecated", + DeprecationWarning, + stacklevel=2) -"""Internal support module for sre""" - -# update when constants are added or removed - -MAGIC = 20171005 - -from _sre import MAXREPEAT, MAXGROUPS - -# SRE standard exception (access as sre.error) -# should this really be here? - -class error(Exception): - """Exception raised for invalid regular expressions. 
- - Attributes: - - msg: The unformatted error message - pattern: The regular expression pattern - pos: The index in the pattern where compilation failed (may be None) - lineno: The line corresponding to pos (may be None) - colno: The column corresponding to pos (may be None) - """ - - __module__ = 're' - - def __init__(self, msg, pattern=None, pos=None): - self.msg = msg - self.pattern = pattern - self.pos = pos - if pattern is not None and pos is not None: - msg = '%s at position %d' % (msg, pos) - if isinstance(pattern, str): - newline = '\n' - else: - newline = b'\n' - self.lineno = pattern.count(newline, 0, pos) + 1 - self.colno = pos - pattern.rfind(newline, 0, pos) - if newline in pattern: - msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno) - else: - self.lineno = self.colno = None - super().__init__(msg) - - -class _NamedIntConstant(int): - def __new__(cls, value, name): - self = super(_NamedIntConstant, cls).__new__(cls, value) - self.name = name - return self - - def __repr__(self): - return self.name - -MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT') - -def _makecodes(names): - names = names.strip().split() - items = [_NamedIntConstant(i, name) for i, name in enumerate(names)] - globals().update({item.name: item for item in items}) - return items - -# operators -# failure=0 success=1 (just because it looks better that way :-) -OPCODES = _makecodes(""" - FAILURE SUCCESS - - ANY ANY_ALL - ASSERT ASSERT_NOT - AT - BRANCH - CALL - CATEGORY - CHARSET BIGCHARSET - GROUPREF GROUPREF_EXISTS - IN - INFO - JUMP - LITERAL - MARK - MAX_UNTIL - MIN_UNTIL - NOT_LITERAL - NEGATE - RANGE - REPEAT - REPEAT_ONE - SUBPATTERN - MIN_REPEAT_ONE - - GROUPREF_IGNORE - IN_IGNORE - LITERAL_IGNORE - NOT_LITERAL_IGNORE - - GROUPREF_LOC_IGNORE - IN_LOC_IGNORE - LITERAL_LOC_IGNORE - NOT_LITERAL_LOC_IGNORE - - GROUPREF_UNI_IGNORE - IN_UNI_IGNORE - LITERAL_UNI_IGNORE - NOT_LITERAL_UNI_IGNORE - RANGE_UNI_IGNORE - - MIN_REPEAT MAX_REPEAT -""") -del OPCODES[-2:] # 
remove MIN_REPEAT and MAX_REPEAT - -# positions -ATCODES = _makecodes(""" - AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING - AT_BOUNDARY AT_NON_BOUNDARY - AT_END AT_END_LINE AT_END_STRING - - AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY - - AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY -""") - -# categories -CHCODES = _makecodes(""" - CATEGORY_DIGIT CATEGORY_NOT_DIGIT - CATEGORY_SPACE CATEGORY_NOT_SPACE - CATEGORY_WORD CATEGORY_NOT_WORD - CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK - - CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD - - CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT - CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE - CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD - CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK -""") - - -# replacement operations for "ignore case" mode -OP_IGNORE = { - LITERAL: LITERAL_IGNORE, - NOT_LITERAL: NOT_LITERAL_IGNORE, -} - -OP_LOCALE_IGNORE = { - LITERAL: LITERAL_LOC_IGNORE, - NOT_LITERAL: NOT_LITERAL_LOC_IGNORE, -} - -OP_UNICODE_IGNORE = { - LITERAL: LITERAL_UNI_IGNORE, - NOT_LITERAL: NOT_LITERAL_UNI_IGNORE, -} - -AT_MULTILINE = { - AT_BEGINNING: AT_BEGINNING_LINE, - AT_END: AT_END_LINE -} - -AT_LOCALE = { - AT_BOUNDARY: AT_LOC_BOUNDARY, - AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY -} - -AT_UNICODE = { - AT_BOUNDARY: AT_UNI_BOUNDARY, - AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY -} - -CH_LOCALE = { - CATEGORY_DIGIT: CATEGORY_DIGIT, - CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, - CATEGORY_SPACE: CATEGORY_SPACE, - CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, - CATEGORY_WORD: CATEGORY_LOC_WORD, - CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, - CATEGORY_LINEBREAK: CATEGORY_LINEBREAK, - CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK -} - -CH_UNICODE = { - CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, - CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, - CATEGORY_SPACE: CATEGORY_UNI_SPACE, - CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, - CATEGORY_WORD: CATEGORY_UNI_WORD, - CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, - CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, - CATEGORY_NOT_LINEBREAK: 
CATEGORY_UNI_NOT_LINEBREAK -} - -# flags -SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) -SRE_FLAG_IGNORECASE = 2 # case insensitive -SRE_FLAG_LOCALE = 4 # honour system locale -SRE_FLAG_MULTILINE = 8 # treat target as multiline string -SRE_FLAG_DOTALL = 16 # treat target as a single string -SRE_FLAG_UNICODE = 32 # use unicode "locale" -SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments -SRE_FLAG_DEBUG = 128 # debugging -SRE_FLAG_ASCII = 256 # use ascii "locale" - -# flags for INFO primitive -SRE_INFO_PREFIX = 1 # has prefix -SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) -SRE_INFO_CHARSET = 4 # pattern starts with character from given set +from re import _constants as _ +globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'}) if __name__ == "__main__": def dump(f, d, typ, int_t, prefix): diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 83119168e6..25a3f557d4 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -1,1064 +1,7 @@ -# -# Secret Labs' Regular Expression Engine -# -# convert re-style regular expression to sre pattern -# -# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. -# -# See the sre.py file for information on usage and redistribution. 
-# +import warnings +warnings.warn(f"module {__name__!r} is deprecated", + DeprecationWarning, + stacklevel=2) -"""Internal support module for sre""" - -# XXX: show string offset and offending character for all errors - -from sre_constants import * - -SPECIAL_CHARS = ".\\[{()*+?^$|" -REPEAT_CHARS = "*+?{" - -DIGITS = frozenset("0123456789") - -OCTDIGITS = frozenset("01234567") -HEXDIGITS = frozenset("0123456789abcdefABCDEF") -ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") - -WHITESPACE = frozenset(" \t\n\r\v\f") - -_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) -_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) - -ESCAPES = { - r"\a": (LITERAL, ord("\a")), - r"\b": (LITERAL, ord("\b")), - r"\f": (LITERAL, ord("\f")), - r"\n": (LITERAL, ord("\n")), - r"\r": (LITERAL, ord("\r")), - r"\t": (LITERAL, ord("\t")), - r"\v": (LITERAL, ord("\v")), - r"\\": (LITERAL, ord("\\")) -} - -CATEGORIES = { - r"\A": (AT, AT_BEGINNING_STRING), # start of string - r"\b": (AT, AT_BOUNDARY), - r"\B": (AT, AT_NON_BOUNDARY), - r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), - r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), - r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), - r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), - r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), - r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - r"\Z": (AT, AT_END_STRING), # end of string -} - -FLAGS = { - # standard flags - "i": SRE_FLAG_IGNORECASE, - "L": SRE_FLAG_LOCALE, - "m": SRE_FLAG_MULTILINE, - "s": SRE_FLAG_DOTALL, - "x": SRE_FLAG_VERBOSE, - # extensions - "a": SRE_FLAG_ASCII, - "t": SRE_FLAG_TEMPLATE, - "u": SRE_FLAG_UNICODE, -} - -TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE -GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE - -class Verbose(Exception): - pass - -class State: - # keeps track of state for parsing - def __init__(self): - self.flags = 0 - self.groupdict = {} - self.groupwidths = [None] # group 0 - self.lookbehindgroups = None - 
@property - def groups(self): - return len(self.groupwidths) - def opengroup(self, name=None): - gid = self.groups - self.groupwidths.append(None) - if self.groups > MAXGROUPS: - raise error("too many groups") - if name is not None: - ogid = self.groupdict.get(name, None) - if ogid is not None: - raise error("redefinition of group name %r as group %d; " - "was group %d" % (name, gid, ogid)) - self.groupdict[name] = gid - return gid - def closegroup(self, gid, p): - self.groupwidths[gid] = p.getwidth() - def checkgroup(self, gid): - return gid < self.groups and self.groupwidths[gid] is not None - - def checklookbehindgroup(self, gid, source): - if self.lookbehindgroups is not None: - if not self.checkgroup(gid): - raise source.error('cannot refer to an open group') - if gid >= self.lookbehindgroups: - raise source.error('cannot refer to group defined in the same ' - 'lookbehind subpattern') - -class SubPattern: - # a subpattern, in intermediate form - def __init__(self, state, data=None): - self.state = state - if data is None: - data = [] - self.data = data - self.width = None - - def dump(self, level=0): - nl = True - seqtypes = (tuple, list) - for op, av in self.data: - print(level*" " + str(op), end='') - if op is IN: - # member sublanguage - print() - for op, a in av: - print((level+1)*" " + str(op), a) - elif op is BRANCH: - print() - for i, a in enumerate(av[1]): - if i: - print(level*" " + "OR") - a.dump(level+1) - elif op is GROUPREF_EXISTS: - condgroup, item_yes, item_no = av - print('', condgroup) - item_yes.dump(level+1) - if item_no: - print(level*" " + "ELSE") - item_no.dump(level+1) - elif isinstance(av, seqtypes): - nl = False - for a in av: - if isinstance(a, SubPattern): - if not nl: - print() - a.dump(level+1) - nl = True - else: - if not nl: - print(' ', end='') - print(a, end='') - nl = False - if not nl: - print() - else: - print('', av) - def __repr__(self): - return repr(self.data) - def __len__(self): - return len(self.data) - def 
__delitem__(self, index): - del self.data[index] - def __getitem__(self, index): - if isinstance(index, slice): - return SubPattern(self.state, self.data[index]) - return self.data[index] - def __setitem__(self, index, code): - self.data[index] = code - def insert(self, index, code): - self.data.insert(index, code) - def append(self, code): - self.data.append(code) - def getwidth(self): - # determine the width (min, max) for this subpattern - if self.width is not None: - return self.width - lo = hi = 0 - for op, av in self.data: - if op is BRANCH: - i = MAXREPEAT - 1 - j = 0 - for av in av[1]: - l, h = av.getwidth() - i = min(i, l) - j = max(j, h) - lo = lo + i - hi = hi + j - elif op is CALL: - i, j = av.getwidth() - lo = lo + i - hi = hi + j - elif op is SUBPATTERN: - i, j = av[-1].getwidth() - lo = lo + i - hi = hi + j - elif op in _REPEATCODES: - i, j = av[2].getwidth() - lo = lo + i * av[0] - hi = hi + j * av[1] - elif op in _UNITCODES: - lo = lo + 1 - hi = hi + 1 - elif op is GROUPREF: - i, j = self.state.groupwidths[av] - lo = lo + i - hi = hi + j - elif op is GROUPREF_EXISTS: - i, j = av[1].getwidth() - if av[2] is not None: - l, h = av[2].getwidth() - i = min(i, l) - j = max(j, h) - else: - i = 0 - lo = lo + i - hi = hi + j - elif op is SUCCESS: - break - self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) - return self.width - -class Tokenizer: - def __init__(self, string): - self.istext = isinstance(string, str) - self.string = string - if not self.istext: - string = str(string, 'latin1') - self.decoded_string = string - self.index = 0 - self.next = None - self.__next() - def __next(self): - index = self.index - try: - char = self.decoded_string[index] - except IndexError: - self.next = None - return - if char == "\\": - index += 1 - try: - char += self.decoded_string[index] - except IndexError: - raise error("bad escape (end of pattern)", - self.string, len(self.string) - 1) from None - self.index = index + 1 - self.next = char - def match(self, 
char): - if char == self.next: - self.__next() - return True - return False - def get(self): - this = self.next - self.__next() - return this - def getwhile(self, n, charset): - result = '' - for _ in range(n): - c = self.next - if c not in charset: - break - result += c - self.__next() - return result - def getuntil(self, terminator, name): - result = '' - while True: - c = self.next - self.__next() - if c is None: - if not result: - raise self.error("missing " + name) - raise self.error("missing %s, unterminated name" % terminator, - len(result)) - if c == terminator: - if not result: - raise self.error("missing " + name, 1) - break - result += c - return result - @property - def pos(self): - return self.index - len(self.next or '') - def tell(self): - return self.index - len(self.next or '') - def seek(self, index): - self.index = index - self.__next() - - def error(self, msg, offset=0): - return error(msg, self.string, self.tell() - offset) - -def _class_escape(source, escape): - # handle escape code inside character class - code = ESCAPES.get(escape) - if code: - return code - code = CATEGORIES.get(escape) - if code and code[0] is IN: - return code - try: - c = escape[1:2] - if c == "x": - # hexadecimal escape (exactly two digits) - escape += source.getwhile(2, HEXDIGITS) - if len(escape) != 4: - raise source.error("incomplete escape %s" % escape, len(escape)) - return LITERAL, int(escape[2:], 16) - elif c == "u" and source.istext: - # unicode escape (exactly four digits) - escape += source.getwhile(4, HEXDIGITS) - if len(escape) != 6: - raise source.error("incomplete escape %s" % escape, len(escape)) - return LITERAL, int(escape[2:], 16) - elif c == "U" and source.istext: - # unicode escape (exactly eight digits) - escape += source.getwhile(8, HEXDIGITS) - if len(escape) != 10: - raise source.error("incomplete escape %s" % escape, len(escape)) - c = int(escape[2:], 16) - chr(c) # raise ValueError for invalid code - return LITERAL, c - elif c == "N" and 
source.istext: - import unicodedata - # named unicode escape e.g. \N{EM DASH} - if not source.match('{'): - raise source.error("missing {") - charname = source.getuntil('}', 'character name') - try: - c = ord(unicodedata.lookup(charname)) - except KeyError: - raise source.error("undefined character name %r" % charname, - len(charname) + len(r'\N{}')) - return LITERAL, c - elif c in OCTDIGITS: - # octal escape (up to three digits) - escape += source.getwhile(2, OCTDIGITS) - c = int(escape[1:], 8) - if c > 0o377: - raise source.error('octal escape value %s outside of ' - 'range 0-0o377' % escape, len(escape)) - return LITERAL, c - elif c in DIGITS: - raise ValueError - if len(escape) == 2: - if c in ASCIILETTERS: - raise source.error('bad escape %s' % escape, len(escape)) - return LITERAL, ord(escape[1]) - except ValueError: - pass - raise source.error("bad escape %s" % escape, len(escape)) - -def _escape(source, escape, state): - # handle escape code in expression - code = CATEGORIES.get(escape) - if code: - return code - code = ESCAPES.get(escape) - if code: - return code - try: - c = escape[1:2] - if c == "x": - # hexadecimal escape - escape += source.getwhile(2, HEXDIGITS) - if len(escape) != 4: - raise source.error("incomplete escape %s" % escape, len(escape)) - return LITERAL, int(escape[2:], 16) - elif c == "u" and source.istext: - # unicode escape (exactly four digits) - escape += source.getwhile(4, HEXDIGITS) - if len(escape) != 6: - raise source.error("incomplete escape %s" % escape, len(escape)) - return LITERAL, int(escape[2:], 16) - elif c == "U" and source.istext: - # unicode escape (exactly eight digits) - escape += source.getwhile(8, HEXDIGITS) - if len(escape) != 10: - raise source.error("incomplete escape %s" % escape, len(escape)) - c = int(escape[2:], 16) - chr(c) # raise ValueError for invalid code - return LITERAL, c - elif c == "N" and source.istext: - import unicodedata - # named unicode escape e.g. 
\N{EM DASH} - if not source.match('{'): - raise source.error("missing {") - charname = source.getuntil('}', 'character name') - try: - c = ord(unicodedata.lookup(charname)) - except KeyError: - raise source.error("undefined character name %r" % charname, - len(charname) + len(r'\N{}')) - return LITERAL, c - elif c == "0": - # octal escape - escape += source.getwhile(2, OCTDIGITS) - return LITERAL, int(escape[1:], 8) - elif c in DIGITS: - # octal escape *or* decimal group reference (sigh) - if source.next in DIGITS: - escape += source.get() - if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and - source.next in OCTDIGITS): - # got three octal digits; this is an octal escape - escape += source.get() - c = int(escape[1:], 8) - if c > 0o377: - raise source.error('octal escape value %s outside of ' - 'range 0-0o377' % escape, - len(escape)) - return LITERAL, c - # not an octal escape, so this is a group reference - group = int(escape[1:]) - if group < state.groups: - if not state.checkgroup(group): - raise source.error("cannot refer to an open group", - len(escape)) - state.checklookbehindgroup(group, source) - return GROUPREF, group - raise source.error("invalid group reference %d" % group, len(escape) - 1) - if len(escape) == 2: - if c in ASCIILETTERS: - raise source.error("bad escape %s" % escape, len(escape)) - return LITERAL, ord(escape[1]) - except ValueError: - pass - raise source.error("bad escape %s" % escape, len(escape)) - -def _uniq(items): - return list(dict.fromkeys(items)) - -def _parse_sub(source, state, verbose, nested): - # parse an alternation: a|b|c - - items = [] - itemsappend = items.append - sourcematch = source.match - start = source.tell() - while True: - itemsappend(_parse(source, state, verbose, nested + 1, - not nested and not items)) - if not sourcematch("|"): - break - - if len(items) == 1: - return items[0] - - subpattern = SubPattern(state) - - # check if all items share a common prefix - while True: - prefix = None - for item in 
items: - if not item: - break - if prefix is None: - prefix = item[0] - elif item[0] != prefix: - break - else: - # all subitems start with a common "prefix". - # move it out of the branch - for item in items: - del item[0] - subpattern.append(prefix) - continue # check next one - break - - # check if the branch can be replaced by a character set - set = [] - for item in items: - if len(item) != 1: - break - op, av = item[0] - if op is LITERAL: - set.append((op, av)) - elif op is IN and av[0][0] is not NEGATE: - set.extend(av) - else: - break - else: - # we can store this as a character set instead of a - # branch (the compiler may optimize this even more) - subpattern.append((IN, _uniq(set))) - return subpattern - - subpattern.append((BRANCH, (None, items))) - return subpattern - -def _parse(source, state, verbose, nested, first=False): - # parse a simple pattern - subpattern = SubPattern(state) - - # precompute constants into local variables - subpatternappend = subpattern.append - sourceget = source.get - sourcematch = source.match - _len = len - _ord = ord - - while True: - - this = source.next - if this is None: - break # end of pattern - if this in "|)": - break # end of subpattern - sourceget() - - if verbose: - # skip whitespace and comments - if this in WHITESPACE: - continue - if this == "#": - while True: - this = sourceget() - if this is None or this == "\n": - break - continue - - if this[0] == "\\": - code = _escape(source, this, state) - subpatternappend(code) - - elif this not in SPECIAL_CHARS: - subpatternappend((LITERAL, _ord(this))) - - elif this == "[": - here = source.tell() - 1 - # character set - set = [] - setappend = set.append -## if sourcematch(":"): -## pass # handle character classes - if source.next == '[': - import warnings - warnings.warn( - 'Possible nested set at position %d' % source.tell(), - FutureWarning, stacklevel=nested + 6 - ) - negate = sourcematch("^") - # check remaining characters - while True: - this = sourceget() - if 
this is None: - raise source.error("unterminated character set", - source.tell() - here) - if this == "]" and set: - break - elif this[0] == "\\": - code1 = _class_escape(source, this) - else: - if set and this in '-&~|' and source.next == this: - import warnings - warnings.warn( - 'Possible set %s at position %d' % ( - 'difference' if this == '-' else - 'intersection' if this == '&' else - 'symmetric difference' if this == '~' else - 'union', - source.tell() - 1), - FutureWarning, stacklevel=nested + 6 - ) - code1 = LITERAL, _ord(this) - if sourcematch("-"): - # potential range - that = sourceget() - if that is None: - raise source.error("unterminated character set", - source.tell() - here) - if that == "]": - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - setappend((LITERAL, _ord("-"))) - break - if that[0] == "\\": - code2 = _class_escape(source, that) - else: - if that == '-': - import warnings - warnings.warn( - 'Possible set difference at position %d' % ( - source.tell() - 2), - FutureWarning, stacklevel=nested + 6 - ) - code2 = LITERAL, _ord(that) - if code1[0] != LITERAL or code2[0] != LITERAL: - msg = "bad character range %s-%s" % (this, that) - raise source.error(msg, len(this) + 1 + len(that)) - lo = code1[1] - hi = code2[1] - if hi < lo: - msg = "bad character range %s-%s" % (this, that) - raise source.error(msg, len(this) + 1 + len(that)) - setappend((RANGE, (lo, hi))) - else: - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - - set = _uniq(set) - # XXX: should move set optimization to compiler! 
- if _len(set) == 1 and set[0][0] is LITERAL: - # optimization - if negate: - subpatternappend((NOT_LITERAL, set[0][1])) - else: - subpatternappend(set[0]) - else: - if negate: - set.insert(0, (NEGATE, None)) - # charmap optimization can't be added here because - # global flags still are not known - subpatternappend((IN, set)) - - elif this in REPEAT_CHARS: - # repeat previous item - here = source.tell() - if this == "?": - min, max = 0, 1 - elif this == "*": - min, max = 0, MAXREPEAT - - elif this == "+": - min, max = 1, MAXREPEAT - elif this == "{": - if source.next == "}": - subpatternappend((LITERAL, _ord(this))) - continue - - min, max = 0, MAXREPEAT - lo = hi = "" - while source.next in DIGITS: - lo += sourceget() - if sourcematch(","): - while source.next in DIGITS: - hi += sourceget() - else: - hi = lo - if not sourcematch("}"): - subpatternappend((LITERAL, _ord(this))) - source.seek(here) - continue - - if lo: - min = int(lo) - if min >= MAXREPEAT: - raise OverflowError("the repetition number is too large") - if hi: - max = int(hi) - if max >= MAXREPEAT: - raise OverflowError("the repetition number is too large") - if max < min: - raise source.error("min repeat greater than max repeat", - source.tell() - here) - else: - raise AssertionError("unsupported quantifier %r" % (char,)) - # figure out which item to repeat - if subpattern: - item = subpattern[-1:] - else: - item = None - if not item or item[0][0] is AT: - raise source.error("nothing to repeat", - source.tell() - here + len(this)) - if item[0][0] in _REPEATCODES: - raise source.error("multiple repeat", - source.tell() - here + len(this)) - if item[0][0] is SUBPATTERN: - group, add_flags, del_flags, p = item[0][1] - if group is None and not add_flags and not del_flags: - item = p - if sourcematch("?"): - subpattern[-1] = (MIN_REPEAT, (min, max, item)) - else: - subpattern[-1] = (MAX_REPEAT, (min, max, item)) - - elif this == ".": - subpatternappend((ANY, None)) - - elif this == "(": - start = 
source.tell() - 1 - group = True - name = None - add_flags = 0 - del_flags = 0 - if sourcematch("?"): - # options - char = sourceget() - if char is None: - raise source.error("unexpected end of pattern") - if char == "P": - # python extensions - if sourcematch("<"): - # named group: skip forward to end of name - name = source.getuntil(">", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) - elif sourcematch("="): - # named backreference - name = source.getuntil(")", "group name") - if not name.isidentifier(): - msg = "bad character in group name %r" % name - raise source.error(msg, len(name) + 1) - gid = state.groupdict.get(name) - if gid is None: - msg = "unknown group name %r" % name - raise source.error(msg, len(name) + 1) - if not state.checkgroup(gid): - raise source.error("cannot refer to an open group", - len(name) + 1) - state.checklookbehindgroup(gid, source) - subpatternappend((GROUPREF, gid)) - continue - - else: - char = sourceget() - if char is None: - raise source.error("unexpected end of pattern") - raise source.error("unknown extension ?P" + char, - len(char) + 2) - elif char == ":": - # non-capturing group - group = None - elif char == "#": - # comment - while True: - if source.next is None: - raise source.error("missing ), unterminated comment", - source.tell() - start) - if sourceget() == ")": - break - continue - - elif char in "=!<": - # lookahead assertions - dir = 1 - if char == "<": - char = sourceget() - if char is None: - raise source.error("unexpected end of pattern") - if char not in "=!": - raise source.error("unknown extension ?<" + char, - len(char) + 2) - dir = -1 # lookbehind - lookbehindgroups = state.lookbehindgroups - if lookbehindgroups is None: - state.lookbehindgroups = state.groups - p = _parse_sub(source, state, verbose, nested + 1) - if dir < 0: - if lookbehindgroups is None: - state.lookbehindgroups = None - if not sourcematch(")"): - raise 
source.error("missing ), unterminated subpattern", - source.tell() - start) - if char == "=": - subpatternappend((ASSERT, (dir, p))) - else: - subpatternappend((ASSERT_NOT, (dir, p))) - continue - - elif char == "(": - # conditional backreference group - condname = source.getuntil(")", "group name") - if condname.isidentifier(): - condgroup = state.groupdict.get(condname) - if condgroup is None: - msg = "unknown group name %r" % condname - raise source.error(msg, len(condname) + 1) - else: - try: - condgroup = int(condname) - if condgroup < 0: - raise ValueError - except ValueError: - msg = "bad character in group name %r" % condname - raise source.error(msg, len(condname) + 1) from None - if not condgroup: - raise source.error("bad group number", - len(condname) + 1) - if condgroup >= MAXGROUPS: - msg = "invalid group reference %d" % condgroup - raise source.error(msg, len(condname) + 1) - state.checklookbehindgroup(condgroup, source) - item_yes = _parse(source, state, verbose, nested + 1) - if source.match("|"): - item_no = _parse(source, state, verbose, nested + 1) - if source.next == "|": - raise source.error("conditional backref with more than two branches") - else: - item_no = None - if not source.match(")"): - raise source.error("missing ), unterminated subpattern", - source.tell() - start) - subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) - continue - - elif char in FLAGS or char == "-": - # flags - flags = _parse_flags(source, state, char) - if flags is None: # global flags - if not first or subpattern: - import warnings - warnings.warn( - 'Flags not at the start of the expression %r%s' % ( - source.string[:20], # truncate long regexes - ' (truncated)' if len(source.string) > 20 else '', - ), - DeprecationWarning, stacklevel=nested + 6 - ) - if (state.flags & SRE_FLAG_VERBOSE) and not verbose: - raise Verbose - continue - - add_flags, del_flags = flags - group = None - else: - raise source.error("unknown extension ?" 
+ char, - len(char) + 1) - - # parse group contents - if group is not None: - try: - group = state.opengroup(name) - except error as err: - raise source.error(err.msg, len(name) + 1) from None - sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and - not (del_flags & SRE_FLAG_VERBOSE)) - p = _parse_sub(source, state, sub_verbose, nested + 1) - if not source.match(")"): - raise source.error("missing ), unterminated subpattern", - source.tell() - start) - if group is not None: - state.closegroup(group, p) - subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) - - elif this == "^": - subpatternappend((AT, AT_BEGINNING)) - - elif this == "$": - subpatternappend((AT, AT_END)) - - else: - raise AssertionError("unsupported special character %r" % (char,)) - - # unpack non-capturing groups - for i in range(len(subpattern))[::-1]: - op, av = subpattern[i] - if op is SUBPATTERN: - group, add_flags, del_flags, p = av - if group is None and not add_flags and not del_flags: - subpattern[i: i+1] = p - - return subpattern - -def _parse_flags(source, state, char): - sourceget = source.get - add_flags = 0 - del_flags = 0 - if char != "-": - while True: - flag = FLAGS[char] - if source.istext: - if char == 'L': - msg = "bad inline flags: cannot use 'L' flag with a str pattern" - raise source.error(msg) - else: - if char == 'u': - msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" - raise source.error(msg) - add_flags |= flag - if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: - msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" - raise source.error(msg) - char = sourceget() - if char is None: - raise source.error("missing -, : or )") - if char in ")-:": - break - if char not in FLAGS: - msg = "unknown flag" if char.isalpha() else "missing -, : or )" - raise source.error(msg, len(char)) - if char == ")": - state.flags |= add_flags - return None - if add_flags & GLOBAL_FLAGS: - raise source.error("bad inline flags: cannot 
turn on global flag", 1) - if char == "-": - char = sourceget() - if char is None: - raise source.error("missing flag") - if char not in FLAGS: - msg = "unknown flag" if char.isalpha() else "missing flag" - raise source.error(msg, len(char)) - while True: - flag = FLAGS[char] - if flag & TYPE_FLAGS: - msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" - raise source.error(msg) - del_flags |= flag - char = sourceget() - if char is None: - raise source.error("missing :") - if char == ":": - break - if char not in FLAGS: - msg = "unknown flag" if char.isalpha() else "missing :" - raise source.error(msg, len(char)) - assert char == ":" - if del_flags & GLOBAL_FLAGS: - raise source.error("bad inline flags: cannot turn off global flag", 1) - if add_flags & del_flags: - raise source.error("bad inline flags: flag turned on and off", 1) - return add_flags, del_flags - -def fix_flags(src, flags): - # Check and fix flags according to the type of pattern (str or bytes) - if isinstance(src, str): - if flags & SRE_FLAG_LOCALE: - raise ValueError("cannot use LOCALE flag with a str pattern") - if not flags & SRE_FLAG_ASCII: - flags |= SRE_FLAG_UNICODE - elif flags & SRE_FLAG_UNICODE: - raise ValueError("ASCII and UNICODE flags are incompatible") - else: - if flags & SRE_FLAG_UNICODE: - raise ValueError("cannot use UNICODE flag with a bytes pattern") - if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: - raise ValueError("ASCII and LOCALE flags are incompatible") - return flags - -def parse(str, flags=0, state=None): - # parse 're' pattern into list of (opcode, argument) tuples - - source = Tokenizer(str) - - if state is None: - state = State() - state.flags = flags - state.str = str - - try: - p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0) - except Verbose: - # the VERBOSE flag was switched on inside the pattern. to be - # on the safe side, we'll parse the whole thing again... 
- state = State() - state.flags = flags | SRE_FLAG_VERBOSE - state.str = str - source.seek(0) - p = _parse_sub(source, state, True, 0) - - p.state.flags = fix_flags(str, p.state.flags) - - if source.next is not None: - assert source.next == ")" - raise source.error("unbalanced parenthesis") - - if flags & SRE_FLAG_DEBUG: - p.dump() - - return p - -def parse_template(source, state): - # parse 're' replacement string into list of literals and - # group references - s = Tokenizer(source) - sget = s.get - groups = [] - literals = [] - literal = [] - lappend = literal.append - def addgroup(index, pos): - if index > state.groups: - raise s.error("invalid group reference %d" % index, pos) - if literal: - literals.append(''.join(literal)) - del literal[:] - groups.append((len(literals), index)) - literals.append(None) - groupindex = state.groupindex - while True: - this = sget() - if this is None: - break # end of replacement string - if this[0] == "\\": - # group - c = this[1] - if c == "g": - name = "" - if not s.match("<"): - raise s.error("missing <") - name = s.getuntil(">", "group name") - if name.isidentifier(): - try: - index = groupindex[name] - except KeyError: - raise IndexError("unknown group name %r" % name) - else: - try: - index = int(name) - if index < 0: - raise ValueError - except ValueError: - raise s.error("bad character in group name %r" % name, - len(name) + 1) from None - if index >= MAXGROUPS: - raise s.error("invalid group reference %d" % index, - len(name) + 1) - addgroup(index, len(name) + 1) - elif c == "0": - if s.next in OCTDIGITS: - this += sget() - if s.next in OCTDIGITS: - this += sget() - lappend(chr(int(this[1:], 8) & 0xff)) - elif c in DIGITS: - isoctal = False - if s.next in DIGITS: - this += sget() - if (c in OCTDIGITS and this[2] in OCTDIGITS and - s.next in OCTDIGITS): - this += sget() - isoctal = True - c = int(this[1:], 8) - if c > 0o377: - raise s.error('octal escape value %s outside of ' - 'range 0-0o377' % this, len(this)) - 
lappend(chr(c)) - if not isoctal: - addgroup(int(this[1:]), len(this) - 1) - else: - try: - this = chr(ESCAPES[this][1]) - except KeyError: - if c in ASCIILETTERS: - raise s.error('bad escape %s' % this, len(this)) - lappend(this) - else: - lappend(this) - if literal: - literals.append(''.join(literal)) - if not isinstance(source, str): - # The tokenizer implicitly decodes bytes objects as latin-1, we must - # therefore re-encode the final representation. - literals = [None if s is None else s.encode('latin-1') for s in literals] - return groups, literals - -def expand_template(template, match): - g = match.group - empty = match.string[:0] - groups, literals = template - literals = literals[:] - try: - for index, group in groups: - literals[index] = g(group) or empty - except IndexError: - raise error("invalid group reference %d" % index) - return empty.join(literals) +from re import _parser as _ +globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'}) From 1e3d57817cbb8a08b6b7c7dc04eefe0e566975ad Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Wed, 22 Nov 2023 23:20:04 +0200 Subject: [PATCH 120/705] Replace re_test.py from CPython 3.12 and mark failed tests --- Cargo.lock | 37 +- Lib/test/test_re.py | 844 ++++++++++++++++++++++++++++++++++---------- vm/Cargo.toml | 4 +- 3 files changed, 658 insertions(+), 227 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 52afbb053f..da13a2a33c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1422,34 +1422,13 @@ dependencies = [ "libc", ] -[[package]] -name = "num_enum" -version = "0.5.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d829733185c1ca374f17e52b762f24f535ec625d2cc1f070e34c8a9068f341b" -dependencies = [ - "num_enum_derive 0.5.9", -] - [[package]] name = "num_enum" version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02339744ee7253741199f897151b38e72257d13802d4ee837285cc2990a90845" dependencies = [ - "num_enum_derive 0.7.2", 
-] - -[[package]] -name = "num_enum_derive" -version = "0.5.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2be1598bf1c313dcdd12092e3f1920f463462525a21b7b4e11b4168353d0123e" -dependencies = [ - "proc-macro-crate", - "proc-macro2", - "quote", - "syn 1.0.107", + "num_enum_derive", ] [[package]] @@ -2191,7 +2170,7 @@ name = "rustpython-sre_engine" version = "0.6.0" dependencies = [ "bitflags 2.4.0", - "num_enum 0.7.2", + "num_enum", "optional", ] @@ -2230,7 +2209,7 @@ dependencies = [ "num-complex", "num-integer", "num-traits", - "num_enum 0.7.2", + "num_enum", "once_cell", "openssl", "openssl-probe", @@ -2300,7 +2279,7 @@ dependencies = [ "num-integer", "num-traits", "num_cpus", - "num_enum 0.7.2", + "num_enum", "once_cell", "optional", "parking_lot", @@ -2552,12 +2531,10 @@ dependencies = [ [[package]] name = "sre-engine" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a490c5c46c35dba9a6f5e7ee8e4d67e775eb2d2da0f115750b8d10e1c1ac2d28" +version = "0.6.0" dependencies = [ - "bitflags 1.3.2", - "num_enum 0.5.9", + "bitflags 2.4.0", + "num_enum", "optional", ] diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 9b30b4137c..1fd2432aae 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1,15 +1,25 @@ from test.support import (gc_collect, bigmemtest, _2G, cpython_only, captured_stdout, - check_disallow_instantiation) + check_disallow_instantiation, is_emscripten, is_wasi, + SHORT_TIMEOUT) import locale import re -import sre_compile import string +import sys +import time import unittest import warnings from re import Scanner from weakref import proxy +# some platforms lack working multiprocessing +try: + import _multiprocessing +except ImportError: + multiprocessing = None +else: + import multiprocessing + # Misc tests from Tim Peters' re.doc # WARNING: Don't change details in these tests if you don't know @@ -85,10 +95,29 @@ def test_search_star_plus(self): 
self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) self.assertIsNone(re.match('a+', 'xxx')) + def test_branching(self): + """Test Branching + Test expressions using the OR ('|') operator.""" + self.assertEqual(re.match('(ab|ba)', 'ab').span(), (0, 2)) + self.assertEqual(re.match('(ab|ba)', 'ba').span(), (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'abc').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'bac').span(), + (0, 3)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'ca').span(), + (0, 2)) + self.assertEqual(re.match('(abc|bac|ca|cb)', 'cb').span(), + (0, 2)) + self.assertEqual(re.match('((a)|(b)|(c))', 'a').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'b').span(), (0, 1)) + self.assertEqual(re.match('((a)|(b)|(c))', 'c').span(), (0, 1)) + def bump_num(self, matchobj): int_value = int(matchobj.group(0)) return str(int_value + 1) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_basic_re_sub(self): self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz') self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz') @@ -119,6 +148,7 @@ def test_basic_re_sub(self): self.assertEqual(re.sub('(?Px)', r'\g\g<1>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?Px)', r'\g\g', 'xx'), 'xxxx') self.assertEqual(re.sub('(?Px)', r'\g<1>\g<1>', 'xx'), 'xxxx') + self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx') self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') @@ -131,11 +161,15 @@ def test_basic_re_sub(self): self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest') + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_bug_449964(self): # fails for group followed by other escape self.assertEqual(re.sub(r'(?Px)', r'\g<1>\g<1>\b', 'xx'), 'xx\bxx\b') + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_bug_449000(self): # Test for sub() on escaped characters self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), @@ 
-159,6 +193,8 @@ def test_bug_3629(self): # A regex that triggered a bug in the sre-code validator re.compile("(?P)(?(quote))") + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_sub_template_numeric_escape(self): # bug 776311 and friends self.assertEqual(re.sub('x', r'\0', 'x'), '\0') @@ -212,6 +248,8 @@ def test_qualified_re_sub(self): self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa') + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_bug_114660(self): self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 'hello there') @@ -258,7 +296,15 @@ def test_symbolic_groups_errors(self): self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) self.checkPatternError('(?P=©)', "bad character in group name '©'", 4) self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3) + self.checkPatternError(b'(?P<\xc2\xb5>x)', + r"bad character in group name '\xc2\xb5'", 4) + self.checkPatternError(b'(?P=\xc2\xb5)', + r"bad character in group name '\xc2\xb5'", 4) + self.checkPatternError(b'(?(\xc2\xb5)y)', + r"bad character in group name '\xc2\xb5'", 3) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_symbolic_refs(self): self.assertEqual(re.sub('(?Px)|(?Py)', r'\g', 'xx'), '') self.assertEqual(re.sub('(?Px)|(?Py)', r'\2', 'xx'), '') @@ -270,6 +316,8 @@ def test_symbolic_refs(self): pat = '|'.join('x(?P%x)y' % (i, i) for i in range(1, 200 + 1)) self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8') + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_symbolic_refs_errors(self): self.checkTemplateError('(?Px)', r'\g, unterminated name', 3) @@ -290,12 +338,22 @@ def test_symbolic_refs_errors(self): re.sub('(?Px)', r'\g', 'xx') self.checkTemplateError('(?Px)', r'\g<-1>', 'xx', "bad character in group name '-1'", 3) + self.checkTemplateError('(?Px)', r'\g<+1>', 'xx', + "bad character in group name '+1'", 3) + self.checkTemplateError('()'*10, 
r'\g<1_0>', 'xx', + "bad character in group name '1_0'", 3) + self.checkTemplateError('(?Px)', r'\g< 1 >', 'xx', + "bad character in group name ' 1 '", 3) self.checkTemplateError('(?Px)', r'\g<©>', 'xx', "bad character in group name '©'", 3) + self.checkTemplateError(b'(?Px)', b'\\g<\xc2\xb5>', b'xx', + r"bad character in group name '\xc2\xb5'", 3) self.checkTemplateError('(?Px)', r'\g<㊀>', 'xx', "bad character in group name '㊀'", 3) self.checkTemplateError('(?Px)', r'\g<¹>', 'xx', "bad character in group name '¹'", 3) + self.checkTemplateError('(?Px)', r'\g<१>', 'xx', + "bad character in group name '१'", 3) def test_re_subn(self): self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) @@ -557,16 +615,22 @@ def test_re_groupref_exists(self): pat = '(?:%s)(?(200)z)' % pat self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5)) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_re_groupref_exists_errors(self): self.checkPatternError(r'(?P)(?(0)a|b)', 'bad group number', 10) self.checkPatternError(r'()(?(-1)a|b)', "bad character in group name '-1'", 5) + self.checkPatternError(r'()(?(+1)a|b)', + "bad character in group name '+1'", 5) + self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)', + "bad character in group name '1_0'", 23) + self.checkPatternError(r'()(?( 1 )a|b)', + "bad character in group name ' 1 '", 5) self.checkPatternError(r'()(?(㊀)a|b)', "bad character in group name '㊀'", 5) self.checkPatternError(r'()(?(¹)a|b)', "bad character in group name '¹'", 5) + self.checkPatternError(r'()(?(१)a|b)', + "bad character in group name '१'", 5) self.checkPatternError(r'()(?(1', "missing ), unterminated name", 5) self.checkPatternError(r'()(?(1)a', @@ -582,8 +646,15 @@ def test_re_groupref_exists_errors(self): self.checkPatternError(r'()(?(2)a)', "invalid group reference 2", 5) + def test_re_groupref_exists_validation_bug(self): + for i in range(256): + with self.subTest(code=i): + re.compile(r'()(?(1)\x%02x?)' % i) + + # TODO: RUSTPYTHON + 
@unittest.expectedFailure def test_re_groupref_overflow(self): - from sre_constants import MAXGROUPS + from re._constants import MAXGROUPS self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', 'invalid group reference %d' % MAXGROUPS, 3) self.checkPatternError(r'(?P)(?(%d))' % MAXGROUPS, @@ -608,6 +679,8 @@ def test_groupdict(self): 'first second').groupdict(), {'first':'first', 'second':'second'}) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_expand(self): self.assertEqual(re.match("(?Pfirst) (?Psecond)", "first second") @@ -871,8 +944,6 @@ def test_lookbehind(self): self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)') self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_ignore_case(self): self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") @@ -913,8 +984,6 @@ def test_ignore_case(self): self.assertTrue(re.match(r'\ufb05', '\ufb06', re.I)) self.assertTrue(re.match(r'\ufb06', '\ufb05', re.I)) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_ignore_case_set(self): self.assertTrue(re.match(r'[19A]', 'A', re.I)) self.assertTrue(re.match(r'[19a]', 'a', re.I)) @@ -953,8 +1022,6 @@ def test_ignore_case_set(self): self.assertTrue(re.match(r'[19\ufb05]', '\ufb06', re.I)) self.assertTrue(re.match(r'[19\ufb06]', '\ufb05', re.I)) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_ignore_case_range(self): # Issues #3511, #17381. 
self.assertTrue(re.match(r'[9-a]', '_', re.I)) @@ -1005,33 +1072,6 @@ def test_ignore_case_range(self): def test_category(self): self.assertEqual(re.match(r"(\s)", " ").group(1), " ") - @cpython_only - def test_case_helpers(self): - import _sre - for i in range(128): - c = chr(i) - lo = ord(c.lower()) - self.assertEqual(_sre.ascii_tolower(i), lo) - self.assertEqual(_sre.unicode_tolower(i), lo) - iscased = c in string.ascii_letters - self.assertEqual(_sre.ascii_iscased(i), iscased) - self.assertEqual(_sre.unicode_iscased(i), iscased) - - for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: - c = chr(i) - self.assertEqual(_sre.ascii_tolower(i), i) - if i != 0x0130: - self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) - iscased = c != c.lower() or c != c.upper() - self.assertFalse(_sre.ascii_iscased(i)) - self.assertEqual(_sre.unicode_iscased(i), - c != c.lower() or c != c.upper()) - - self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) - self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) - self.assertFalse(_sre.ascii_iscased(0x0130)) - self.assertTrue(_sre.unicode_iscased(0x0130)) - def test_not_literal(self): self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") self.assertEqual(re.search(r"\s([^a]*)", " bb").group(1), "bb") @@ -1332,11 +1372,13 @@ def test_nothing_to_repeat(self): 'nothing to repeat', 3) def test_multiple_repeat(self): - for outer_reps in '*', '+', '{1,2}': - for outer_mod in '', '?': + for outer_reps in '*', '+', '?', '{1,2}': + for outer_mod in '', '?', '+': outer_op = outer_reps + outer_mod for inner_reps in '*', '+', '?', '{1,2}': - for inner_mod in '', '?': + for inner_mod in '', '?', '+': + if inner_mod + outer_reps in ('?', '+'): + continue inner_op = inner_reps + inner_mod self.checkPatternError(r'x%s%s' % (inner_op, outer_op), 'multiple repeat', 1 + len(inner_op)) @@ -1491,8 +1533,6 @@ def test_empty_array(self): self.assertIsNone(re.compile(b"bla").match(a)) self.assertEqual(re.compile(b"").match(a).groups(), ()) - 
# TODO: RUSTPYTHON - @unittest.expectedFailure def test_inline_flags(self): # Bug #1700 upper_char = '\u1ea0' # Latin Capital Letter A with Dot Below @@ -1536,70 +1576,27 @@ def test_inline_flags(self): self.assertTrue(re.match('(?x) (?i) ' + upper_char, lower_char)) self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char, re.X)) - p = upper_char + '(?i)' - with self.assertWarns(DeprecationWarning) as warns: - self.assertTrue(re.match(p, lower_char)) - self.assertEqual( - str(warns.warnings[0].message), - 'Flags not at the start of the expression %r' - ' but at position 1' % p - ) - self.assertEqual(warns.warnings[0].filename, __file__) - - p = upper_char + '(?i)%s' % ('.?' * 100) - with self.assertWarns(DeprecationWarning) as warns: - self.assertTrue(re.match(p, lower_char)) - self.assertEqual( - str(warns.warnings[0].message), - 'Flags not at the start of the expression %r (truncated)' - ' but at position 1' % p[:20] - ) - self.assertEqual(warns.warnings[0].filename, __file__) + msg = "global flags not at the start of the expression" + self.checkPatternError(upper_char + '(?i)', msg, 1) # bpo-30605: Compiling a bytes instance regex was throwing a BytesWarning with warnings.catch_warnings(): warnings.simplefilter('error', BytesWarning) - p = b'A(?i)' - with self.assertWarns(DeprecationWarning) as warns: - self.assertTrue(re.match(p, b'a')) - self.assertEqual( - str(warns.warnings[0].message), - 'Flags not at the start of the expression %r' - ' but at position 1' % p - ) - self.assertEqual(warns.warnings[0].filename, __file__) - - with self.assertWarns(DeprecationWarning): - self.assertTrue(re.match('(?s).(?i)' + upper_char, '\n' + lower_char)) - with self.assertWarns(DeprecationWarning): - self.assertTrue(re.match('(?i) ' + upper_char + ' (?x)', lower_char)) - with self.assertWarns(DeprecationWarning): - self.assertTrue(re.match(' (?x) (?i) ' + upper_char, lower_char)) - with self.assertWarns(DeprecationWarning): - self.assertTrue(re.match('^(?i)' + 
upper_char, lower_char)) - with self.assertWarns(DeprecationWarning): - self.assertTrue(re.match('$|(?i)' + upper_char, lower_char)) - with self.assertWarns(DeprecationWarning) as warns: - self.assertTrue(re.match('(?:(?i)' + upper_char + ')', lower_char)) - self.assertRegex(str(warns.warnings[0].message), - 'Flags not at the start') - self.assertEqual(warns.warnings[0].filename, __file__) - with self.assertWarns(DeprecationWarning) as warns: - self.assertTrue(re.fullmatch('(^)?(?(1)(?i)' + upper_char + ')', - lower_char)) - self.assertRegex(str(warns.warnings[0].message), - 'Flags not at the start') - self.assertEqual(warns.warnings[0].filename, __file__) - with self.assertWarns(DeprecationWarning) as warns: - self.assertTrue(re.fullmatch('($)?(?(1)|(?i)' + upper_char + ')', - lower_char)) - self.assertRegex(str(warns.warnings[0].message), - 'Flags not at the start') - self.assertEqual(warns.warnings[0].filename, __file__) + self.checkPatternError(b'A(?i)', msg, 1) + + self.checkPatternError('(?s).(?i)' + upper_char, msg, 5) + self.checkPatternError('(?i) ' + upper_char + ' (?x)', msg, 7) + self.checkPatternError(' (?x) (?i) ' + upper_char, msg, 1) + self.checkPatternError('^(?i)' + upper_char, msg, 1) + self.checkPatternError('$|(?i)' + upper_char, msg, 2) + self.checkPatternError('(?:(?i)' + upper_char + ')', msg, 3) + self.checkPatternError('(^)?(?(1)(?i)' + upper_char + ')', msg, 9) + self.checkPatternError('($)?(?(1)|(?i)' + upper_char + ')', msg, 10) def test_dollar_matches_twice(self): - "$ matches the end of string, and just before the terminating \n" + r"""Test that $ does not include \n + $ matches the end of string, and just before the terminating \n""" pattern = re.compile('$') self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') @@ -1757,6 +1754,8 @@ def test_comments(self): self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'a')) self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'b')) + # TODO: 
RUSTPYTHON + @unittest.expectedFailure def test_bug_6509(self): # Replacement strings of both types must parse properly. # all strings @@ -1775,24 +1774,6 @@ def test_bug_6509(self): pat = re.compile(b'..') self.assertEqual(pat.sub(lambda m: b'bytes', b'a5'), b'bytes') - # RUSTPYTHON: here in rustpython, we borrow the string only at the - # time of matching, so we will not check the string type when creating - # SRE_Scanner, expect this, other tests has passed - @cpython_only - def test_dealloc(self): - # issue 3299: check for segfault in debug build - import _sre - # the overflow limit is different on wide and narrow builds and it - # depends on the definition of SRE_CODE (see sre.h). - # 2**128 should be big enough to overflow on both. For smaller values - # a RuntimeError is raised instead of OverflowError. - long_overflow = 2**128 - self.assertRaises(TypeError, re.finditer, "a", {}) - with self.assertRaises(OverflowError): - _sre.compile("abc", 0, [long_overflow], 0, {}, ()) - with self.assertRaises(TypeError): - _sre.compile({}, 0, [], 0, [], []) - def test_search_dot_unicode(self): self.assertTrue(re.search("123.*-", '123abc-')) self.assertTrue(re.search("123.*-", '123\xe9-')) @@ -1850,20 +1831,28 @@ def test_repeat_minmax_overflow(self): self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128) self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) - @cpython_only - def test_repeat_minmax_overflow_maxrepeat(self): - try: - from _sre import MAXREPEAT - except ImportError: - self.skipTest('requires _sre.MAXREPEAT constant') - string = "x" * 100000 - self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string)) - self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(), - (0, 100000)) - self.assertIsNone(re.match(r".{%d,}?" 
% (MAXREPEAT - 1), string)) - self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT) - self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT) - self.assertRaises(OverflowError, re.compile, r".{%d,}?" % MAXREPEAT) + def test_look_behind_overflow(self): + string = "x" * 2_500_000 + p1 = r"(?<=((.{%d}){%d}){%d})" + p2 = r"(?...), which does + not maintain any stack point created within the group once the + group is finished being evaluated.""" + pattern1 = re.compile(r'a(?>bc|b)c') + self.assertIsNone(pattern1.match('abc')) + self.assertTrue(pattern1.match('abcc')) + self.assertIsNone(re.match(r'(?>.*).', 'abc')) + self.assertTrue(re.match(r'(?>x)++', 'xxx')) + self.assertTrue(re.match(r'(?>x++)', 'xxx')) + self.assertIsNone(re.match(r'(?>x)++x', 'xxx')) + self.assertIsNone(re.match(r'(?>x++)x', 'xxx')) + + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_fullmatch_atomic_grouping(self): + self.assertTrue(re.fullmatch(r'(?>a+)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a*)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a?)', 'a')) + self.assertTrue(re.fullmatch(r'(?>a{1,3})', 'a')) + self.assertIsNone(re.fullmatch(r'(?>a+)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a*)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a?)', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>a{1,3})', 'ab')) + self.assertTrue(re.fullmatch(r'(?>a+)b', 'ab')) + self.assertTrue(re.fullmatch(r'(?>a*)b', 'ab')) + self.assertTrue(re.fullmatch(r'(?>a?)b', 'ab')) + self.assertTrue(re.fullmatch(r'(?>a{1,3})b', 'ab')) + + self.assertTrue(re.fullmatch(r'(?>(?:ab)+)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)*)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)?)', 'ab')) + self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})', 'ab')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)+)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)*)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab)?)', 'abc')) + self.assertIsNone(re.fullmatch(r'(?>(?:ab){1,3})', 'abc')) + 
self.assertTrue(re.fullmatch(r'(?>(?:ab)+)c', 'abc')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)*)c', 'abc')) + self.assertTrue(re.fullmatch(r'(?>(?:ab)?)c', 'abc')) + self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})c', 'abc')) + + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_findall_atomic_grouping(self): + self.assertEqual(re.findall(r'(?>a+)', 'aab'), ['aa']) + self.assertEqual(re.findall(r'(?>a*)', 'aab'), ['aa', '', '']) + self.assertEqual(re.findall(r'(?>a?)', 'aab'), ['a', 'a', '', '']) + self.assertEqual(re.findall(r'(?>a{1,3})', 'aab'), ['aa']) + + self.assertEqual(re.findall(r'(?>(?:ab)+)', 'ababc'), ['abab']) + self.assertEqual(re.findall(r'(?>(?:ab)*)', 'ababc'), ['abab', '', '']) + self.assertEqual(re.findall(r'(?>(?:ab)?)', 'ababc'), ['ab', 'ab', '', '']) + self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab']) + + def test_bug_gh91616(self): + self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\Z', "a.txt")) # reproducer + self.assertTrue(re.fullmatch(r'(?s:(?=(?P.*?\.))(?P=g0).*)\Z', "a.txt")) + + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_template_function_and_flag_is_deprecated(self): + with self.assertWarns(DeprecationWarning) as cm: + template_re1 = re.template(r'a') + self.assertIn('re.template()', str(cm.warning)) + self.assertIn('is deprecated', str(cm.warning)) + self.assertIn('function', str(cm.warning)) + self.assertNotIn('flag', str(cm.warning)) + + with self.assertWarns(DeprecationWarning) as cm: + # we deliberately use more flags here to test that that still + # triggers the warning + # if paranoid, we could test multiple different combinations, + # but it's probably not worth it + template_re2 = re.compile(r'a', flags=re.TEMPLATE|re.UNICODE) + self.assertIn('re.TEMPLATE', str(cm.warning)) + self.assertIn('is deprecated', str(cm.warning)) + self.assertIn('flag', str(cm.warning)) + self.assertNotIn('function', str(cm.warning)) + + # while deprecated, is should still function + 
self.assertEqual(template_re1, template_re2) + self.assertTrue(template_re1.match('ahoy')) + self.assertFalse(template_re1.match('nope')) + + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_bug_gh106052(self): + # gh-100061 + self.assertEqual(re.match('(?>(?:.(?!D))+)', 'ABCDE').span(), (0, 2)) + self.assertEqual(re.match('(?:.(?!D))++', 'ABCDE').span(), (0, 2)) + self.assertEqual(re.match('(?>(?:.(?!D))*)', 'ABCDE').span(), (0, 2)) + self.assertEqual(re.match('(?:.(?!D))*+', 'ABCDE').span(), (0, 2)) + self.assertEqual(re.match('(?>(?:.(?!D))?)', 'CDE').span(), (0, 0)) + self.assertEqual(re.match('(?:.(?!D))?+', 'CDE').span(), (0, 0)) + self.assertEqual(re.match('(?>(?:.(?!D)){1,3})', 'ABCDE').span(), (0, 2)) + self.assertEqual(re.match('(?:.(?!D)){1,3}+', 'ABCDE').span(), (0, 2)) + # gh-106052 + self.assertEqual(re.match("(?>(?:ab?c)+)", "aca").span(), (0, 2)) + self.assertEqual(re.match("(?:ab?c)++", "aca").span(), (0, 2)) + self.assertEqual(re.match("(?>(?:ab?c)*)", "aca").span(), (0, 2)) + self.assertEqual(re.match("(?:ab?c)*+", "aca").span(), (0, 2)) + self.assertEqual(re.match("(?>(?:ab?c)?)", "a").span(), (0, 0)) + self.assertEqual(re.match("(?:ab?c)?+", "a").span(), (0, 0)) + self.assertEqual(re.match("(?>(?:ab?c){1,3})", "aca").span(), (0, 2)) + self.assertEqual(re.match("(?:ab?c){1,3}+", "aca").span(), (0, 2)) + + # TODO: RUSTPYTHON + @unittest.skipUnless(sys.platform == 'linux', 'multiprocessing related issue') + @unittest.skipIf(multiprocessing is None, 'test requires multiprocessing') + def test_regression_gh94675(self): + pattern = re.compile(r'(?<=[({}])(((//[^\n]*)?[\n])([\000-\040])*)*' + r'((/[^/\[\n]*(([^\n]|(\[\n]*(]*)*\]))' + r'[^/\[]*)*/))((((//[^\n]*)?[\n])' + r'([\000-\040]|(/\*[^*]*\*+' + r'([^/*]\*+)*/))*)+(?=[^\000-\040);\]}]))') + input_js = '''a(function() { + /////////////////////////////////////////////////////////////////// + });''' + p = multiprocessing.Process(target=pattern.sub, args=('', input_js)) + p.start() + 
p.join(SHORT_TIMEOUT) + try: + self.assertFalse(p.is_alive(), 'pattern.sub() timed out') + finally: + if p.is_alive(): + p.terminate() + p.join() + + +def get_debug_out(pat): + with captured_stdout() as out: + re.compile(pat, re.DEBUG) + return out.getvalue() + + +@cpython_only +class DebugTests(unittest.TestCase): + maxDiff = None + + def test_debug_flag(self): + pat = r'(\.)(?:[ch]|py)(?(1)$|: )' + dump = '''\ +SUBPATTERN 1 0 0 + LITERAL 46 +BRANCH + IN + LITERAL 99 + LITERAL 104 +OR + LITERAL 112 + LITERAL 121 +GROUPREF_EXISTS 1 + AT AT_END +ELSE + LITERAL 58 + LITERAL 32 + + 0. INFO 8 0b1 2 5 (to 9) + prefix_skip 0 + prefix [0x2e] ('.') + overlap [0] + 9: MARK 0 +11. LITERAL 0x2e ('.') +13. MARK 1 +15. BRANCH 10 (to 26) +17. IN 6 (to 24) +19. LITERAL 0x63 ('c') +21. LITERAL 0x68 ('h') +23. FAILURE +24: JUMP 9 (to 34) +26: branch 7 (to 33) +27. LITERAL 0x70 ('p') +29. LITERAL 0x79 ('y') +31. JUMP 2 (to 34) +33: FAILURE +34: GROUPREF_EXISTS 0 6 (to 41) +37. AT END +39. JUMP 5 (to 45) +41: LITERAL 0x3a (':') +43. LITERAL 0x20 (' ') +45: SUCCESS +''' + self.assertEqual(get_debug_out(pat), dump) + # Debug output is output again even a second time (bypassing + # the cache -- issue #20426). + self.assertEqual(get_debug_out(pat), dump) + + def test_atomic_group(self): + self.assertEqual(get_debug_out(r'(?>ab?)'), '''\ +ATOMIC_GROUP + LITERAL 97 + MAX_REPEAT 0 1 + LITERAL 98 + + 0. INFO 4 0b0 1 2 (to 5) + 5: ATOMIC_GROUP 11 (to 17) + 7. LITERAL 0x61 ('a') + 9. REPEAT_ONE 6 0 1 (to 16) +13. LITERAL 0x62 ('b') +15. SUCCESS +16: SUCCESS +17: SUCCESS +''') + + def test_possesive_repeat_one(self): + self.assertEqual(get_debug_out(r'a?+'), '''\ +POSSESSIVE_REPEAT 0 1 + LITERAL 97 + + 0. INFO 4 0b0 0 1 (to 5) + 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12) + 9. LITERAL 0x61 ('a') +11. SUCCESS +12: SUCCESS +''') + + def test_possesive_repeat(self): + self.assertEqual(get_debug_out(r'(?:ab)?+'), '''\ +POSSESSIVE_REPEAT 0 1 + LITERAL 97 + LITERAL 98 + + 0. 
INFO 4 0b0 0 2 (to 5) + 5: POSSESSIVE_REPEAT 7 0 1 (to 13) + 9. LITERAL 0x61 ('a') +11. LITERAL 0x62 ('b') +13: SUCCESS +14. SUCCESS +''') + class PatternReprTests(unittest.TestCase): def check(self, pattern, expected): @@ -2312,11 +2664,13 @@ def test_flags_repr(self): "re.IGNORECASE|re.DOTALL|re.VERBOSE") self.assertEqual(repr(re.I|re.S|re.X|(1<<20)), "re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000") - self.assertEqual(repr(~re.I), "~re.IGNORECASE") + self.assertEqual( + repr(~re.I), + "re.ASCII|re.LOCALE|re.UNICODE|re.MULTILINE|re.DOTALL|re.VERBOSE|re.TEMPLATE|re.DEBUG") self.assertEqual(repr(~(re.I|re.S|re.X)), - "~(re.IGNORECASE|re.DOTALL|re.VERBOSE)") + "re.ASCII|re.LOCALE|re.UNICODE|re.MULTILINE|re.TEMPLATE|re.DEBUG") self.assertEqual(repr(~(re.I|re.S|re.X|(1<<20))), - "~(re.IGNORECASE|re.DOTALL|re.VERBOSE|0x100000)") + "re.ASCII|re.LOCALE|re.UNICODE|re.MULTILINE|re.TEMPLATE|re.DEBUG|0xffe00") class ImplementationTest(unittest.TestCase): @@ -2337,7 +2691,7 @@ def test_immutable(self): tp.foo = 1 def test_overlap_table(self): - f = sre_compile._generate_overlap_table + f = re._compiler._generate_overlap_table self.assertEqual(f(""), []) self.assertEqual(f("a"), [0]) self.assertEqual(f("abcd"), [0, 0, 0, 0]) @@ -2346,8 +2700,8 @@ def test_overlap_table(self): self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) def test_signedness(self): - self.assertGreaterEqual(sre_compile.MAXREPEAT, 0) - self.assertGreaterEqual(sre_compile.MAXGROUPS, 0) + self.assertGreaterEqual(re._compiler.MAXREPEAT, 0) + self.assertGreaterEqual(re._compiler.MAXGROUPS, 0) @cpython_only def test_disallow_instantiation(self): @@ -2357,6 +2711,106 @@ def test_disallow_instantiation(self): pat = re.compile("") check_disallow_instantiation(self, type(pat.scanner(""))) + # TODO: RUSTPYTHON + @unittest.expectedFailure + def test_deprecated_modules(self): + deprecated = { + 'sre_compile': ['compile', 'error', + 'SRE_FLAG_IGNORECASE', 'SUBPATTERN', + '_compile_info'], + 'sre_constants': 
['error', 'SRE_FLAG_IGNORECASE', 'SUBPATTERN', + '_NamedIntConstant'], + 'sre_parse': ['SubPattern', 'parse', + 'SRE_FLAG_IGNORECASE', 'SUBPATTERN', + '_parse_sub'], + } + for name in deprecated: + with self.subTest(module=name): + sys.modules.pop(name, None) + with self.assertWarns(DeprecationWarning) as w: + __import__(name) + self.assertEqual(str(w.warning), + f"module {name!r} is deprecated") + self.assertEqual(w.filename, __file__) + self.assertIn(name, sys.modules) + mod = sys.modules[name] + self.assertEqual(mod.__name__, name) + self.assertEqual(mod.__package__, '') + for attr in deprecated[name]: + self.assertTrue(hasattr(mod, attr)) + del sys.modules[name] + + @cpython_only + def test_case_helpers(self): + import _sre + for i in range(128): + c = chr(i) + lo = ord(c.lower()) + self.assertEqual(_sre.ascii_tolower(i), lo) + self.assertEqual(_sre.unicode_tolower(i), lo) + iscased = c in string.ascii_letters + self.assertEqual(_sre.ascii_iscased(i), iscased) + self.assertEqual(_sre.unicode_iscased(i), iscased) + + for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: + c = chr(i) + self.assertEqual(_sre.ascii_tolower(i), i) + if i != 0x0130: + self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) + iscased = c != c.lower() or c != c.upper() + self.assertFalse(_sre.ascii_iscased(i)) + self.assertEqual(_sre.unicode_iscased(i), + c != c.lower() or c != c.upper()) + + self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) + self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) + self.assertFalse(_sre.ascii_iscased(0x0130)) + self.assertTrue(_sre.unicode_iscased(0x0130)) + + @cpython_only + def test_dealloc(self): + # issue 3299: check for segfault in debug build + import _sre + # the overflow limit is different on wide and narrow builds and it + # depends on the definition of SRE_CODE (see sre.h). + # 2**128 should be big enough to overflow on both. For smaller values + # a RuntimeError is raised instead of OverflowError. 
+ long_overflow = 2**128 + self.assertRaises(TypeError, re.finditer, "a", {}) + with self.assertRaises(OverflowError): + _sre.compile("abc", 0, [long_overflow], 0, {}, ()) + with self.assertRaises(TypeError): + _sre.compile({}, 0, [], 0, [], []) + # gh-110590: `TypeError` was overwritten with `OverflowError`: + with self.assertRaises(TypeError): + _sre.compile('', 0, ['abc'], 0, {}, ()) + + @cpython_only + def test_repeat_minmax_overflow_maxrepeat(self): + try: + from _sre import MAXREPEAT + except ImportError: + self.skipTest('requires _sre.MAXREPEAT constant') + string = "x" * 100000 + self.assertIsNone(re.match(r".{%d}" % (MAXREPEAT - 1), string)) + self.assertEqual(re.match(r".{,%d}" % (MAXREPEAT - 1), string).span(), + (0, 100000)) + self.assertIsNone(re.match(r".{%d,}?" % (MAXREPEAT - 1), string)) + self.assertRaises(OverflowError, re.compile, r".{%d}" % MAXREPEAT) + self.assertRaises(OverflowError, re.compile, r".{,%d}" % MAXREPEAT) + self.assertRaises(OverflowError, re.compile, r".{%d,}?" 
% MAXREPEAT) + + @cpython_only + def test_sre_template_invalid_group_index(self): + # see gh-106524 + import _sre + with self.assertRaises(TypeError) as cm: + _sre.template("", ["", -1, ""]) + self.assertIn("invalid template", str(cm.exception)) + with self.assertRaises(TypeError) as cm: + _sre.template("", ["", (), ""]) + self.assertIn("an integer is required", str(cm.exception)) + class ExternalTests(unittest.TestCase): diff --git a/vm/Cargo.toml b/vm/Cargo.toml index f061f54a85..aa8c6df307 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -78,11 +78,11 @@ result-like = "0.4.5" timsort = "0.1.2" # RustPython crates implementing functionality based on CPython -sre-engine = "0.4.1" +# sre-engine = "0.4.1" # to work on sre-engine locally or git version # sre-engine = { git = "https://github.com/RustPython/sre-engine", rev = "refs/pull/14/head" } # sre-engine = { git = "https://github.com/RustPython/sre-engine" } -# sre-engine = { path = "../../sre-engine" } +sre-engine = { path = "../../sre-engine" } ## unicode stuff unicode_names2 = { workspace = true } From d9375b9fe19c7f27ac3b71a7160f612acd7fbe2d Mon Sep 17 00:00:00 2001 From: Kangzhi Shi Date: Sat, 25 Nov 2023 22:17:24 +0200 Subject: [PATCH 121/705] impl re.template(), template_compile template_expand subx --- Cargo.lock | 1 + Lib/string.py | 31 ++++++- Lib/test/test_re.py | 38 +------- vm/Cargo.toml | 3 +- vm/src/stdlib/sre.rs | 215 +++++++++++++++++++++++++++++++------------ 5 files changed, 191 insertions(+), 97 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index da13a2a33c..0080135c26 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2532,6 +2532,7 @@ dependencies = [ [[package]] name = "sre-engine" version = "0.6.0" +source = "git+https://github.com/RustPython/sre-engine?rev=refs/pull/17/head#9725808c302b13e873ff7f955dd4c1632f1137fd" dependencies = [ "bitflags 2.4.0", "num_enum", diff --git a/Lib/string.py b/Lib/string.py index 489777b10c..2eab6d4f59 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -45,7 
+45,7 @@ def capwords(s, sep=None): sep is used to split and join the words. """ - return (sep or ' ').join(x.capitalize() for x in s.split(sep)) + return (sep or ' ').join(map(str.capitalize, s.split(sep))) #################################################################### @@ -141,6 +141,35 @@ def convert(mo): self.pattern) return self.pattern.sub(convert, self.template) + def is_valid(self): + for mo in self.pattern.finditer(self.template): + if mo.group('invalid') is not None: + return False + if (mo.group('named') is None + and mo.group('braced') is None + and mo.group('escaped') is None): + # If all the groups are None, there must be + # another group we're not expecting + raise ValueError('Unrecognized named group in pattern', + self.pattern) + return True + + def get_identifiers(self): + ids = [] + for mo in self.pattern.finditer(self.template): + named = mo.group('named') or mo.group('braced') + if named is not None and named not in ids: + # add a named group only the first time it appears + ids.append(named) + elif (named is None + and mo.group('invalid') is None + and mo.group('escaped') is None): + # If all the groups are None, there must be + # another group we're not expecting + raise ValueError('Unrecognized named group in pattern', + self.pattern) + return ids + # Initialize Template.pattern. __init_subclass__() is automatically called # only for subclasses, not for the Template class itself. 
Template.__init_subclass__() diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 1fd2432aae..5b442acbb1 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -116,8 +116,6 @@ def bump_num(self, matchobj): int_value = int(matchobj.group(0)) return str(int_value + 1) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_basic_re_sub(self): self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz') self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz') @@ -161,15 +159,11 @@ def test_basic_re_sub(self): self.assertEqual(re.sub(r'^\s*', 'X', 'test'), 'Xtest') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_bug_449964(self): # fails for group followed by other escape self.assertEqual(re.sub(r'(?Px)', r'\g<1>\g<1>\b', 'xx'), 'xx\bxx\b') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_bug_449000(self): # Test for sub() on escaped characters self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), @@ -193,8 +187,6 @@ def test_bug_3629(self): # A regex that triggered a bug in the sre-code validator re.compile("(?P)(?(quote))") - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_sub_template_numeric_escape(self): # bug 776311 and friends self.assertEqual(re.sub('x', r'\0', 'x'), '\0') @@ -248,8 +240,6 @@ def test_qualified_re_sub(self): self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') self.assertEqual(re.sub('a', 'b', 'aaaaa', count=1), 'baaaa') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_bug_114660(self): self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 'hello there') @@ -303,8 +293,6 @@ def test_symbolic_groups_errors(self): self.checkPatternError(b'(?(\xc2\xb5)y)', r"bad character in group name '\xc2\xb5'", 3) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_symbolic_refs(self): self.assertEqual(re.sub('(?Px)|(?Py)', r'\g', 'xx'), '') self.assertEqual(re.sub('(?Px)|(?Py)', r'\2', 'xx'), '') @@ -316,8 +304,6 @@ def test_symbolic_refs(self): pat = '|'.join('x(?P%x)y' % (i, i) for 
i in range(1, 200 + 1)) self.assertEqual(re.sub(pat, r'\g<200>', 'xc8yzxc8y'), 'c8zc8') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_symbolic_refs_errors(self): self.checkTemplateError('(?Px)', r'\g, unterminated name', 3) @@ -651,8 +637,6 @@ def test_re_groupref_exists_validation_bug(self): with self.subTest(code=i): re.compile(r'()(?(1)\x%02x?)' % i) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_re_groupref_overflow(self): from re._constants import MAXGROUPS self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', @@ -1754,8 +1738,6 @@ def test_comments(self): self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'a')) self.assertTrue(re.fullmatch('(?x)#x\na|#y\nb', 'b')) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_bug_6509(self): # Replacement strings of both types must parse properly. # all strings @@ -1902,8 +1884,6 @@ def test_match_repr(self): ) self.assertRegex(repr(second), pattern) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_zerowidth(self): # Issues 852532, 1647489, 3262, 25054. self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', '']) @@ -2235,8 +2215,6 @@ def test_MIN_REPEAT_ONE_mark_bug(self): p = r'(?:a*?(xx)??z)*' self.assertEqual(re.match(p, s).groups(), ('xx',)) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_ASSERT_NOT_mark_bug(self): # Fixed in issue35859, reported in issue725149. 
# JUMP_ASSERT_NOT should LASTMARK_SAVE() @@ -2249,16 +2227,12 @@ def test_ASSERT_NOT_mark_bug(self): self.assertEqual(m.span(3), (3, 4)) self.assertEqual(m.groups(), ('b', None, 'b')) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_bug_40736(self): with self.assertRaisesRegex(TypeError, "got 'int'"): re.search("x*", 5) with self.assertRaisesRegex(TypeError, "got 'type'"): re.search("x*", type) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_search_anchor_at_beginning(self): s = 'x'*10**7 start = time.perf_counter() @@ -2273,7 +2247,8 @@ def test_search_anchor_at_beginning(self): # With optimization -- 0.0003 seconds. self.assertLess(t, 0.1) - @unittest.skip('dead lock') + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_possessive_quantifiers(self): """Test Possessive Quantifiers Test quantifiers of the form @+ for some repetition operator @, @@ -2342,7 +2317,6 @@ def test_fullmatch_possessive_quantifiers(self): self.assertTrue(re.fullmatch(r'(?:ab)?+c', 'abc')) self.assertTrue(re.fullmatch(r'(?:ab){1,3}+c', 'abc')) - @unittest.skip("dead lock") def test_findall_possessive_quantifiers(self): self.assertEqual(re.findall(r'a++', 'aab'), ['aa']) self.assertEqual(re.findall(r'a*+', 'aab'), ['aa', '', '']) @@ -2354,8 +2328,6 @@ def test_findall_possessive_quantifiers(self): self.assertEqual(re.findall(r'(?:ab)?+', 'ababc'), ['ab', 'ab', '', '']) self.assertEqual(re.findall(r'(?:ab){1,3}+', 'ababc'), ['abab']) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_atomic_grouping(self): """Test Atomic Grouping Test non-capturing groups of the form (?>...), which does @@ -2399,8 +2371,6 @@ def test_fullmatch_atomic_grouping(self): self.assertTrue(re.fullmatch(r'(?>(?:ab)?)c', 'abc')) self.assertTrue(re.fullmatch(r'(?>(?:ab){1,3})c', 'abc')) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_findall_atomic_grouping(self): self.assertEqual(re.findall(r'(?>a+)', 'aab'), ['aa']) self.assertEqual(re.findall(r'(?>a*)', 'aab'), ['aa', '', 
'']) @@ -2412,6 +2382,8 @@ def test_findall_atomic_grouping(self): self.assertEqual(re.findall(r'(?>(?:ab)?)', 'ababc'), ['ab', 'ab', '', '']) self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab']) + # TODO: RUSTPYTHON + @unittest.expectedFailure def test_bug_gh91616(self): self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\Z', "a.txt")) # reproducer self.assertTrue(re.fullmatch(r'(?s:(?=(?P.*?\.))(?P=g0).*)\Z', "a.txt")) @@ -2442,8 +2414,6 @@ def test_template_function_and_flag_is_deprecated(self): self.assertTrue(template_re1.match('ahoy')) self.assertFalse(template_re1.match('nope')) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_bug_gh106052(self): # gh-100061 self.assertEqual(re.match('(?>(?:.(?!D))+)', 'ABCDE').span(), (0, 2)) diff --git a/vm/Cargo.toml b/vm/Cargo.toml index aa8c6df307..5efd01d442 100644 --- a/vm/Cargo.toml +++ b/vm/Cargo.toml @@ -80,9 +80,8 @@ timsort = "0.1.2" # RustPython crates implementing functionality based on CPython # sre-engine = "0.4.1" # to work on sre-engine locally or git version -# sre-engine = { git = "https://github.com/RustPython/sre-engine", rev = "refs/pull/14/head" } +sre-engine = { git = "https://github.com/RustPython/sre-engine", rev = "refs/pull/17/head" } # sre-engine = { git = "https://github.com/RustPython/sre-engine" } -sre-engine = { path = "../../sre-engine" } ## unicode stuff unicode_names2 = { workspace = true } diff --git a/vm/src/stdlib/sre.rs b/vm/src/stdlib/sre.rs index 93ecd7c24e..b51320cfd2 100644 --- a/vm/src/stdlib/sre.rs +++ b/vm/src/stdlib/sre.rs @@ -5,13 +5,13 @@ mod _sre { use crate::{ atomic_func, builtins::{ - PyCallableIterator, PyDictRef, PyGenericAlias, PyInt, PyList, PyStr, PyStrRef, PyTuple, - PyTupleRef, PyTypeRef, + PyCallableIterator, PyDictRef, PyGenericAlias, PyInt, PyList, PyListRef, PyStr, + PyStrRef, PyTuple, PyTupleRef, PyTypeRef, }, common::{ascii, hash::PyHash}, convert::ToPyObject, function::{ArgCallable, OptionalArg, PosArgs, PyComparisonValue}, - 
protocol::{PyBuffer, PyMappingMethods}, + protocol::{PyBuffer, PyCallable, PyMappingMethods}, stdlib::sys, types::{AsMapping, Comparable, Hashable, Representable}, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, TryFromBorrowedObject, @@ -22,12 +22,12 @@ mod _sre { use itertools::Itertools; use num_traits::ToPrimitive; use sre_engine::{ - constants::SreFlag, - engine::{lower_ascii, lower_unicode, upper_unicode, Request, SearchIter, State, StrDrive}, + string::{lower_ascii, lower_unicode, upper_unicode}, + Request, SearchIter, SreFlag, State, StrDrive, }; #[pyattr] - pub use sre_engine::{constants::SRE_MAGIC as MAGIC, CODESIZE, MAXGROUPS, MAXREPEAT}; + pub use sre_engine::{CODESIZE, MAXGROUPS, MAXREPEAT, SRE_MAGIC as MAGIC}; #[pyfunction] fn getcodesize() -> usize { @@ -103,6 +103,58 @@ mod _sre { }) } + #[pyattr] + #[pyclass(name = "SRE_Template")] + #[derive(Debug, PyPayload)] + struct Template { + literal: PyObjectRef, + items: Vec<(usize, PyObjectRef)>, + } + + #[pyclass] + impl Template { + fn compile( + pattern: PyRef, + repl: PyObjectRef, + vm: &VirtualMachine, + ) -> PyResult> { + let re = vm.import("re", None, 0)?; + let func = re.get_attr("_compile_template", vm)?; + let result = func.call((pattern, repl.clone()), vm)?; + result + .downcast::() + .map_err(|_| vm.new_runtime_error("expected SRE_Template".to_owned())) + } + } + + #[pyfunction] + fn template( + _pattern: PyObjectRef, + template: PyListRef, + vm: &VirtualMachine, + ) -> PyResult