diff options
| author | Ulf Magnusson <ulfalizer@gmail.com> | 2018-01-25 01:42:32 +0100 |
|---|---|---|
| committer | Ulf Magnusson <ulfalizer@gmail.com> | 2018-01-25 02:21:53 +0100 |
| commit | 8d7235f493115a1371e688f0efdd69c1effea307 (patch) | |
| tree | f43c8c70c5a417f03ebf8f0eb2f0a70ae69e12b3 | |
| parent | 955ea4e599e39fe3b1f049fe1750fe3c0ad09f5d (diff) | |
Tighten up regexes
- Match the simpler strchr()-style .config parsing done by the C
  implementation
- Spell out \w as [a-zA-Z0-9_]. Easier to verify.
- Use ASCII mode for Python 3 to be consistent with Python 2,
  where it's already the default. \s no longer matches obscure Unicode
  stuff.
This also speeds up regex matching during parsing by about 15% on Python
3, increasing parsing performance by a few %. Looks like there's a tiny
improvement for Python 2 as well.
| -rw-r--r-- | kconfiglib.py | 25 |
1 files changed, 17 insertions, 8 deletions
diff --git a/kconfiglib.py b/kconfiglib.py
index bdc40d6..428b15b 100644
--- a/kconfiglib.py
+++ b/kconfiglib.py
@@ -565,10 +565,15 @@ class Kconfig(object):
         # Regular expressions for parsing .config files, with the match()
         # method assigned directly as a small optimization (microscopic in this
         # case, but it's consistent with the other regexes)
-        self._set_re_match = re.compile(r"{}(\w+)=(.*)"
-                                        .format(self.config_prefix)).match
-        self._unset_re_match = re.compile(r"# {}(\w+) is not set"
-                                          .format(self.config_prefix)).match
+
+        self._set_re_match = \
+            re.compile(r"{}([^=]+)=(.*)".format(self.config_prefix),
+                       _RE_ASCII).match
+
+        self._unset_re_match = \
+            re.compile(r"# {}([^ ]+) is not set".format(self.config_prefix),
+                       _RE_ASCII).match
+
 
         self._print_warnings = warn
         self._print_undef_assign = False
@@ -4333,6 +4338,9 @@ _TYPE_TOKENS = frozenset((
     _T_STRING,
 ))
 
+# Use ASCII regex matching on Python 3. It's already the default on Python 2.
+_RE_ASCII = 0 if sys.version_info[0] < 3 else re.ASCII
+
 # Note: This hack is no longer needed as of upstream commit c226456
 # (kconfig: warn of unhandled characters in Kconfig commands). It
 # is kept around for backwards compatibility.
@@ -4350,17 +4358,18 @@ _TYPE_TOKENS = frozenset((
 #
 # As an optimization, this regex fails to match for lines containing just a
 # comment.
-_initial_token_re_match = re.compile(r"[^\w#]*(\w+)\s*").match
+_initial_token_re_match = \
+    re.compile(r"[^A-Za-z0-9_#]*([A-Za-z0-9_]+)\s*", _RE_ASCII).match
 
 # Matches an identifier/keyword, also eating trailing whitespace
-_id_keyword_re_match = re.compile(r"([\w./-]+)\s*").match
+_id_keyword_re_match = re.compile(r"([A-Za-z0-9_/.-]+)\s*", _RE_ASCII).match
 
 # Regular expression for finding $-references to symbols in strings
-_sym_ref_re_search = re.compile(r"\$([A-Za-z0-9_]+)").search
+_sym_ref_re_search = re.compile(r"\$([A-Za-z0-9_]+)", _RE_ASCII).search
 
 # Matches a valid right-hand side for an assignment to a string symbol in a
 # .config file, including escaped characters. Extracts the contents.
-_conf_string_re_match = re.compile(r'"((?:[^\\"]|\\.)*)"').match
+_conf_string_re_match = re.compile(r'"((?:[^\\"]|\\.)*)"', _RE_ASCII).match
 
 # Token to type mapping
 _TOKEN_TO_TYPE = {
