Tighten up regexes

- Match the simpler strchr()-style .config parsing done by the C implementation.
- Spell out \w as [a-zA-Z0-9_]. Easier to verify.
- Use ASCII mode for Python 3 to be consistent with Python 2, where it's already the default. \s no longer matches obscure Unicode stuff.

  This also speeds up regex matching during parsing by about 15% on Python 3, increasing parsing performance by a few percent. Looks like there's a tiny improvement for Python 2 as well.
author: Ulf Magnusson <ulfalizer@gmail.com> 2018-01-25 01:42:32 +0100
committer: Ulf Magnusson <ulfalizer@gmail.com> 2018-01-25 02:21:53 +0100
commit: 8d7235f493115a1371e688f0efdd69c1effea307 (patch)
tree: f43c8c70c5a417f03ebf8f0eb2f0a70ae69e12b3
parent: 955ea4e599e39fe3b1f049fe1750fe3c0ad09f5d (diff)
1 files changed, 17 insertions, 8 deletions
diff --git a/kconfiglib.py b/kconfiglib.py
index bdc40d6..428b15b 100644
--- a/kconfiglib.py
+++ b/kconfiglib.py
@@ -565,10 +565,15 @@ class Kconfig(object):
         # Regular expressions for parsing .config files, with the match()
         # method assigned directly as a small optimization (microscopic in this
         # case, but it's consistent with the other regexes)
-        self._set_re_match = re.compile(r"{}(\w+)=(.*)"
-                                        .format(self.config_prefix)).match
-        self._unset_re_match = re.compile(r"# {}(\w+) is not set"
-                                          .format(self.config_prefix)).match
+
+        self._set_re_match = \
+            re.compile(r"{}([^=]+)=(.*)".format(self.config_prefix),
+                       _RE_ASCII).match
+
+        self._unset_re_match = \
+            re.compile(r"# {}([^ ]+) is not set".format(self.config_prefix),
+                       _RE_ASCII).match
+
 
         self._print_warnings = warn
         self._print_undef_assign = False
@@ -4333,6 +4338,9 @@ _TYPE_TOKENS = frozenset((
     _T_STRING,
 ))
 
+# Use ASCII regex matching on Python 3. It's already the default on Python 2.
+_RE_ASCII = 0 if sys.version_info[0] < 3 else re.ASCII
+
 # Note: This hack is no longer needed as of upstream commit c226456
 # (kconfig: warn of unhandled characters in Kconfig commands). It
 # is kept around for backwards compatibility.
@@ -4350,17 +4358,18 @@ _TYPE_TOKENS = frozenset((
 #
 # As an optimization, this regex fails to match for lines containing just a
 # comment.
-_initial_token_re_match = re.compile(r"[^\w#]*(\w+)\s*").match
+_initial_token_re_match = \
+    re.compile(r"[^A-Za-z0-9_#]*([A-Za-z0-9_]+)\s*", _RE_ASCII).match
 
 # Matches an identifier/keyword, also eating trailing whitespace
-_id_keyword_re_match = re.compile(r"([\w./-]+)\s*").match
+_id_keyword_re_match = re.compile(r"([A-Za-z0-9_/.-]+)\s*", _RE_ASCII).match
 
 # Regular expression for finding $-references to symbols in strings
-_sym_ref_re_search = re.compile(r"\$([A-Za-z0-9_]+)").search
+_sym_ref_re_search = re.compile(r"\$([A-Za-z0-9_]+)", _RE_ASCII).search
 
 # Matches a valid right-hand side for an assignment to a string symbol in a
 # .config file, including escaped characters. Extracts the contents.
-_conf_string_re_match = re.compile(r'"((?:[^\\"]|\\.)*)"').match
+_conf_string_re_match = re.compile(r'"((?:[^\\"]|\\.)*)"', _RE_ASCII).match
 
 # Token to type mapping
 _TOKEN_TO_TYPE = {
author	Ulf Magnusson <ulfalizer@gmail.com>	2018-01-25 01:42:32 +0100
committer	Ulf Magnusson <ulfalizer@gmail.com>	2018-01-25 02:21:53 +0100
commit	8d7235f493115a1371e688f0efdd69c1effea307 (patch)
tree	f43c8c70c5a417f03ebf8f0eb2f0a70ae69e12b3
parent	955ea4e599e39fe3b1f049fe1750fe3c0ad09f5d (diff)