Reorganize whitespace stripping in _tokenize()

By consistently stripping trailing whitespace after every token, instead of stripping leading whitespace only in the string/operator case, we do less redundant work and handle the '\n' at the end of each line better: the identifier/keyword regex is no longer tried against a lone "\n" (file.readlines() keeps newlines). Shaves a few % off the _tokenize() runtime in cProfile and line_profiler.
author: Ulf Magnusson <ulfalizer@gmail.com> 2017-09-29 03:49:35 +0200
committer: Ulf Magnusson <ulfalizer@gmail.com> 2017-09-29 06:15:01 +0200
commit: 5fea36ffd71d8f908133d091860ef4b139cc7e81 (patch)
tree: 3b78d0c6a8724ed741fbeec535b803f33bd07a6b /kconfiglib.py
parent: 3d27a14dbcb057eb99491babe1238bde6cfe66e6 (diff)
1 file changed, 25 insertions, 12 deletions
diff --git a/kconfiglib.py b/kconfiglib.py
index df7591f..274f0fd 100644
--- a/kconfiglib.py
+++ b/kconfiglib.py
@@ -1350,11 +1350,16 @@ class Config(object):
             else:
                 # Not an identifier/keyword
 
-                # Find the next non-whitespace character
-                while i < len(s) and s[i].isspace():
-                    i += 1
-                if i == len(s):
-                    break
+                # Note: _id_keyword_match and _initial_token_match strip
+                # trailing whitespace, making it safe to assume s[i] is the
+                # start of a token here. We manually strip trailing whitespace
+                # below as well.
+                #
+                # An old version stripped whitespace in this spot instead, but
+                # that leads to some redundancy and would cause
+                # _id_keyword_match to be tried against just "\n" fairly often
+                # (because file.readlines() keeps newlines).
+
                 c = s[i]
                 i += 1
 
@@ -1436,7 +1441,13 @@ class Config(object):
                     else:
                         token = _T_GREATER
 
-                else: continue # Invalid characters are ignored
+                else:
+                    # Invalid characters are ignored
+                    continue
+
+                # Skip trailing whitespace
+                while i < len(s) and s[i].isspace():
+                    i += 1
 
             tokens.append(token)
 
@@ -3627,14 +3638,16 @@ _STRING_LEX = frozenset((
 #    command_chars characters.
 # This is why things like "----help--" are accepted.
 #
-# As an optimization, this regex also fails to match for lines containing just
-# a comment, and also matches trailing whitespace so it can be jumped over
-# immediately.
+# In addition to the initial token, the regex also matches trailing whitespace
+# so that we can jump straight to the next token (or to the end of the line if
+# there's just a single token).
+#
+# As an optimization, this regex fails to match for lines containing just a
+# comment.
 _initial_token_re_match = re.compile(r"[^\w#]*(\w+)\s*").match
 
-# Matches an identifier/keyword optionally preceded by whitespace. Also eats
-# trailing whitespace as an optimization.
-_id_keyword_re_match = re.compile(r"\s*([\w./-]+)\s*").match
+# Matches an identifier/keyword, also eating trailing whitespace
+_id_keyword_re_match = re.compile(r"([\w./-]+)\s*").match
 
 # Regular expression for finding $-references to symbols in strings
 _sym_ref_re_search = re.compile(r"\$[A-Za-z0-9_]+").search
author	Ulf Magnusson <ulfalizer@gmail.com>	2017-09-29 03:49:35 +0200
committer	Ulf Magnusson <ulfalizer@gmail.com>	2017-09-29 06:15:01 +0200
commit	5fea36ffd71d8f908133d091860ef4b139cc7e81 (patch)
tree	3b78d0c6a8724ed741fbeec535b803f33bd07a6b /kconfiglib.py
parent	3d27a14dbcb057eb99491babe1238bde6cfe66e6 (diff)