From 5fea36ffd71d8f908133d091860ef4b139cc7e81 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Fri, 29 Sep 2017 03:49:35 +0200
Subject: Reorganize whitespace stripping in _tokenize()

By consistently stripping trailing whitespace after each token, instead
of stripping leading whitespace in the string/operator case, we do less
redundant work and avoid trying the identifier/keyword regex against the
lone '\n' left at the end of a line. Shaves a few % off the _tokenize()
runtime as measured with cProfile and line_profiler.
---
 kconfiglib.py | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/kconfiglib.py b/kconfiglib.py
index df7591f..274f0fd 100644
--- a/kconfiglib.py
+++ b/kconfiglib.py
@@ -1350,11 +1350,16 @@ class Config(object):
             else:
                 # Not an identifier/keyword
 
-                # Find the next non-whitespace character
-                while i < len(s) and s[i].isspace():
-                    i += 1
-                if i == len(s):
-                    break
+                # Note: _id_keyword_re_match and _initial_token_re_match strip
+                # trailing whitespace, making it safe to assume s[i] is the
+                # start of a token here. We manually strip trailing whitespace
+                # below as well.
+                #
+                # An old version stripped whitespace in this spot instead, but
+                # that leads to some redundancy and would cause
+                # _id_keyword_re_match to be tried against just "\n" fairly
+                # often (because file.readlines() keeps newlines).
+
                 c = s[i]
                 i += 1
 
@@ -1436,7 +1441,13 @@ class Config(object):
                     else:
                         token = _T_GREATER
 
-                else: continue # Invalid characters are ignored
+                else:
+                    # Invalid characters are ignored
+                    continue
+
+                # Skip trailing whitespace
+                while i < len(s) and s[i].isspace():
+                    i += 1
 
             tokens.append(token)
 
@@ -3627,14 +3638,16 @@ _STRING_LEX = frozenset((
 #    command_chars characters.
 # This is why things like "----help--" are accepted.
 #
-# As an optimization, this regex also fails to match for lines containing just
-# a comment, and also matches trailing whitespace so it can be jumped over
-# immediately.
+# In addition to the initial token, the regex also matches trailing whitespace
+# so that we can jump straight to the next token (or to the end of the line if
+# there's just a single token).
+#
+# As an optimization, this regex fails to match for lines containing just a
+# comment.
 _initial_token_re_match = re.compile(r"[^\w#]*(\w+)\s*").match
 
-# Matches an identifier/keyword optionally preceded by whitespace. Also eats
-# trailing whitespace as an optimization.
-_id_keyword_re_match = re.compile(r"\s*([\w./-]+)\s*").match
+# Matches an identifier/keyword, also eating trailing whitespace
+_id_keyword_re_match = re.compile(r"([\w./-]+)\s*").match
 
 # Regular expression for finding $-references to symbols in strings
 _sym_ref_re_search = re.compile(r"\$[A-Za-z0-9_]+").search
-- 
cgit v1.2.3