From 5fea36ffd71d8f908133d091860ef4b139cc7e81 Mon Sep 17 00:00:00 2001 From: Ulf Magnusson Date: Fri, 29 Sep 2017 03:49:35 +0200 Subject: Reorganize whitespace stripping in _tokenize() By consistently stripping trailing whitespace instead of stripping initial whitespace in the string/operator case, we do less redundant work and handle the '\n's at the end of lines better. Shaves a few % off the _tokenize() runtime in cProfile and line_profiler. --- kconfiglib.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/kconfiglib.py b/kconfiglib.py index df7591f..274f0fd 100644 --- a/kconfiglib.py +++ b/kconfiglib.py @@ -1350,11 +1350,16 @@ class Config(object): else: # Not an identifier/keyword - # Find the next non-whitespace character - while i < len(s) and s[i].isspace(): - i += 1 - if i == len(s): - break + # Note: _id_keyword_match and _initial_token_match strip + # trailing whitespace, making it safe to assume s[i] is the + # start of a token here. We manually strip trailing whitespace + # below as well. + # + # An old version stripped whitespace in this spot instead, but + # that leads to some redundancy and would cause + # _id_keyword_match to be tried against just "\n" fairly often + # (because file.readlines() keeps newlines). + c = s[i] i += 1 @@ -1436,7 +1441,13 @@ class Config(object): else: token = _T_GREATER - else: continue # Invalid characters are ignored + else: + # Invalid characters are ignored + continue + + # Skip trailing whitespace + while i < len(s) and s[i].isspace(): + i += 1 tokens.append(token) @@ -3627,14 +3638,16 @@ _STRING_LEX = frozenset(( # command_chars characters. # This is why things like "----help--" are accepted. # -# As an optimization, this regex also fails to match for lines containing just -# a comment, and also matches trailing whitespace so it can be jumped over -# immediately. +# In addition to the initial token, the regex also matches trailing whitespace +# so that we can jump straight to the next token (or to the end of the line if +# there's just a single token). +# +# As an optimization, this regex fails to match for lines containing just a +# comment. _initial_token_re_match = re.compile(r"[^\w#]*(\w+)\s*").match -# Matches an identifier/keyword optionally preceded by whitespace. Also eats -# trailing whitespace as an optimization. -_id_keyword_re_match = re.compile(r"\s*([\w./-]+)\s*").match +# Matches an identifier/keyword, also eating trailing whitespace +_id_keyword_re_match = re.compile(r"([\w./-]+)\s*").match # Regular expression for finding $-references to symbols in strings _sym_ref_re_search = re.compile(r"\$[A-Za-z0-9_]+").search -- cgit v1.2.3