Micro-optimize _tokenize() some more

Slightly modifying _STRING_LEX allows the (common) symbol reference case to be detected earlier and removes some assignments. Also modify some comments and air things out a bit.
author: Ulf Magnusson <ulfalizer@gmail.com> 2017-10-01 02:09:34 +0200
committer: Ulf Magnusson <ulfalizer@gmail.com> 2017-10-01 02:50:29 +0200
commit: ab58c2388128df0dfe361b7d1a795c520cbfa55d (patch)
tree: 20e2091198b922af534144874d4dbc660fbad8cb /kconfiglib.py
parent: 2c062711d974be25346776fcb30ab1310cd896e2 (diff)
1 files changed, 37 insertions, 29 deletions
diff --git a/kconfiglib.py b/kconfiglib.py
index fbae1fb..8a2969c 100644
--- a/kconfiglib.py
+++ b/kconfiglib.py
@@ -1267,18 +1267,21 @@ class Config(object):
            register new symbols."""
 
         # Tricky implementation detail: While parsing a token, 'token' refers
-        # to the previous token. See _STRING_LEX for why this is needed.
+        # to the previous token. See _NOT_REF for why this is needed.
 
         if for_eval:
             token = None
             tokens = []
-            i = 0 # The current index in the string being tokenized
+
+            # The current index in the string being tokenized
+            i = 0
 
         else:
             # See comment at _initial_token_re_match definition
             initial_token_match = _initial_token_re_match(s)
             if not initial_token_match:
                 return _Feed(())
+
             keyword = _get_keyword(initial_token_match.group(1))
             if keyword == _T_HELP:
                 # Avoid junk after "help", e.g. "---", being registered as a
@@ -1295,8 +1298,8 @@ class Config(object):
 
         # Main tokenization loop (for tokens past the first one)
         while i < len(s):
-            # Test for an identifier/keyword preceded by whitespace first; this
-            # is the most common case.
+            # Test for an identifier/keyword first. This is the most common
+            # case.
             id_keyword_match = _id_keyword_re_match(s, i)
             if id_keyword_match:
                 # We have an identifier or keyword
@@ -1304,34 +1307,37 @@ class Config(object):
                 # Jump past it
                 i = id_keyword_match.end()
 
-                # Check what it is
+                # Check what it is. lookup_sym() will take care of allocating
+                # new symbols for us the first time we see them. Note that
+                # 'token' still refers to the previous token.
+
                 name = id_keyword_match.group(1)
                 keyword = _get_keyword(name)
                 if keyword is not None:
                     # It's a keyword
                     token = keyword
-                elif token in _STRING_LEX:
-                    # What would ordinarily be considered an identifier is
-                    # treated as a string after certain tokens
-                    token = name
-                else:
-                    # It's a symbol name. _lookup_sym() will take care of
-                    # allocating a new Symbol instance if it's the first time
-                    # we see it.
-                    sym = self._lookup_sym(name, for_eval)
-
-                    # Also handles 'menuconfig'
-                    if token == _T_CONFIG:
-                        # If the previous token is _T_(MENU)CONFIG
-                        # ("(menu)config"), we're tokenizing the first line of
-                        # a symbol definition, and should remember this as a
-                        # location where the symbol is defined
-                        sym._def_locations.append((filename, linenr))
-                    else:
-                        # Otherwise, it's a reference to the symbol
-                        sym._ref_locations.append((filename, linenr))
 
-                    token = sym
+                elif token not in _NOT_REF:
+                    # It's a symbol reference
+                    token = self._lookup_sym(name, for_eval)
+                    token._ref_locations.append((filename, linenr))
+
+                elif token == _T_CONFIG:
+                    # It's a symbol definition
+                    token = self._lookup_sym(name, for_eval)
+                    token._def_locations.append((filename, linenr))
+
+                else:
+                    # It's a case of missing quotes. For example, the
+                    # following is accepted:
+                    #
+                    #   menu unquoted_title
+                    #
+                    #   config A
+                    #       tristate unquoted_prompt
+                    #
+                    #   endmenu
+                    token = name
 
             else:
                 # Not an identifier/keyword
@@ -3636,10 +3642,12 @@ _get_keyword = {
     "visible":        _T_VISIBLE,
 }.get
 
-# Tokens after which identifier-like lexemes are treated as strings. _T_CHOICE
-# is included to avoid symbols being registered for named choices.
-_STRING_LEX = frozenset((
+# Tokens after which identifier-like lexemes are treated as strings, plus
+# _T_CONFIG. This allows us to quickly check if we have a symbol reference (as
+# opposed to a definition or something else) when tokenizing.
+_NOT_REF = frozenset((
     _T_BOOL,
+    _T_CONFIG,
     _T_CHOICE,
     _T_COMMENT,
     _T_HEX,
author	Ulf Magnusson <ulfalizer@gmail.com>	2017-10-01 02:09:34 +0200
committer	Ulf Magnusson <ulfalizer@gmail.com>	2017-10-01 02:50:29 +0200
commit	ab58c2388128df0dfe361b7d1a795c520cbfa55d (patch)
tree	20e2091198b922af534144874d4dbc660fbad8cb /kconfiglib.py
parent	2c062711d974be25346776fcb30ab1310cd896e2 (diff)