author     Ulf Magnusson <ulfalizer@gmail.com>   2012-12-19 11:55:06 +0100
committer  Ulf Magnusson <ulfalizer@gmail.com>   2012-12-19 13:12:57 +0100
commit     5bb8c285d84173d35ae63ba33305a28c6329c0e2 (patch)
tree       a5dacb0730797a81c12b84fa80a1451b28a5247e
parent     0c275813918b849d0ee9f24ecc41a40b501b7cd8 (diff)
Use regexes instead of frozensets to lex ids/keywords.
Faster and a bit more readable.
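
The gist of the change, as a minimal standalone sketch (the names mirror
the definitions added at the bottom of this diff; illustrative only, not
part of the patch):

    import re
    import string

    # Old approach: characters that may appear in symbol names
    sym_chars = frozenset(string.ascii_letters + string.digits + "._/-")

    def lex_id_set(s, i):
        # Advance one character at a time while inside the set
        start = i
        while i < len(s) and s[i] in sym_chars:
            i += 1
        return s[start:i], i

    # New approach: one regex grabs a whole identifier/keyword,
    # skipping any leading whitespace as a side effect
    id_keyword_re = re.compile(r"\s*([\w./-]+)")

    def lex_id_re(s, i):
        m = id_keyword_re.match(s, i)
        return (m.group(1), m.end()) if m else (None, i)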
 kconfiglib.py | 257
1 file changed, 128 insertions(+), 129 deletions(-)
diff --git a/kconfiglib.py b/kconfiglib.py
index 5d002e3..0704cc0 100644
--- a/kconfiglib.py
+++ b/kconfiglib.py
@@ -605,23 +605,21 @@ class Config():
"""Returns a _Feed instance containing tokens derived from the string
's'. Registers any new symbols encountered (via _sym_lookup()).
- (I experimented with a regular expression implementation, but it came
- out 5% _slower_ and wouldn't have been as flexible.)
+ (I experimented with a pure regular expression implementation, but it
+ came out slower, less readable, and wouldn't have been as flexible.)
for_eval -- True when parsing an expression for a call to
Config.eval(), in which case we should not treat the first
token specially nor register new symbols."""
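# Illustration (not part of the patch): for_eval is True when the string
# comes from a call like config.eval("FOO && !BAR"), where "FOO" is an
# ordinary symbol rather than a line-initial keyword.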
- tokens = []
- previous = None
-
- strlen = len(s)
- # This is a hotspot during parsing, and this speeds things up a bit
- append = tokens.append
-
- # The current index in the string being tokenized
- i = 0
-
- if not for_eval:
+ s = s.lstrip()
+ if s == "" or s[0] == "#":
+ return _Feed([])
+
+ if for_eval:
+ i = 0 # The current index in the string being tokenized
+ previous = None # The previous token seen
+ tokens = []
+ else:
# The initial word on a line is parsed specially. Let
# command_chars = [A-Za-z0-9_]. Then
# - leading non-command_chars characters on the line are ignored, and
@@ -629,54 +627,42 @@ class Config():
# characters.
# This is why things like "----help--" are accepted.
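# Illustration (not part of the patch): with initial_token_re as defined
# near the end of this diff, initial_token_re.match("----help--").group(1)
# yields "help", while a line with no [A-Za-z0-9_] characters at all fails
# to match and returns None.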
- # Ignore leading non-command_chars characters
- while i < strlen and s[i] not in command_chars:
- # We need to look out for comments
- if s[i] == "#":
- return _Feed([])
- i += 1
- keyword_start = i
- # Locate the end of the keyword
- while i < strlen and s[i] in command_chars:
- i += 1
- keyword_lexeme = s[keyword_start:i]
- # Blank line?
- if keyword_lexeme == "":
+ initial_token_match = initial_token_re.match(s)
+ if initial_token_match is None:
return _Feed([])
- keyword = keywords.get(keyword_lexeme)
+ # The current index in the string being tokenized
+ i = initial_token_match.end()
+
+ keyword = keywords.get(initial_token_match.group(1))
if keyword is None:
# We expect a keyword as the first token
- _tokenization_error(s, strlen, filename, linenr)
+ _tokenization_error(s, len(s), filename, linenr)
if keyword == T_HELP:
# Avoid junk after "help", e.g. "---", being registered as a
# symbol
return _Feed([T_HELP])
- append(keyword)
+ tokens = [keyword]
previous = keyword
- # Main tokenization loop (handles tokens past the first one)
- while i < strlen:
- c = s[i]
-
- # Arranged by the frequency the cases are encountered, which gives
- # a small speed-up
-
- if c.isspace():
- i += 1
- continue
-
- if c in sym_chars:
- name_start = i
-
- # Locate the end of the symbol/keyword
- i += 1
- while i < strlen and s[i] in sym_chars:
- i += 1
-
- name = s[name_start:i]
+ # _tokenize() is a hotspot during parsing, and this speeds things up a
+ # bit
+ strlen = len(s)
+ append = tokens.append
+ # Main tokenization loop. (Handles tokens past the first one.)
+ while i < strlen:
+ # Test for an identifier/keyword preceded by whitespace first; this
+ # is the most common case.
+ id_keyword_match = id_keyword_re.match(s, i)
+ if id_keyword_match:
+ # We have an identifier or keyword. The above also stripped any
+ # whitespace for us.
+ name = id_keyword_match.group(1)
+ # Jump past it
+ i = id_keyword_match.end()
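# Illustration (not part of the patch): for s = "depends on FOO",
# id_keyword_re.match(s, 10) skips the blank, captures "FOO", and
# leaves i at the end of the name.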
+
+ # Keyword?
keyword = keywords.get(name)
-
if keyword is not None:
append(keyword)
# What would ordinarily be considered a name is treated as a
@@ -701,90 +687,104 @@ class Config():
append(sym)
- elif c == '"' or c == "'":
- i += 1
+ else:
+ # This restrips whitespace that could have been stripped in the
+ # regex above, but it's worth it since identifiers/keywords are
+ # more common
+ s = s[i:].lstrip()
+ if s == "":
+ break
+ strlen = len(s)
+ i = 0
+ c = s[0]
- if "\\" in s:
- # Slow path: This could probably be sped up, but it's a
- # very unusual case anyway.
- quote = c
- value = ""
- while 1:
- if i >= strlen:
- _tokenization_error(s, strlen, filename, linenr)
- c = s[i]
- if c == quote:
- break
- if c == "\\":
- if i + 1 >= strlen:
- _tokenization_error(s, strlen, filename, linenr)
- value += s[i + 1]
- i += 2
- else:
- value += c
- i += 1
+ # String literal (constant symbol)
+ if c == '"' or c == "'":
i += 1
- append(value)
- else:
- # Fast path: If the string contains no backslashes (almost
- # always) we can simply look for the matching quote.
- end = s.find(c, i)
- if end == -1:
- _tokenization_error(s, strlen, filename, linenr)
- append(s[i:end])
- i = end + 1
- elif c == "&":
- if i + 1 >= strlen:
- # Invalid characters are ignored
- continue
- if s[i + 1] != "&":
- # Invalid characters are ignored
- i += 1
- continue
- append(T_AND)
- i += 2
+ if "\\" in s:
+ # Slow path: This could probably be sped up, but it's a
+ # very unusual case anyway.
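# Illustration (not part of the patch): an input such as "a\"b"
# (containing an escaped quote) takes this path and produces the
# value a"b.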
+ quote = c
+ value = ""
+ while 1:
+ if i >= strlen:
+ _tokenization_error(s, strlen, filename,
+ linenr)
+ c = s[i]
+ if c == quote:
+ break
+ if c == "\\":
+ if i + 1 >= strlen:
+ _tokenization_error(s, strlen, filename,
+ linenr)
+ value += s[i + 1]
+ i += 2
+ else:
+ value += c
+ i += 1
+ i += 1
+ append(value)
+ else:
+ # Fast path: If the string contains no backslashes (almost
+ # always) we can simply look for the matching quote.
+ end = s.find(c, i)
+ if end == -1:
+ _tokenization_error(s, strlen, filename, linenr)
+ append(s[i:end])
+ i = end + 1
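# Illustration (not part of the patch): for the input "foo" bar, the
# opening quote leaves i == 1, s.find('"', 1) locates the closing
# quote at index 4, "foo" is appended, and i becomes 5.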
+
+ elif c == "&":
+ if i + 1 >= strlen:
+ # Invalid characters are ignored
+ continue
+ if s[i + 1] != "&":
+ # Invalid characters are ignored
+ i += 1
+ continue
+ append(T_AND)
+ i += 2
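# Illustration (not part of the patch): "&&" lexes to T_AND, while a
# lone "&" followed by anything else is treated as an invalid
# character and skipped.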
- elif c == "|":
- if i + 1 >= strlen:
- # Invalid characters are ignored
- continue
- if s[i + 1] != "|":
- # Invalid characters are ignored
- i += 1
- continue
- append(T_OR)
- i += 2
-
- elif c == "!":
- if i + 1 >= strlen:
- _tokenization_error(s, strlen, filename, linenr)
- if s[i + 1] == "=":
- append(T_UNEQUAL)
+ elif c == "|":
+ if i + 1 >= strlen:
+ # Invalid characters are ignored
+ continue
+ if s[i + 1] != "|":
+ # Invalid characters are ignored
+ i += 1
+ continue
+ append(T_OR)
i += 2
- else:
- append(T_NOT)
- i += 1
- elif c == "=":
- append(T_EQUAL)
- i += 1
+ elif c == "!":
+ if i + 1 >= strlen:
+ _tokenization_error(s, strlen, filename, linenr)
+ if s[i + 1] == "=":
+ append(T_UNEQUAL)
+ i += 2
+ else:
+ append(T_NOT)
+ i += 1
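# Illustration (not part of the patch): "!=" lexes to T_UNEQUAL,
# while a bare "!" lexes to T_NOT.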
- elif c == "(":
- append(T_OPEN_PAREN)
- i += 1
+ elif c == "=":
+ append(T_EQUAL)
+ i += 1
- elif c == ")":
- append(T_CLOSE_PAREN)
- i += 1
+ elif c == "(":
+ append(T_OPEN_PAREN)
+ i += 1
- elif c == "#":
- break
+ elif c == ")":
+ append(T_CLOSE_PAREN)
+ i += 1
- else:
- # Invalid characters are ignored
- i += 1
- continue
+ elif c == "#":
+ break
+
+ else:
+ # Invalid characters are ignored
+ i += 1
+ continue
previous = tokens[-1]
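
As an illustrative trace (not part of the patch; assuming the keyword
token constants defined elsewhere in kconfiglib), a line such as

    depends on FOO && !BAR

now lexes roughly to [T_DEPENDS, T_ON, <FOO symbol>, T_AND, T_NOT,
<BAR symbol>]: initial_token_re picks off "depends", id_keyword_re
matches "on", "FOO", and "BAR" in the main loop, and the operator
branches handle "&&" and "!".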
@@ -2063,12 +2063,11 @@ bool_str = { False : "false", True : "true" }
string_lex = frozenset((T_BOOL, T_TRISTATE, T_INT, T_HEX, T_STRING, T_CHOICE,
T_PROMPT, T_MENU, T_COMMENT, T_SOURCE, T_MAINMENU))
-# Characters that may appear in symbol names
-sym_chars = frozenset(string.ascii_letters + string.digits + "._/-")
+# Matches the initial token on a line; see _tokenize().
+initial_token_re = re.compile(r"[^\w]*(\w+)")
-# Characters that might be the first significant character on a line. Other
-# characters are ignored at the beginning of a line.
-command_chars = frozenset(string.ascii_letters + string.digits + "_")
+# Matches an identifier/keyword optionally preceded by whitespace
+id_keyword_re = re.compile(r"\s*([\w./-]+)")
# Regular expressions for parsing .config files
set_re = re.compile(r"CONFIG_(\w+)=(.*)")
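
As a usage sketch (illustrative, not from the patch), set_re splits an
assignment line from a .config file into a symbol name and a raw value:

    >>> set_re.match("CONFIG_MODULES=y").groups()
    ('MODULES', 'y')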