Skip invalid characters when tokenizing.

Emulate zconf.l w.r.t. invalid characters at different locations within a line. This eliminates the need for special hacks to handle -*help-* "tokens". The Linux 3.7.0 configuration files now parse without errors.
author: Ulf Magnusson <ulfalizer@gmail.com> 2012-11-08 14:34:25 +0100
committer: Ulf Magnusson <ulfalizer@gmail.com> 2012-11-08 14:34:25 +0100
commit: b9409a7ca140f5f677b46293f0c8db52b14280d9 (patch)
tree: 95c1b7b5005b2c33adfc4f468a48c97990194582 /kconfiglib.py
parent: 00c10acf00140f5f14324c7ebbc316b10d0e4688 (diff)
1 files changed, 57 insertions, 14 deletions
diff --git a/kconfiglib.py b/kconfiglib.py
index db8feb4..ee81c50 100644
--- a/kconfiglib.py
+++ b/kconfiglib.py
@@ -812,7 +812,7 @@ class Config():
           implementation and by kconfiglib, so that MODULES needs to be enabled
           for the expression to be true. Pass True here if you want that to
           happen; otherwise, pass False."""
-        return self._eval_expr(self._parse_expr(self._tokenize(s, False),
+        return self._eval_expr(self._parse_expr(self._tokenize(s, True),
                                                 None,
                                                 s,
                                                 transform_m = transform_m))
@@ -899,7 +899,7 @@ class Config():
 
     def _tokenize(self,
                   s,
-                  add_sym_if_not_exists = True,
+                  for_eval = False,
                   filename = None,
                   linenr = None):
         """Returns a _Feed instance containing tokens derived from the string
@@ -908,9 +908,9 @@ class Config():
         (I experimented with a regular expression implementation, but it came
         out 5% _slower_ and wouldn't have been as flexible.)
 
-        add_sym_if_not_exists -- False when we do not want to register new
-                                 symbols, such as when called from
-                                 Config.eval()."""
+        for_eval -- True when parsing an expression for a call to
+                    Config.eval(), in which case we should not treat the first
+                    token specially nor register new symbols."""
         tokens = []
         previous = None
 
@@ -920,6 +920,36 @@ class Config():
         # The current index in the string being tokenized
         i = 0
 
+        if not for_eval:
+            # The initial word on a line is parsed specially. Let
+            # command_chars = [A-Za-z0-9_]. Then
+            #  - leading non-command_chars characters on the line are ignored, and
+            #  - the first token consists the following one or more command_chars
+            #    characters.
+            # This is why things like "----help--" are accepted.
+
+            # Ignore leading non-command_chars characters
+            while i < strlen and s[i] not in command_chars:
+                # We need to look out for comments
+                if s[i] == "#":
+                    return _Feed([])
+                i += 1
+            keyword_start = i
+            # Locate the end of the keyword
+            while i < strlen and s[i] in command_chars:
+                i += 1
+            keyword_lexeme = s[keyword_start:i]
+            # Blank line?
+            if keyword_lexeme == "":
+                return _Feed([])
+            keyword = keywords.get(keyword_lexeme)
+            if keyword is None:
+                # We expect a keyword as the first token
+                _tokenization_error(s, strlen, filename, linenr)
+            append(keyword)
+            previous = keyword
+
+        # Main tokenization loop (handles tokens past the first one)
         while i < strlen:
             c = s[i]
 
@@ -951,17 +981,23 @@ class Config():
 
             elif c == "&":
                 if i + 1 >= strlen:
-                    _tokenization_error(s, strlen, filename, linenr)
+                    # Invalid characters are ignored
+                    continue
                 if s[i + 1] != "&":
-                    _tokenization_error(s, i + 1, filename, linenr)
+                    # Invalid characters are ignored
+                    i += 1
+                    continue
                 append(T_AND)
                 i += 2
 
             elif c == "|":
                 if i + 1 >= strlen:
-                    _tokenization_error(s, strlen, filename, linenr)
+                    # Invalid characters are ignored
+                    continue
                 if s[i + 1] != "|":
-                    _tokenization_error(s, i + 1, filename, linenr)
+                    # Invalid characters are ignored
+                    i += 1
+                    continue
                 append(T_OR)
                 i += 2
 
@@ -989,6 +1025,11 @@ class Config():
             elif c == "#":
                 break
 
+            elif c not in sym_chars:
+                # Invalid characters are ignored
+                i += 1
+                continue
+
             else: # Symbol or keyword
                 name_start = i
 
@@ -1011,7 +1052,7 @@ class Config():
                     # We're dealing with a symbol. _sym_lookup() will take care
                     # of allocating a new Symbol instance if it's the first
                     # time we see it.
-                    sym = self._sym_lookup(name, add_sym_if_not_exists)
+                    sym = self._sym_lookup(name, not for_eval)
 
                     if previous == T_CONFIG:
                         # If the previous token is T_CONFIG ("config"), we're
@@ -1194,7 +1235,7 @@ class Config():
 
                 linenr = line_feeder.get_linenr()
 
-                tokens = self._tokenize(line, True, filename, linenr)
+                tokens = self._tokenize(line, False, filename, linenr)
 
             if tokens.is_empty():
                 continue
@@ -1386,7 +1427,7 @@ class Config():
             filename = line_feeder.get_filename()
             linenr = line_feeder.get_linenr()
 
-            tokens = self._tokenize(line, True, filename, linenr)
+            tokens = self._tokenize(line, False, filename, linenr)
 
             if tokens.is_empty():
                 continue
@@ -2249,8 +2290,6 @@ keywords = {
         "comment"        : T_COMMENT,
         "menuconfig"     : T_MENUCONFIG,
         "help"           : T_HELP,
-        "---help---"     : T_HELP,
-        "---"            : T_HELP,  # Hack to handle CONFIG_W1_CON
         "if"             : T_IF,
         "depends"        : T_DEPENDS,
         "on"             : T_ON,
@@ -2289,6 +2328,10 @@ string_lex = (T_BOOL, T_TRISTATE, T_INT, T_HEX, T_STRING,
 # Characters that may appear in symbol names
 sym_chars = frozenset(string.ascii_letters + string.digits + "._/-")
 
+# Characters that might be the first significant character on a line. Other
+# characters are ignored at the beginning of a line.
+command_chars = frozenset(string.ascii_letters + string.digits + "_");
+
 # Regular expressions for parsing .config files
 set_re   = re.compile(r"CONFIG_(\w+)=(.*)")
 unset_re = re.compile(r"# CONFIG_(\w+) is not set")
author	Ulf Magnusson <ulfalizer@gmail.com>	2012-11-08 14:34:25 +0100
committer	Ulf Magnusson <ulfalizer@gmail.com>	2012-11-08 14:34:25 +0100
commit	b9409a7ca140f5f677b46293f0c8db52b14280d9 (patch)
tree	95c1b7b5005b2c33adfc4f468a48c97990194582 /kconfiglib.py
parent	00c10acf00140f5f14324c7ebbc316b10d0e4688 (diff)