# -*- coding: utf-8 -*- from __future__ import unicode_literals import re import string from ._compat import chr from ._compat import decode from ._utils import _escaped from ._utils import RFC_3339_LOOSE from ._utils import parse_rfc3339 from .container import Container from .exceptions import EmptyKeyError from .exceptions import EmptyTableNameError from .exceptions import InternalParserError from .exceptions import InvalidCharInStringError from .exceptions import InvalidDateTimeError from .exceptions import InvalidDateError from .exceptions import InvalidTimeError from .exceptions import InvalidNumberError from .exceptions import InvalidUnicodeValueError from .exceptions import MixedArrayTypesError from .exceptions import ParseError from .exceptions import UnexpectedCharError from .exceptions import UnexpectedEofError from .items import AoT from .items import Array from .items import Bool from .items import BoolType from .items import Comment from .items import Date from .items import DateTime from .items import Float from .items import InlineTable from .items import Integer from .items import Item from .items import Key from .items import KeyType from .items import Null from .items import String from .items import StringType from .items import Table from .items import Time from .items import Trivia from .items import Whitespace from .source import Source from .toml_char import TOMLChar from .toml_document import TOMLDocument class Parser: """ Parser for TOML documents. """ def __init__(self, string): # type: (str) -> None # Input to parse self._src = Source(decode(string)) self._aot_stack = [] @property def _state(self): return self._src.state @property def _idx(self): return self._src.idx @property def _current(self): return self._src.current @property def _marker(self): return self._src.marker def extract(self): # type: () -> str """ Extracts the value between marker and index """ return self._src.extract() def inc(self, exception=None): # type: (Optional[ParseError.__class__]) -> bool """ Increments the parser if the end of the input has not been reached. Returns whether or not it was able to advance. """ return self._src.inc(exception=exception) def inc_n(self, n, exception=None): # type: (int, Optional[ParseError]) -> bool """ Increments the parser by n characters if the end of the input has not been reached. """ return self._src.inc_n(n=n, exception=exception) def consume(self, chars, min=0, max=-1): """ Consume chars until min/max is satisfied is valid. """ return self._src.consume(chars=chars, min=min, max=max) def end(self): # type: () -> bool """ Returns True if the parser has reached the end of the input. """ return self._src.end() def mark(self): # type: () -> None """ Sets the marker to the index's current position """ self._src.mark() def parse_error(self, exception=ParseError, *args): """ Creates a generic "parse error" at the current position. """ return self._src.parse_error(exception, *args) def parse(self): # type: () -> TOMLDocument body = TOMLDocument(True) # Take all keyvals outside of tables/AoT's. while not self.end(): # Break out if a table is found if self._current == "[": break # Otherwise, take and append one KV item = self._parse_item() if not item: break key, value = item if key is not None and key.is_dotted(): # We actually have a table self._handle_dotted_key(body, key, value) elif not self._merge_ws(value, body): body.append(key, value) self.mark() while not self.end(): key, value = self._parse_table() if isinstance(value, Table) and value.is_aot_element(): # This is just the first table in an AoT. Parse the rest of the array # along with it. value = self._parse_aot(value, key.key) body.append(key, value) body.parsing(False) return body def _merge_ws(self, item, container): # type: (Item, Container) -> bool """ Merges the given Item with the last one currently in the given Container if both are whitespace items. Returns True if the items were merged. """ last = container.last_item() if not last: return False if not isinstance(item, Whitespace) or not isinstance(last, Whitespace): return False start = self._idx - (len(last.s) + len(item.s)) container.body[-1] = ( container.body[-1][0], Whitespace(self._src[start : self._idx]), ) return True def _is_child(self, parent, child): # type: (str, str) -> bool """ Returns whether a key is strictly a child of another key. AoT siblings are not considered children of one another. """ parent_parts = tuple(self._split_table_name(parent)) child_parts = tuple(self._split_table_name(child)) if parent_parts == child_parts: return False return parent_parts == child_parts[: len(parent_parts)] def _split_table_name(self, name): # type: (str) -> Generator[Key] in_name = False current = "" t = KeyType.Bare parts = 0 for c in name.strip(): c = TOMLChar(c) if c == ".": if in_name: current += c continue if not current: raise self.parse_error() yield Key(current.strip(), t=t, sep="") parts += 1 current = "" t = KeyType.Bare continue elif c in {"'", '"'}: if in_name: if ( t == KeyType.Literal and c == '"' or t == KeyType.Basic and c == "'" ): current += c continue if c != t.value: raise self.parse_error() in_name = False else: if current and TOMLChar(current[-1]).is_spaces() and not parts: raise self.parse_error() in_name = True t = KeyType.Literal if c == "'" else KeyType.Basic continue elif in_name or c.is_bare_key_char(): if ( not in_name and current and TOMLChar(current[-1]).is_spaces() and not parts ): raise self.parse_error() current += c elif c.is_spaces(): # A space is only valid at this point # if it's in between parts. # We store it for now and will check # later if it's valid current += c continue else: raise self.parse_error() if current.strip(): yield Key(current.strip(), t=t, sep="") def _parse_item(self): # type: () -> Optional[Tuple[Optional[Key], Item]] """ Attempts to parse the next item and returns it, along with its key if the item is value-like. """ self.mark() with self._state as state: while True: c = self._current if c == "\n": # Found a newline; Return all whitespace found up to this point. self.inc() return None, Whitespace(self.extract()) elif c in " \t\r": # Skip whitespace. if not self.inc(): return None, Whitespace(self.extract()) elif c == "#": # Found a comment, parse it indent = self.extract() cws, comment, trail = self._parse_comment_trail() return None, Comment(Trivia(indent, cws, comment, trail)) elif c == "[": # Found a table, delegate to the calling function. return else: # Begining of a KV pair. # Return to beginning of whitespace so it gets included # as indentation for the KV about to be parsed. state.restore = True break return self._parse_key_value(True) def _parse_comment_trail(self): # type: () -> Tuple[str, str, str] """ Returns (comment_ws, comment, trail) If there is no comment, comment_ws and comment will simply be empty. """ if self.end(): return "", "", "" comment = "" comment_ws = "" self.mark() while True: c = self._current if c == "\n": break elif c == "#": comment_ws = self.extract() self.mark() self.inc() # Skip # # The comment itself while not self.end() and not self._current.is_nl() and self.inc(): pass comment = self.extract() self.mark() break elif c in " \t\r": self.inc() else: raise self.parse_error(UnexpectedCharError, c) if self.end(): break while self._current.is_spaces() and self.inc(): pass if self._current == "\r": self.inc() if self._current == "\n": self.inc() trail = "" if self._idx != self._marker or self._current.is_ws(): trail = self.extract() return comment_ws, comment, trail def _parse_key_value(self, parse_comment=False): # type: (bool) -> (Key, Item) # Leading indent self.mark() while self._current.is_spaces() and self.inc(): pass indent = self.extract() # Key key = self._parse_key() if not key.key.strip(): raise self.parse_error(EmptyKeyError) self.mark() found_equals = self._current == "=" while self._current.is_kv_sep() and self.inc(): if self._current == "=": if found_equals: raise self.parse_error(UnexpectedCharError, "=") else: found_equals = True pass key.sep = self.extract() # Value val = self._parse_value() # Comment if parse_comment: cws, comment, trail = self._parse_comment_trail() meta = val.trivia meta.comment_ws = cws meta.comment = comment meta.trail = trail else: val.trivia.trail = "" val.trivia.indent = indent return key, val def _parse_key(self): # type: () -> Key """ Parses a Key at the current position; WS before the key must be exhausted first at the callsite. """ if self._current in "\"'": return self._parse_quoted_key() else: return self._parse_bare_key() def _parse_quoted_key(self): # type: () -> Key """ Parses a key enclosed in either single or double quotes. """ quote_style = self._current key_type = None dotted = False for t in KeyType: if t.value == quote_style: key_type = t break if key_type is None: raise RuntimeError("Should not have entered _parse_quoted_key()") self.inc() self.mark() while self._current != quote_style and self.inc(): pass key = self.extract() if self._current == ".": self.inc() dotted = True key += "." + self._parse_key().as_string() key_type = KeyType.Bare else: self.inc() return Key(key, key_type, "", dotted) def _parse_bare_key(self): # type: () -> Key """ Parses a bare key. """ key_type = None dotted = False self.mark() while self._current.is_bare_key_char() and self.inc(): pass key = self.extract() if self._current == ".": self.inc() dotted = True key += "." + self._parse_key().as_string() key_type = KeyType.Bare return Key(key, key_type, "", dotted) def _handle_dotted_key( self, container, key, value ): # type: (Union[Container, Table], Key, Any) -> None names = tuple(self._split_table_name(key.key)) name = names[0] name._dotted = True if name in container: if isinstance(container, Table): table = container.value.item(name) else: table = container.item(name) else: table = Table(Container(True), Trivia(), False, is_super_table=True) if isinstance(container, Table): container.raw_append(name, table) else: container.append(name, table) for i, _name in enumerate(names[1:]): if i == len(names) - 2: _name.sep = key.sep table.append(_name, value) else: _name._dotted = True if _name in table.value: table = table.value.item(_name) else: table.append( _name, Table( Container(True), Trivia(), False, is_super_table=i < len(names) - 2, ), ) table = table[_name] def _parse_value(self): # type: () -> Item """ Attempts to parse a value at the current position. """ self.mark() c = self._current trivia = Trivia() if c == StringType.SLB.value: return self._parse_basic_string() elif c == StringType.SLL.value: return self._parse_literal_string() elif c == BoolType.TRUE.value[0]: return self._parse_true() elif c == BoolType.FALSE.value[0]: return self._parse_false() elif c == "[": return self._parse_array() elif c == "{": return self._parse_inline_table() elif c in "+-" or self._peek(4) in { "+inf", "-inf", "inf", "+nan", "-nan", "nan", }: # Number while self._current not in " \t\n\r#,]}" and self.inc(): pass raw = self.extract() item = self._parse_number(raw, trivia) if item is not None: return item raise self.parse_error(InvalidNumberError) elif c in string.digits: # Integer, Float, Date, Time or DateTime while self._current not in " \t\n\r#,]}" and self.inc(): pass raw = self.extract() m = RFC_3339_LOOSE.match(raw) if m: if m.group(1) and m.group(5): # datetime try: dt = parse_rfc3339(raw) return DateTime( dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond, dt.tzinfo, trivia, raw, ) except ValueError: raise self.parse_error(InvalidDateTimeError) if m.group(1): try: dt = parse_rfc3339(raw) return Date(dt.year, dt.month, dt.day, trivia, raw) except ValueError: raise self.parse_error(InvalidDateError) if m.group(5): try: t = parse_rfc3339(raw) return Time( t.hour, t.minute, t.second, t.microsecond, t.tzinfo, trivia, raw, ) except ValueError: raise self.parse_error(InvalidTimeError) item = self._parse_number(raw, trivia) if item is not None: return item raise self.parse_error(InvalidNumberError) else: raise self.parse_error(UnexpectedCharError, c) def _parse_true(self): return self._parse_bool(BoolType.TRUE) def _parse_false(self): return self._parse_bool(BoolType.FALSE) def _parse_bool(self, style): # type: (BoolType) -> Bool with self._state: style = BoolType(style) # only keep parsing for bool if the characters match the style # try consuming rest of chars in style for c in style: self.consume(c, min=1, max=1) return Bool(style, Trivia()) def _parse_array(self): # type: () -> Array # Consume opening bracket, EOF here is an issue (middle of array) self.inc(exception=UnexpectedEofError) elems = [] # type: List[Item] prev_value = None while True: # consume whitespace mark = self._idx self.consume(TOMLChar.SPACES) newline = self.consume(TOMLChar.NL) indent = self._src[mark : self._idx] if newline: elems.append(Whitespace(indent)) continue # consume comment if self._current == "#": cws, comment, trail = self._parse_comment_trail() elems.append(Comment(Trivia(indent, cws, comment, trail))) continue # consume indent if indent: elems.append(Whitespace(indent)) continue # consume value if not prev_value: try: elems.append(self._parse_value()) prev_value = True continue except UnexpectedCharError: pass # consume comma if prev_value and self._current == ",": self.inc(exception=UnexpectedEofError) elems.append(Whitespace(",")) prev_value = False continue # consume closing bracket if self._current == "]": # consume closing bracket, EOF here doesn't matter self.inc() break raise self.parse_error(UnexpectedCharError, self._current) try: res = Array(elems, Trivia()) except ValueError: pass else: if res.is_homogeneous(): return res raise self.parse_error(MixedArrayTypesError) def _parse_inline_table(self): # type: () -> InlineTable # consume opening bracket, EOF here is an issue (middle of array) self.inc(exception=UnexpectedEofError) elems = Container(True) trailing_comma = None while True: # consume leading whitespace mark = self._idx self.consume(TOMLChar.SPACES) raw = self._src[mark : self._idx] if raw: elems.add(Whitespace(raw)) if not trailing_comma: # None: empty inline table # False: previous key-value pair was not followed by a comma if self._current == "}": # consume closing bracket, EOF here doesn't matter self.inc() break if trailing_comma is False: raise self.parse_error(UnexpectedCharError, self._current) else: # True: previous key-value pair was followed by a comma if self._current == "}": raise self.parse_error(UnexpectedCharError, self._current) key, val = self._parse_key_value(False) elems.add(key, val) # consume trailing whitespace mark = self._idx self.consume(TOMLChar.SPACES) raw = self._src[mark : self._idx] if raw: elems.add(Whitespace(raw)) # consume trailing comma trailing_comma = self._current == "," if trailing_comma: # consume closing bracket, EOF here is an issue (middle of inline table) self.inc(exception=UnexpectedEofError) return InlineTable(elems, Trivia()) def _parse_number(self, raw, trivia): # type: (str, Trivia) -> Optional[Item] # Leading zeros are not allowed sign = "" if raw.startswith(("+", "-")): sign = raw[0] raw = raw[1:] if ( len(raw) > 1 and raw.startswith("0") and not raw.startswith(("0.", "0o", "0x", "0b")) ): return if raw.startswith(("0o", "0x", "0b")) and sign: return digits = "[0-9]" base = 10 if raw.startswith("0b"): digits = "[01]" base = 2 elif raw.startswith("0o"): digits = "[0-7]" base = 8 elif raw.startswith("0x"): digits = "[0-9a-f]" base = 16 # Underscores should be surrounded by digits clean = re.sub("(?i)(?<={})_(?={})".format(digits, digits), "", raw) if "_" in clean: return if clean.endswith("."): return try: return Integer(int(sign + clean, base), trivia, sign + raw) except ValueError: try: return Float(float(sign + clean), trivia, sign + raw) except ValueError: return def _parse_literal_string(self): # type: () -> String with self._state: return self._parse_string(StringType.SLL) def _parse_basic_string(self): # type: () -> String with self._state: return self._parse_string(StringType.SLB) def _parse_escaped_char(self, multiline): if multiline and self._current.is_ws(): # When the last non-whitespace character on a line is # a \, it will be trimmed along with all whitespace # (including newlines) up to the next non-whitespace # character or closing delimiter. # """\ # hello \ # world""" tmp = "" while self._current.is_ws(): tmp += self._current # consume the whitespace, EOF here is an issue # (middle of string) self.inc(exception=UnexpectedEofError) continue # the escape followed by whitespace must have a newline # before any other chars if "\n" not in tmp: raise self.parse_error(InvalidCharInStringError, self._current) return "" if self._current in _escaped: c = _escaped[self._current] # consume this char, EOF here is an issue (middle of string) self.inc(exception=UnexpectedEofError) return c if self._current in {"u", "U"}: # this needs to be a unicode u, ue = self._peek_unicode(self._current == "U") if u is not None: # consume the U char and the unicode value self.inc_n(len(ue) + 1) return u raise self.parse_error(InvalidUnicodeValueError) raise self.parse_error(InvalidCharInStringError, self._current) def _parse_string(self, delim): # type: (StringType) -> String # only keep parsing for string if the current character matches the delim if self._current != delim.unit: raise self.parse_error( InternalParserError, "Invalid character for string type {}".format(delim), ) # consume the opening/first delim, EOF here is an issue # (middle of string or middle of delim) self.inc(exception=UnexpectedEofError) if self._current == delim.unit: # consume the closing/second delim, we do not care if EOF occurs as # that would simply imply an empty single line string if not self.inc() or self._current != delim.unit: # Empty string return String(delim, "", "", Trivia()) # consume the third delim, EOF here is an issue (middle of string) self.inc(exception=UnexpectedEofError) delim = delim.toggle() # convert delim to multi delim self.mark() # to extract the original string with whitespace and all value = "" # A newline immediately following the opening delimiter will be trimmed. if delim.is_multiline() and self._current == "\n": # consume the newline, EOF here is an issue (middle of string) self.inc(exception=UnexpectedEofError) escaped = False # whether the previous key was ESCAPE while True: if delim.is_singleline() and self._current.is_nl(): # single line cannot have actual newline characters raise self.parse_error(InvalidCharInStringError, self._current) elif not escaped and self._current == delim.unit: # try to process current as a closing delim original = self.extract() close = "" if delim.is_multiline(): # try consuming three delims as this would mean the end of # the string for last in [False, False, True]: if self._current != delim.unit: # Not a triple quote, leave in result as-is. # Adding back the characters we already consumed value += close close = "" # clear the close break close += delim.unit # consume this delim, EOF here is only an issue if this # is not the third (last) delim character self.inc(exception=UnexpectedEofError if not last else None) if not close: # if there is no close characters, keep parsing continue else: # consume the closing delim, we do not care if EOF occurs as # that would simply imply the end of self._src self.inc() return String(delim, value, original, Trivia()) elif delim.is_basic() and escaped: # attempt to parse the current char as an escaped value, an exception # is raised if this fails value += self._parse_escaped_char(delim.is_multiline()) # no longer escaped escaped = False elif delim.is_basic() and self._current == "\\": # the next char is being escaped escaped = True # consume this char, EOF here is an issue (middle of string) self.inc(exception=UnexpectedEofError) else: # this is either a literal string where we keep everything as is, # or this is not a special escaped char in a basic string value += self._current # consume this char, EOF here is an issue (middle of string) self.inc(exception=UnexpectedEofError) def _parse_table( self, parent_name=None ): # type: (Optional[str]) -> Tuple[Key, Union[Table, AoT]] """ Parses a table element. """ if self._current != "[": raise self.parse_error( InternalParserError, "_parse_table() called on non-bracket character." ) indent = self.extract() self.inc() # Skip opening bracket if self.end(): raise self.parse_error(UnexpectedEofError) is_aot = False if self._current == "[": if not self.inc(): raise self.parse_error(UnexpectedEofError) is_aot = True # Consume any whitespace self.mark() while self._current.is_spaces() and self.inc(): pass ws_prefix = self.extract() # Key if self._current in [StringType.SLL.value, StringType.SLB.value]: delimiter = ( StringType.SLL if self._current == StringType.SLL.value else StringType.SLB ) name = self._parse_string(delimiter) name = "{delimiter}{name}{delimiter}".format( delimiter=delimiter.value, name=name ) self.mark() while self._current != "]" and self.inc(): if self.end(): raise self.parse_error(UnexpectedEofError) pass ws_suffix = self.extract() name += ws_suffix else: self.mark() while self._current != "]" and self.inc(): if self.end(): raise self.parse_error(UnexpectedEofError) pass name = self.extract() name = ws_prefix + name if not name.strip(): raise self.parse_error(EmptyTableNameError) key = Key(name, sep="") name_parts = tuple(self._split_table_name(name)) missing_table = False if parent_name: parent_name_parts = tuple(self._split_table_name(parent_name)) else: parent_name_parts = tuple() if len(name_parts) > len(parent_name_parts) + 1: missing_table = True name_parts = name_parts[len(parent_name_parts) :] values = Container(True) self.inc() # Skip closing bracket if is_aot: # TODO: Verify close bracket self.inc() cws, comment, trail = self._parse_comment_trail() result = Null() table = Table( values, Trivia(indent, cws, comment, trail), is_aot, name=name, display_name=name, ) if len(name_parts) > 1: if missing_table: # Missing super table # i.e. a table initialized like this: [foo.bar] # without initializing [foo] # # So we have to create the parent tables table = Table( Container(True), Trivia(indent, cws, comment, trail), is_aot and name_parts[0].key in self._aot_stack, is_super_table=True, name=name_parts[0].key, ) result = table key = name_parts[0] for i, _name in enumerate(name_parts[1:]): if _name in table: child = table[_name] else: child = Table( Container(True), Trivia(indent, cws, comment, trail), is_aot and i == len(name_parts[1:]) - 1, is_super_table=i < len(name_parts[1:]) - 1, name=_name.key, display_name=name if i == len(name_parts[1:]) - 1 else None, ) if is_aot and i == len(name_parts[1:]) - 1: table.append(_name, AoT([child], name=table.name, parsed=True)) else: table.append(_name, child) table = child values = table.value else: if name_parts: key = name_parts[0] while not self.end(): item = self._parse_item() if item: _key, item = item if not self._merge_ws(item, values): if _key is not None and _key.is_dotted(): self._handle_dotted_key(table, _key, item) else: table.raw_append(_key, item) else: if self._current == "[": is_aot_next, name_next = self._peek_table() if self._is_child(name, name_next): key_next, table_next = self._parse_table(name) table.raw_append(key_next, table_next) # Picking up any sibling while not self.end(): _, name_next = self._peek_table() if not self._is_child(name, name_next): break key_next, table_next = self._parse_table(name) table.raw_append(key_next, table_next) break else: raise self.parse_error( InternalParserError, "_parse_item() returned None on a non-bracket character.", ) if isinstance(result, Null): result = table if is_aot and (not self._aot_stack or name != self._aot_stack[-1]): result = self._parse_aot(result, name) return key, result def _peek_table(self): # type: () -> Tuple[bool, str] """ Peeks ahead non-intrusively by cloning then restoring the initial state of the parser. Returns the name of the table about to be parsed, as well as whether it is part of an AoT. """ # we always want to restore after exiting this scope with self._state(save_marker=True, restore=True): if self._current != "[": raise self.parse_error( InternalParserError, "_peek_table() entered on non-bracket character", ) # AoT self.inc() is_aot = False if self._current == "[": self.inc() is_aot = True self.mark() while self._current != "]" and self.inc(): table_name = self.extract() return is_aot, table_name def _parse_aot(self, first, name_first): # type: (Table, str) -> AoT """ Parses all siblings of the provided table first and bundles them into an AoT. """ payload = [first] self._aot_stack.append(name_first) while not self.end(): is_aot_next, name_next = self._peek_table() if is_aot_next and name_next == name_first: _, table = self._parse_table(name_first) payload.append(table) else: break self._aot_stack.pop() return AoT(payload, parsed=True) def _peek(self, n): # type: (int) -> str """ Peeks ahead n characters. n is the max number of characters that will be peeked. """ # we always want to restore after exiting this scope with self._state(restore=True): buf = "" for _ in range(n): if self._current not in " \t\n\r#,]}": buf += self._current self.inc() continue break return buf def _peek_unicode( self, is_long ): # type: (bool) -> Tuple[Optional[str], Optional[str]] """ Peeks ahead non-intrusively by cloning then restoring the initial state of the parser. Returns the unicode value is it's a valid one else None. """ # we always want to restore after exiting this scope with self._state(save_marker=True, restore=True): if self._current not in {"u", "U"}: raise self.parse_error( InternalParserError, "_peek_unicode() entered on non-unicode value" ) self.inc() # Dropping prefix self.mark() if is_long: chars = 8 else: chars = 4 if not self.inc_n(chars): value, extracted = None, None else: extracted = self.extract() if extracted[0].lower() == "d" and extracted[1].strip("01234567"): return None, None try: value = chr(int(extracted, 16)) except ValueError: value = None return value, extracted