"""Streaming parser for text made of nested, comma-separated ``{...}`` groups."""
from typing import Union, List, Iterator

# Byte-order mark as it appears after decoding; filtered out by text_reader.
BOM = '\ufeff'

# Number of characters requested from the file per read in text_reader.
_READ_CHARS = 8192

# What brackets_parser yields: plain string elements, or a nested parser
# (itself an iterator of the same shape) for every nested ``{...}`` group.
ParsingIterator = Iterator[Union[str, 'ParsingIterator']]
# Fully materialised form of the above: a string or a list of such values.
NestedStringList = Union[str, List['NestedStringList']]


def text_reader(filename: str, encoding: str = 'utf-8') -> Iterator[str]:
    """Yield the decoded contents of *filename* one character at a time.

    Every BOM character (``'\\ufeff'``) is dropped, wherever it occurs.
    The file is read in blocks of ``_READ_CHARS`` characters rather than
    one character per call; the yielded stream is the same.

    Args:
        filename: Path of the text file to read.
        encoding: Text encoding passed to ``open``.

    Yields:
        Single-character strings, in file order.
    """
    with open(filename, 'rt', encoding=encoding, buffering=1) as f:
        # iter(callable, sentinel) stops when read() returns '' at EOF.
        for block in iter(lambda: f.read(_READ_CHARS), ''):
            for char in block:
                if char != BOM:
                    yield char


def brackets_parser(reader: Iterator[str], level: int = 0) \
        -> ParsingIterator:
    """Lazily split a character stream into bracket-delimited elements.

    Rules, applied per item taken from *reader*:

    * ``"`` toggles quoted mode and is never part of an element.
    * Outside quotes, ``,`` ends the current element and ``}`` ends the
      current element and this group. Empty elements are not yielded.
    * Outside quotes, ``{`` starts a nested group, which is yielded as a
      nested parser reading from the same *reader*. At ``level == 0`` the
      first ``{`` is treated as the document root and only bumps the level.
    * ``\\n`` and ``\\r`` are dropped, even inside quotes.
    * Anything else is appended to the current element.

    Text still pending when *reader* runs out (no closing ``}``) is
    discarded.

    Args:
        reader: Iterator of characters; it is shared with all nested
            parsers, so it must not be advanced by anyone else.
        level: Nesting depth of this parser; ``0`` means "document root
            not opened yet".

    Yields:
        ``str`` elements and nested parsers, in document order.
    """
    parts: List[str] = []
    is_quoted: bool = False
    for chunk in reader:
        if chunk == '"':
            is_quoted = not is_quoted
            continue
        if not is_quoted:
            if chunk == '{':
                if level == 0:
                    # skip document root, always look at first {
                    level += 1
                    continue
                nested = brackets_parser(reader, level + 1)
                yield nested
                # The consumer may not have exhausted the nested group;
                # finish it so the shared reader sits just past its '}'.
                for _ in nested:
                    pass
                continue
            if chunk == ',':
                if parts:
                    yield ''.join(parts)
                parts = []
                continue
            if chunk == '}':
                if parts:
                    yield ''.join(parts)
                break
        if chunk == '\n' or chunk == '\r':
            continue
        parts.append(chunk)


def brackets_select(parser: ParsingIterator, selector: str = '') \
        -> NestedStringList:
    """Materialise *parser*, or the sub-element addressed by *selector*.

    Args:
        parser: Iterator produced by ``brackets_parser`` (any iterable of
            strings and nested iterables works).
        selector: ``'/'``-separated zero-based indices, one per nesting
            level, e.g. ``'2/1'``. An empty selector selects everything.

    Returns:
        The selected element: a ``str`` for a plain element, otherwise a
        list in which nested groups are recursively converted to lists.

    Raises:
        ValueError: If a selector component is not an integer.
        StopIteration: If an index is not found before the iterator at
            that level is exhausted.
    """
    if selector == '':
        remaining: List[str] = []
        elem = parser
    else:
        head, *remaining = selector.split('/')
        idx: int = int(head)
        # Advances the parser only as far as the requested position.
        elem = next(item for pos, item in enumerate(parser) if pos == idx)
    if remaining:
        return brackets_select(elem, '/'.join(remaining))
    if isinstance(elem, str):
        return elem
    return [item if isinstance(item, str) else brackets_select(item)
            for item in elem]


def get_infobases(filename: str) -> List[str]:
    """Return element 1 of every group listed in top-level element 2.

    The file is parsed with ``text_reader`` and ``brackets_parser``; the
    first item of top-level element 2 is skipped and element 1 of each
    remaining item is collected.
    NOTE(review): presumably these are infobase names and the skipped
    item is a header/count - confirm against the file format.

    Args:
        filename: Path of the bracket-formatted file (read as UTF-8).

    Returns:
        The collected values, in file order.
    """
    reader = text_reader(filename)
    parser = brackets_parser(reader)
    entries = brackets_select(parser, '2')[1:]
    return [entry[1] for entry in entries]