77 lines
2.3 KiB
Python
77 lines
2.3 KiB
Python
|
|
from typing import Union, List, Iterator
|
|
|
|
# U+FEFF, the Unicode byte-order mark character; text_reader filters it out
# of the characters it yields.
BOM = '\ufeff'
|
|
# Recursive alias for a lazy parse stream: every item is either a plain
# string or another stream of the same shape.
ParsingIterator = Iterator[Union[str, 'ParsingIterator']]
|
|
# Recursive alias for a fully materialised result: a single string, or a
# list whose items are themselves strings or further nested lists.
NestedStringList = Union[str, List['NestedStringList']]
|
|
|
|
|
|
def text_reader(filename: str, encoding: str = 'utf-8') -> Iterator[str]:
    """Lazily yield the characters of a text file, one at a time.

    The file is opened in text mode and decoded with ``encoding``. Every
    occurrence of the ``BOM`` character (U+FEFF) is dropped, wherever it
    appears. The file stays open until the generator is exhausted or closed.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Yields:
        Single-character strings in file order, with BOM characters removed.
    """
    # Decode the file in blocks instead of paying for one ``read(1)`` call
    # per character; the yielded sequence is exactly the same.
    read_size = 8192
    with open(filename, 'rt', encoding=encoding) as f:
        while True:
            block: str = f.read(read_size)
            if not block:
                # an empty string from read() means end of file
                break
            for char in block:
                if char != BOM:
                    yield char
|
|
|
|
|
|
def brackets_parser(reader: Iterator[str], level: int = 0) \
        -> ParsingIterator:
    """Lazily split a brace-delimited character stream into its elements.

    Items are pulled from ``reader`` one at a time. Outside double quotes a
    comma ends the current element, ``{`` opens a nested group and ``}``
    closes the group being parsed, which ends this generator. A double quote
    toggles quoting and is never part of the output; while quoting is on,
    braces and commas are ordinary text. Line feeds and carriage returns are
    always discarded, and empty elements are never yielded.

    A nested group is yielded as another ``brackets_parser`` generator that
    shares ``reader``. Consume it before requesting the next item from this
    generator: whatever is left of it is drained at that point.

    Args:
        reader: Source of characters.
        level: Nesting depth. At ``0`` the first unquoted ``{`` is treated as
            the document root and only entered, not yielded.

    Yields:
        Element strings and generators for nested groups, in input order.
    """
    token: str = ''
    in_quotes: bool = False
    for char in reader:
        if char == '"':
            in_quotes = not in_quotes
        elif char in ('\n', '\r'):
            # line breaks never reach the output, quoted or not
            continue
        elif in_quotes or char not in ('{', ',', '}'):
            # ordinary text, including structural characters inside quotes
            token += char
        elif char == ',':
            if token:
                yield token
            token = ''
        elif char == '}':
            if token:
                yield token
            return
        elif level == 0:
            # skip document root: the first '{' is only entered
            level += 1
        else:
            child = brackets_parser(reader, level + 1)
            yield child
            # make sure the nested group is fully consumed before moving on
            for _ in child:
                pass
|
|
|
|
|
|
def brackets_select(parser: ParsingIterator, selector: str = '') \
        -> NestedStringList:
    """Pick one element out of a parse stream and materialise it.

    ``selector`` is a ``/``-separated path of zero-based positions, e.g.
    ``'2/0'`` means "item 0 of item 2". An empty selector selects the whole
    stream. The selected element is returned as a string if it is one;
    otherwise it is expanded recursively into nested lists of strings.

    Args:
        parser: Stream of strings and nested streams. It is consumed up to
            and including the selected position.
        selector: Path to the wanted element; ``''`` selects everything.

    Returns:
        The selected string, or a nested list of strings.

    Raises:
        ValueError: If a path component is not an integer.
        StopIteration: If a position lies beyond the end of its stream
            (kept from the original implementation for compatibility).
    """
    if selector:
        head, _, rest = selector.partition('/')
        wanted = int(head)
        # Walk the stream only as far as the requested position.
        elem = next(item for pos, item in enumerate(parser) if pos == wanted)
    else:
        elem, rest = parser, ''

    if rest:
        # more path components left: descend into the selected element
        return brackets_select(elem, rest)
    if isinstance(elem, str):
        return elem
    return [
        child if isinstance(child, str) else brackets_select(child)
        for child in elem
    ]
|
|
|
|
|
|
def get_infobases(filename: str) -> List[str]:
    """Read a bracket-formatted file and return one field per listed entry.

    The file is streamed through ``text_reader`` and ``brackets_parser``.
    Root element ``2`` is materialised, its first item is skipped, and item
    ``1`` of every remaining entry is collected.

    NOTE(review): presumably element 2 is the infobase table, its first item
    a header or count, and field 1 of each entry the infobase name; confirm
    against the file format.

    Args:
        filename: Path of the file to parse (decoded with the default
            encoding of ``text_reader``).

    Returns:
        Item ``1`` of each entry after the first in root element ``2``.
    """
    chars = text_reader(filename)
    tree = brackets_parser(chars)
    # root element 2, without its leading item
    entries = brackets_select(tree, '2')[1:]
    return [entry[1] for entry in entries]
|