pub1c-web/brackets.py
2021-05-21 15:19:11 +03:00

77 lines
2.3 KiB
Python

from typing import Union, List, Iterator
# U+FEFF byte-order mark; text_reader() drops this character from its output.
BOM = '\ufeff'
# Recursive alias: an iterator whose items are either strings or nested
# iterators of the same kind (the annotated return type of brackets_parser()).
ParsingIterator = Iterator[Union[str, 'ParsingIterator']]
# Recursive alias: either a string or a list whose items are themselves
# NestedStringList values (the annotated return type of brackets_select()).
NestedStringList = Union[str, List['NestedStringList']]
def text_reader(filename: str, encoding: str = 'utf-8') -> Iterator[str]:
    """Yield the characters of a text file one at a time, skipping BOMs.

    The file is opened in text mode, so the usual universal-newline
    translation applies. Every occurrence of the ``BOM`` character is
    dropped, wherever it appears in the file.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Yields:
        Each decoded character except ``BOM``, in file order.
    """
    read_size = 8192
    with open(filename, 'rt', encoding=encoding) as f:
        # Fetch text in blocks instead of issuing one read() call per
        # character; the sequence of yielded characters is unchanged.
        while True:
            block = f.read(read_size)
            if not block:
                break
            for char in block:
                if char != BOM:
                    yield char
def brackets_parser(reader: Iterator[str], level: int = 0) \
        -> ParsingIterator:
    """Lazily split a brace-delimited character stream into elements.

    Outside double quotes, ``,`` terminates the current element, ``{`` opens
    a nested group and ``}`` closes the current group. A double quote toggles
    quoting and is never part of the element text; while quoting is on,
    braces and commas are kept as ordinary text. Line-feed and
    carriage-return items are dropped everywhere. Empty elements are not
    yielded, and text still pending when ``reader`` runs out is discarded.

    Args:
        reader: Iterator of text pieces (single characters when fed by
            ``text_reader``). Nested groups read from this same iterator.
        level: Nesting depth. At ``0`` the first unquoted ``{`` is taken as
            the document root and swallowed instead of opening a group.

    Yields:
        Each non-empty element as ``str``, or, for a nested group, another
        ``brackets_parser`` generator. A nested generator is drained when
        this generator is resumed, so read it before advancing the parent.
    """
    token = ''
    in_quotes = False
    for char in reader:
        if char == '"':
            in_quotes = not in_quotes
        elif not in_quotes and char == '{':
            if level == 0:
                # The outermost brace only wraps the document; do not
                # report it as a group of its own.
                level += 1
            else:
                child = brackets_parser(reader, level + 1)
                yield child
                # Exhaust whatever the consumer left unread so the shared
                # reader is positioned just past the group's closing brace.
                for _ in child:
                    pass
        elif not in_quotes and char == ',':
            if token:
                yield token
                token = ''
        elif not in_quotes and char == '}':
            if token:
                yield token
            return
        elif char not in ('\n', '\r'):
            token += char
def brackets_select(parser: ParsingIterator, selector: str = '') \
        -> NestedStringList:
    """Pick an element out of a parsed stream and materialise it.

    Args:
        parser: Iterator of strings and nested iterators, such as the one
            produced by ``brackets_parser``. It is consumed while selecting.
        selector: ``/``-separated zero-based indexes, one per nesting level
            (for example ``'2/0'``). The empty string selects ``parser``
            itself.

    Returns:
        The selected element: a string as is, or a group converted
        recursively into nested lists of strings.

    Raises:
        ValueError: If a selector component is not an integer.
        StopIteration: If an index is beyond the end of its level.
    """
    if selector == '':
        elem = parser
        remainder = ''
    else:
        # Peel off the first index; whatever follows the first '/' is
        # resolved by the recursive call below.
        head, _, remainder = selector.partition('/')
        idx = int(head)
        elem = next(item for pos, item in enumerate(parser) if pos == idx)
    if remainder:
        return brackets_select(elem, remainder)
    # isinstance() also treats str subclasses as leaves, where an exact
    # type comparison would iterate them character by character.
    if isinstance(elem, str):
        return elem
    return [item if isinstance(item, str) else brackets_select(item)
            for item in elem]
def get_infobases(filename: str) -> List[str]:
    """Return one field from each entry of a brace-formatted list file.

    The file is parsed with ``brackets_parser``. The third top-level element
    (selector ``'2'``) is taken, its first item is skipped, and the second
    item of every remaining entry is collected.
    NOTE(review): presumably the skipped item is a header or count and the
    collected field is the infobase name; confirm against the file format.

    Args:
        filename: Path of the file to parse (read as UTF-8).

    Returns:
        The item at index 1 of each selected entry, in file order.
    """
    tree = brackets_parser(text_reader(filename))
    entries = brackets_select(tree, '2')[1:]
    return [entry[1] for entry in entries]