parser refactor

This commit is contained in:
Dmitry Belyaev 2022-09-20 14:31:37 +03:00
parent 7eaa394d7d
commit d77e164d07
Signed by: b4tman
GPG Key ID: 41A00BF15EA7E5F3

View File

@ -26,6 +26,24 @@ enum KeywordType {
CurrentScope,
}
impl FromStr for KeywordType {
type Err = ();
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
use KeywordType::*;
Ok(match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " | "Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
})
}
}
#[derive(Debug, Clone, Copy)]
enum DataScope {
Global,
@ -33,7 +51,7 @@ enum DataScope {
QuestionContent,
}
struct Context {
struct QuestionsParser {
// global output value
data: json::JsonValue,
// temp questions array
@ -57,66 +75,9 @@ struct Context {
last_tag: String,
}
// check questions before push
trait PushIfValid {
fn is_valid(&self) -> bool;
fn push_if_valid(&mut self, value: json::JsonValue);
}
impl PushIfValid for json::JsonValue {
fn is_valid(&self) -> bool {
self.has_key("Вопрос") && self.has_key("Ответ")
}
fn push_if_valid(&mut self, value: json::JsonValue) {
if value.is_valid() {
self.push(value).unwrap_or(())
}
}
}
impl Context {
fn new() -> Context {
Context {
data: json::JsonValue::new_object(),
questions: json::JsonValue::new_array(),
cur_keyword_type: None,
cur_question: json::JsonValue::new_object(),
cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(),
cur_content: Vec::<String>::new(),
cur_scope: DataScope::Global,
have_new_question: false,
last_keyword_type: None,
last_tag: String::new(),
}
}
}
impl FromStr for KeywordType {
type Err = ();
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
use KeywordType::*;
Ok(match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " | "Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
})
}
}
async fn parse_file(
entry_reader: impl AsyncReadExt + Unpin,
) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
let buf_reader = BufReader::new(entry_reader);
let mut lines = buf_reader.lines();
let patterns = vec![
/// Text questions parser
impl QuestionsParser {
const PATTERNS: &'static [&'static str] = &[
"Чемпионат:",
"Пакет:",
"URL:",
@ -141,87 +102,174 @@ async fn parse_file(
"Комментарий:",
"Комментарии:",
];
let mut context = Context::new();
let mut ctx = &mut context;
/// create new parser
pub fn new() -> QuestionsParser {
QuestionsParser {
data: json::JsonValue::new_object(),
questions: json::JsonValue::new_array(),
cur_keyword_type: None,
cur_question: json::JsonValue::new_object(),
cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(),
cur_content: Vec::<String>::new(),
cur_scope: DataScope::Global,
have_new_question: false,
last_keyword_type: None,
last_tag: String::new(),
}
}
/// join current content lines
fn get_current_content(&self) -> String {
self.cur_content.join("\n")
}
/// clear current content
fn clear_current_content(&mut self) {
self.cur_content.clear()
}
/// add new line to current content
fn append_to_current_content(&mut self, line: String) {
self.cur_content.push(line);
}
/// check current question have required fields
fn is_current_question_valid(&self) -> bool {
self.cur_question.has_key("Вопрос") && self.cur_question.has_key("Ответ")
}
/// add current question to parsed array
fn add_cur_question(&mut self) {
if self.is_current_question_valid() {
self.questions.push(self.cur_question.clone()).unwrap_or(())
}
}
/// set current content to last tag(keyword) to data scope
fn apply_content_to(&mut self, scope: DataScope) {
let content = self.get_current_content();
// match value to store data
let scope_data = match scope {
DataScope::Global => &mut self.data,
DataScope::QuestionPre => &mut self.cur_question_pre,
DataScope::QuestionContent => &mut self.cur_question,
};
scope_data[&self.last_tag] = content.into();
self.clear_current_content();
}
/// set current content to last tag(keyword) to current scope
fn apply_content_to_cur_scope(&mut self) {
self.apply_content_to(self.cur_scope);
}
/// set current scope
fn set_scope(&mut self, scope: DataScope) {
self.cur_scope = scope;
}
/// set current scope and set current content to last tag(keyword) to data scope
fn set_scope_and_apply(&mut self, scope: DataScope) {
self.set_scope(scope);
self.apply_content_to_cur_scope();
}
/// add last question (if have) and start collecting new one
fn start_new_question(&mut self) {
// store prev question before reading new
if self.have_new_question {
self.add_cur_question();
}
// prepare to read new question data with cur_question_pre values
self.cur_question = self.cur_question_pre.clone();
self.have_new_question = true;
}
/// check last tag(keyword) and set current content to corresponding data scope
fn apply_content_for_last_keyword(&mut self) {
// apply accumulated content when new keyword found
match self.last_keyword_type {
Some(KeywordType::Global) => {
self.set_scope_and_apply(DataScope::Global);
}
Some(KeywordType::QuestionPre) => {
self.set_scope_and_apply(DataScope::QuestionPre);
}
Some(KeywordType::QuestionStart) => {
self.start_new_question();
self.set_scope_and_apply(DataScope::QuestionContent);
}
Some(KeywordType::QuestionContent) => {
self.apply_content_to(DataScope::QuestionContent);
}
Some(KeywordType::CurrentScope) => {
self.apply_content_to_cur_scope();
}
_ => (), //None or Ignore
};
}
/// set current keyword(tag) and type as last, and set new as current
fn set_new_keyword(&mut self, keyword: &str) {
self.last_keyword_type = self.cur_keyword_type;
self.last_tag = self.cur_tag.clone();
self.cur_keyword_type = Some(keyword.parse().unwrap());
self.cur_tag = keyword.replace(' ', "").replace(':', "");
}
/// if line matched keyword
fn on_keyword_match(&mut self, line: &str, keyword: &str) {
self.set_new_keyword(keyword);
// remember question id
if let Some(KeywordType::QuestionStart) = self.cur_keyword_type {
self.cur_question_pre["id"] = line.replace(':', "").into();
};
self.apply_content_for_last_keyword();
}
/// parse next line
pub fn parse_line(&mut self, line: &str) {
match QuestionsParser::PATTERNS
.iter() // find keyword
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
{
Some(pattern) => {
self.on_keyword_match(line, pattern);
}
None => {
self.append_to_current_content(line.to_string());
}
}
}
/// finish parsing
pub fn finish(&mut self) {
if self.have_new_question && !self.cur_content.is_empty() {
self.cur_question[&self.cur_tag] = self.get_current_content().into();
self.add_cur_question();
self.clear_current_content();
self.have_new_question = false;
}
self.data["Вопросы"] = self.questions.clone();
}
/// get parsed data
pub fn get_parsed(self) -> json::JsonValue {
self.data
}
}
async fn parse_file(
entry_reader: impl AsyncReadExt + Unpin,
) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
let buf_reader = BufReader::new(entry_reader);
let mut lines = buf_reader.lines();
let mut parser = QuestionsParser::new();
while let Some(line_r) = lines.next_line().await? {
let line = line_r.trim();
if line.is_empty() {
continue;
}
let line = line.to_string();
match patterns
.iter() // find keyword
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
{
Some(pattern) => {
use KeywordType::*;
ctx.last_keyword_type = ctx.cur_keyword_type;
ctx.last_tag = ctx.cur_tag.clone();
ctx.cur_keyword_type = Some(pattern.parse().unwrap());
ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
// remember question id
if let Some(QuestionStart) = ctx.cur_keyword_type {
ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
};
// apply accumulated content when new keyword found
match ctx.last_keyword_type {
Some(Global) => {
ctx.cur_scope = DataScope::Global;
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
}
Some(QuestionPre) => {
ctx.cur_scope = DataScope::QuestionPre;
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(QuestionStart) => {
ctx.cur_scope = DataScope::QuestionContent;
// store prev question before reading new
if ctx.have_new_question {
ctx.questions.push_if_valid(ctx.cur_question.clone());
}
// prepare to read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone();
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true;
}
Some(QuestionContent) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(CurrentScope) => {
// match value to store data
let scope_data = match ctx.cur_scope {
DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre,
DataScope::QuestionContent => &mut ctx.cur_question,
};
scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
_ => (), //None or Ignore
};
// clear content
ctx.cur_content.clear();
}
None => {
// accumulate content if line is not a keyword
ctx.cur_content.push(line);
}
}
parser.parse_line(line);
}
// finish reading last question
if ctx.have_new_question && !ctx.cur_content.is_empty() {
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
ctx.questions.push_if_valid(ctx.cur_question.clone());
ctx.have_new_question = false;
}
ctx.data["Вопросы"] = ctx.questions.clone();
Ok(ctx.data.clone())
parser.finish();
Ok(parser.get_parsed())
}
struct WriteQueueItem {
@ -305,7 +353,7 @@ async fn data_writer(queue: WriteQueue) {
}
}
writer.close().await.unwrap();
println!("write done ✅");
}