279 lines
8.9 KiB
Rust
279 lines
8.9 KiB
Rust
extern crate async_zip;
|
||
extern crate futures;
|
||
extern crate json;
|
||
extern crate tokio;
|
||
|
||
use async_zip::read::fs::ZipFileReader;
|
||
use std::path::PathBuf;
|
||
use std::str::FromStr;
|
||
use tokio::{fs, task};
|
||
use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader};
|
||
|
||
// Input archive containing the question base as UTF-8 text files.
const BASE_FILENAME: &str = "baza_utf8.zip";
// Directory where the generated .json files are written.
const OUTPUT_PATH: &str = "json";
/// Classifies a keyword found at the start of a line in the source text.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    /// Keyword whose content is dropped entirely ("Мета:").
    Ignore,
    /// Field that belongs to the whole file ("Чемпионат:", "Пакет:").
    Global,
    /// Field shared by the questions that follow it ("Тур:").
    QuestionPre,
    /// Opens a new question ("Вопрос N:" / "Вопрос:").
    QuestionStart,
    /// Field of the question currently being read ("Ответ:", "Зачет:").
    QuestionContent,
    /// Any other keyword; its content goes to whichever scope is active.
    CurrentScope,
}
/// Destination for accumulated field content when the keyword itself
/// does not dictate one (see `KeywordType::CurrentScope`).
#[derive(Debug, Clone, Copy)]
enum DataScope {
    /// Top-level output object.
    Global,
    /// Template object copied into every new question.
    QuestionPre,
    /// The question currently being read.
    QuestionContent,
}
/// Mutable state of the line-oriented parser in `parse_file`.
struct Context {
    // global output value
    data: json::JsonValue,
    // temp questions array
    questions: json::JsonValue,
    // type of the keyword currently being read (None before the first one)
    cur_keyword_type: Option<KeywordType>,
    // temp question value
    cur_question: json::JsonValue,
    // temp value for fields read before the questions; copied into each new question
    cur_question_pre: json::JsonValue,
    // scope for data fields
    cur_scope: DataScope,
    // current json key
    cur_tag: String,
    // current json value, accumulated line by line
    cur_content: Vec<String>,
    // need to push temp question value if true
    have_new_question: bool,
    // prev. keyword type
    last_keyword_type: Option<KeywordType>,
    // prev. json key (used to store accumulated content when a new keyword is read)
    last_tag: String,
}
/// Validation hook used to check questions before pushing them
/// into the output array.
trait PushIfValid {
    /// Returns true when the value carries the mandatory fields.
    fn is_valid(&self) -> bool;
    /// Appends `value` to `self` only when `value.is_valid()`.
    fn push_if_valid(&mut self, value: json::JsonValue);
}
impl PushIfValid for json::JsonValue {
|
||
fn is_valid(&self) -> bool {
|
||
self.has_key("Вопрос") && self.has_key("Ответ")
|
||
}
|
||
fn push_if_valid(&mut self, value: json::JsonValue) {
|
||
if value.is_valid() {
|
||
self.push(value).unwrap_or(())
|
||
}
|
||
}
|
||
}
|
||
|
||
impl Context {
|
||
fn new() -> Context {
|
||
Context {
|
||
data: json::JsonValue::new_object(),
|
||
questions: json::JsonValue::new_array(),
|
||
cur_keyword_type: None,
|
||
cur_question: json::JsonValue::new_object(),
|
||
cur_question_pre: json::JsonValue::new_object(),
|
||
cur_tag: String::new(),
|
||
cur_content: Vec::<String>::new(),
|
||
cur_scope: DataScope::Global,
|
||
have_new_question: false,
|
||
last_keyword_type: None,
|
||
last_tag: String::new(),
|
||
}
|
||
}
|
||
}
|
||
|
||
impl FromStr for KeywordType {
|
||
type Err = ();
|
||
|
||
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
|
||
use KeywordType::*;
|
||
Ok(match pattern {
|
||
"Мета:" => Ignore,
|
||
"Чемпионат:" | "Пакет:" => Global,
|
||
"Тур:" => QuestionPre,
|
||
"Вопрос " | "Вопрос:" => QuestionStart,
|
||
"Ответ:" | "Зачет:" => QuestionContent,
|
||
_ => CurrentScope,
|
||
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
|
||
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
|
||
})
|
||
}
|
||
}
|
||
|
||
async fn parse_file(
|
||
entry_reader: impl AsyncReadExt + Unpin,
|
||
) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
|
||
let buf_reader = BufReader::new(entry_reader);
|
||
let mut lines = buf_reader.lines();
|
||
|
||
let patterns = vec![
|
||
"Чемпионат:",
|
||
"Пакет:",
|
||
"URL:",
|
||
"Ссылка:",
|
||
"Дата:",
|
||
"Редактор:",
|
||
"Обработан:",
|
||
"Копирайт:",
|
||
"Инфо:",
|
||
"Тема:",
|
||
"Вид:",
|
||
"Тип:",
|
||
"Тур:",
|
||
"Мета:",
|
||
"Вопрос ",
|
||
"Вопрос:",
|
||
"Ответ:",
|
||
"Зачет:",
|
||
"Источник:",
|
||
"Рейтинг:",
|
||
"Автор:",
|
||
"Комментарий:",
|
||
"Комментарии:",
|
||
];
|
||
let mut context = Context::new();
|
||
let mut ctx = &mut context;
|
||
|
||
while let Some(line_r) = lines.next_line().await? {
|
||
let line = line_r.trim();
|
||
if line.is_empty() {
|
||
continue;
|
||
}
|
||
let line = line.to_string();
|
||
|
||
match patterns
|
||
.iter() // find keyword
|
||
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
|
||
{
|
||
Some(pattern) => {
|
||
use KeywordType::*;
|
||
|
||
ctx.last_keyword_type = ctx.cur_keyword_type;
|
||
ctx.last_tag = ctx.cur_tag.clone();
|
||
ctx.cur_keyword_type = Some(pattern.parse().unwrap());
|
||
ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
|
||
|
||
// remember question id
|
||
if let Some(QuestionStart) = ctx.cur_keyword_type {
|
||
ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
|
||
};
|
||
|
||
// apply accumulated content when new keyword found
|
||
match ctx.last_keyword_type {
|
||
Some(Global) => {
|
||
ctx.cur_scope = DataScope::Global;
|
||
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
|
||
}
|
||
Some(QuestionPre) => {
|
||
ctx.cur_scope = DataScope::QuestionPre;
|
||
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
|
||
}
|
||
Some(QuestionStart) => {
|
||
ctx.cur_scope = DataScope::QuestionContent;
|
||
// store prev question before reading new
|
||
if ctx.have_new_question {
|
||
ctx.questions.push_if_valid(ctx.cur_question.clone());
|
||
}
|
||
// prepare to read new question data with cur_question_pre values
|
||
ctx.cur_question = ctx.cur_question_pre.clone();
|
||
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
|
||
ctx.have_new_question = true;
|
||
}
|
||
Some(QuestionContent) => {
|
||
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
|
||
}
|
||
Some(CurrentScope) => {
|
||
// match value to store data
|
||
let scope_data = match ctx.cur_scope {
|
||
DataScope::Global => &mut ctx.data,
|
||
DataScope::QuestionPre => &mut ctx.cur_question_pre,
|
||
DataScope::QuestionContent => &mut ctx.cur_question,
|
||
};
|
||
scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
|
||
}
|
||
_ => (), //None or Ignore
|
||
};
|
||
// clear content
|
||
ctx.cur_content.clear();
|
||
}
|
||
None => {
|
||
// accumulate content if line is not a keyword
|
||
ctx.cur_content.push(line);
|
||
}
|
||
}
|
||
}
|
||
|
||
// finish reading last question
|
||
if ctx.have_new_question && !ctx.cur_content.is_empty() {
|
||
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
|
||
ctx.questions.push_if_valid(ctx.cur_question.clone());
|
||
ctx.have_new_question = false;
|
||
}
|
||
|
||
ctx.data["Вопросы"] = ctx.questions.clone();
|
||
Ok(ctx.data.clone())
|
||
}
|
||
|
||
async fn process_file(
|
||
archive: &ZipFileReader,
|
||
index: usize,
|
||
name: String,
|
||
) -> Result<(), Box<dyn std::error::Error>> {
|
||
let entry_reader = archive.entry_reader(index).await?;
|
||
|
||
// make output filename
|
||
let mut outfilename = PathBuf::from(OUTPUT_PATH);
|
||
outfilename.push(name);
|
||
outfilename.set_extension("json");
|
||
|
||
// save json to file
|
||
let new_data = parse_file(entry_reader).await?;
|
||
let data_str = task::spawn_blocking(move || {
|
||
new_data.pretty(2)
|
||
}).await?;
|
||
|
||
let mut outfile = fs::File::create(outfilename).await?;
|
||
outfile.write_all(data_str.as_bytes()).await?;
|
||
Ok(())
|
||
}
|
||
|
||
#[tokio::main]
|
||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
// open archive just to list files
|
||
let archive = ZipFileReader::new(String::from(BASE_FILENAME)).await?;
|
||
|
||
let source_files: Vec<(usize, String)> = archive
|
||
.entries()
|
||
.iter()
|
||
.enumerate()
|
||
.filter(|item| !item.1.dir())
|
||
.filter(|item| {
|
||
// skip files without "txt" extension
|
||
item.1.name().ends_with(".txt")
|
||
})
|
||
.map(|item| (item.0, item.1.name().to_string()))
|
||
.collect();
|
||
|
||
// check output directory
|
||
match fs::metadata(OUTPUT_PATH).await {
|
||
Err(_) => fs::create_dir_all(OUTPUT_PATH).await?,
|
||
Ok(x) if x.is_file() => return Err("output directory is file!".into()),
|
||
_ => (),
|
||
};
|
||
|
||
println!("processing {} files ...", source_files.len());
|
||
|
||
for i in source_files {
|
||
process_file(&archive, i.0, i.1).await?;
|
||
}
|
||
|
||
println!("done");
|
||
Ok(())
|
||
}
|