371 lines
12 KiB
Rust
371 lines
12 KiB
Rust
extern crate async_zip;
|
||
extern crate json;
|
||
extern crate tokio;
|
||
|
||
use async_zip::read::fs::ZipFileReader;
|
||
use async_zip::write::{EntryOptions, ZipFileWriter};
|
||
use async_zip::Compression;
|
||
use std::path::PathBuf;
|
||
use std::str::FromStr;
|
||
use tokio::fs;
|
||
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader, BufWriter};
|
||
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
|
||
|
||
const INPUT_FILENAME: &str = "baza_utf8.zip";
|
||
const OUTPUT_FILENAME: &str = "json.zip";
|
||
const OUTPUT_COMPRESSION: Compression = Compression::Zstd;
|
||
|
||
/// Classification of a keyword line; decides where the text that follows the
/// keyword is stored.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    /// The keyword's content is not stored.
    Ignore,
    /// The content belongs to the top-level (per-file) object.
    Global,
    /// The content belongs to the fields shared by the following questions.
    QuestionPre,
    /// The keyword opens a new question.
    QuestionStart,
    /// The content belongs to the question currently being read.
    QuestionContent,
    /// The content goes to whichever scope is active at that moment.
    CurrentScope,
}

impl FromStr for KeywordType {
    type Err = ();

    /// Maps a keyword pattern (including its trailing `:` or space) to its type.
    ///
    /// Never fails: every string that is not listed explicitly is classified
    /// as [`KeywordType::CurrentScope`].
    fn from_str(pattern: &str) -> Result<Self, Self::Err> {
        let kind = match pattern {
            "Мета:" => Self::Ignore,
            "Чемпионат:" | "Пакет:" => Self::Global,
            "Тур:" => Self::QuestionPre,
            "Вопрос " | "Вопрос:" => Self::QuestionStart,
            "Ответ:" | "Зачет:" => Self::QuestionContent,
            // Everything else, e.g.:
            // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
            // "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
            _ => Self::CurrentScope,
        };
        Ok(kind)
    }
}
|
||
|
||
/// Destination object for a parsed field.
#[derive(Debug, Clone, Copy)]
enum DataScope {
    /// The top-level output object.
    Global,
    /// The object holding fields collected ahead of a question.
    QuestionPre,
    /// The question that is currently being assembled.
    QuestionContent,
}
|
||
|
||
struct QuestionsParser {
|
||
// global output value
|
||
data: json::JsonValue,
|
||
// temp questions array
|
||
questions: json::JsonValue,
|
||
cur_keyword_type: Option<KeywordType>,
|
||
// temp question value
|
||
cur_question: json::JsonValue,
|
||
// temp value for pre'question fields
|
||
cur_question_pre: json::JsonValue,
|
||
// scope for data fields
|
||
cur_scope: DataScope,
|
||
// curent json key
|
||
cur_tag: String,
|
||
// current json value
|
||
cur_content: Vec<String>,
|
||
// need to push temp question value if true
|
||
have_new_question: bool,
|
||
// prev. keyword type
|
||
last_keyword_type: Option<KeywordType>,
|
||
// prev. json key (used for store acummulated content when new keyword readed)
|
||
last_tag: String,
|
||
}
|
||
|
||
/// Text questions parser
|
||
impl QuestionsParser {
|
||
const PATTERNS: &'static [&'static str] = &[
|
||
"Чемпионат:",
|
||
"Пакет:",
|
||
"URL:",
|
||
"Ссылка:",
|
||
"Дата:",
|
||
"Редактор:",
|
||
"Обработан:",
|
||
"Копирайт:",
|
||
"Инфо:",
|
||
"Тема:",
|
||
"Вид:",
|
||
"Тип:",
|
||
"Тур:",
|
||
"Мета:",
|
||
"Вопрос ",
|
||
"Вопрос:",
|
||
"Ответ:",
|
||
"Зачет:",
|
||
"Источник:",
|
||
"Рейтинг:",
|
||
"Автор:",
|
||
"Комментарий:",
|
||
"Комментарии:",
|
||
];
|
||
|
||
/// create new parser
|
||
pub fn new() -> QuestionsParser {
|
||
QuestionsParser {
|
||
data: json::JsonValue::new_object(),
|
||
questions: json::JsonValue::new_array(),
|
||
cur_keyword_type: None,
|
||
cur_question: json::JsonValue::new_object(),
|
||
cur_question_pre: json::JsonValue::new_object(),
|
||
cur_tag: String::new(),
|
||
cur_content: Vec::<String>::new(),
|
||
cur_scope: DataScope::Global,
|
||
have_new_question: false,
|
||
last_keyword_type: None,
|
||
last_tag: String::new(),
|
||
}
|
||
}
|
||
/// join current content lines
|
||
fn get_current_content(&self) -> String {
|
||
self.cur_content.join("\n")
|
||
}
|
||
/// clear current content
|
||
fn clear_current_content(&mut self) {
|
||
self.cur_content.clear()
|
||
}
|
||
/// add new line to current content
|
||
fn append_to_current_content(&mut self, line: String) {
|
||
self.cur_content.push(line);
|
||
}
|
||
/// check current question have required fields
|
||
fn is_current_question_valid(&self) -> bool {
|
||
self.cur_question.has_key("Вопрос") && self.cur_question.has_key("Ответ")
|
||
}
|
||
/// add current question to parsed array
|
||
fn add_cur_question(&mut self) {
|
||
if self.is_current_question_valid() {
|
||
let current = std::mem::replace(&mut self.cur_question, self.cur_question_pre.clone());
|
||
self.questions.push(current).unwrap()
|
||
}
|
||
}
|
||
/// set current content to last tag(keyword) to data scope
|
||
fn apply_content_to(&mut self, scope: DataScope) {
|
||
let content = self.get_current_content();
|
||
// match value to store data
|
||
let scope_data = match scope {
|
||
DataScope::Global => &mut self.data,
|
||
DataScope::QuestionPre => &mut self.cur_question_pre,
|
||
DataScope::QuestionContent => &mut self.cur_question,
|
||
};
|
||
scope_data[&self.last_tag] = content.into();
|
||
self.clear_current_content();
|
||
}
|
||
/// set current content to last tag(keyword) to current scope
|
||
fn apply_content_to_cur_scope(&mut self) {
|
||
self.apply_content_to(self.cur_scope);
|
||
}
|
||
/// set current scope
|
||
fn set_scope(&mut self, scope: DataScope) {
|
||
self.cur_scope = scope;
|
||
}
|
||
/// set current scope and set current content to last tag(keyword) to data scope
|
||
fn set_scope_and_apply(&mut self, scope: DataScope) {
|
||
self.set_scope(scope);
|
||
self.apply_content_to_cur_scope();
|
||
}
|
||
/// add last question (if have) and start collecting new one
|
||
fn start_new_question(&mut self) {
|
||
// store prev question before reading new
|
||
if self.have_new_question {
|
||
self.add_cur_question();
|
||
}
|
||
self.have_new_question = true;
|
||
}
|
||
|
||
/// check last tag(keyword) and set current content to corresponding data scope
|
||
fn apply_content_for_last_keyword(&mut self) {
|
||
// apply accumulated content when new keyword found
|
||
match self.last_keyword_type {
|
||
Some(KeywordType::Global) => {
|
||
self.set_scope_and_apply(DataScope::Global);
|
||
}
|
||
Some(KeywordType::QuestionPre) => {
|
||
self.set_scope_and_apply(DataScope::QuestionPre);
|
||
}
|
||
Some(KeywordType::QuestionStart) => {
|
||
self.start_new_question();
|
||
self.set_scope_and_apply(DataScope::QuestionContent);
|
||
}
|
||
Some(KeywordType::QuestionContent) => {
|
||
self.apply_content_to(DataScope::QuestionContent);
|
||
}
|
||
Some(KeywordType::CurrentScope) => {
|
||
self.apply_content_to_cur_scope();
|
||
}
|
||
_ => (), //None or Ignore
|
||
};
|
||
}
|
||
/// set current keyword(tag) and type as last, and set new as current
|
||
fn set_new_keyword(&mut self, keyword: &str) {
|
||
self.last_keyword_type =
|
||
std::mem::replace(&mut self.cur_keyword_type, Some(keyword.parse().unwrap()));
|
||
self.last_tag = std::mem::replace(
|
||
&mut self.cur_tag,
|
||
keyword.trim_end().trim_end_matches(':').to_string(),
|
||
);
|
||
}
|
||
/// if line matched keyword
|
||
fn on_keyword_match(&mut self, line: &str, keyword: &str) {
|
||
self.set_new_keyword(keyword);
|
||
|
||
// remember question id
|
||
if let Some(KeywordType::QuestionStart) = self.cur_keyword_type {
|
||
self.cur_question_pre["id"] = line.trim_end().trim_end_matches(':').into();
|
||
};
|
||
|
||
self.apply_content_for_last_keyword();
|
||
}
|
||
|
||
/// parse next line
|
||
pub fn parse_line(&mut self, line: &str) {
|
||
match QuestionsParser::PATTERNS
|
||
.iter() // find keyword
|
||
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
|
||
{
|
||
Some(pattern) => {
|
||
self.on_keyword_match(line, pattern);
|
||
}
|
||
None => {
|
||
self.append_to_current_content(line.to_string());
|
||
}
|
||
}
|
||
}
|
||
|
||
/// finish parsing
|
||
pub fn finish(&mut self) {
|
||
if self.have_new_question && !self.cur_content.is_empty() {
|
||
self.cur_question[&self.cur_tag] = self.get_current_content().into();
|
||
self.add_cur_question();
|
||
self.clear_current_content();
|
||
self.have_new_question = false;
|
||
}
|
||
self.data["Вопросы"] = std::mem::replace(&mut self.questions, json::JsonValue::new_array());
|
||
}
|
||
|
||
/// get parsed data
|
||
pub fn get_parsed(self) -> json::JsonValue {
|
||
self.data
|
||
}
|
||
}
|
||
|
||
/// A named piece of text: the file name together with its textual content.
#[derive(Debug)]
struct FileText {
    /// Name of the file the text belongs to.
    name: String,
    /// The text itself.
    text: String,
}
|
||
|
||
/// Message sent by the text reader to the converter.
#[derive(Debug)]
enum TextReaderMessage {
    /// One line of text read from the current file.
    NextLine(String),
    /// The current file has been read completely; carries the file name.
    EndOfFile(String),
}
|
||
|
||
/// read txt files from zip and convert to json
|
||
async fn zip_text_reader(tx: UnboundedSender<TextReaderMessage>) {
|
||
// open archive just to list files
|
||
let archive = ZipFileReader::new(INPUT_FILENAME).await.expect("open zip");
|
||
|
||
let source_files = archive
|
||
.entries()
|
||
.iter()
|
||
.enumerate()
|
||
.filter(|(_, entry)| !entry.dir())
|
||
.filter(|(_, entry)| {
|
||
// skip files without "txt" extension
|
||
entry.name().ends_with(".txt")
|
||
})
|
||
.map(|(index, entry)| (index, entry.name().to_string()));
|
||
//
|
||
for (index, name) in source_files {
|
||
let entry_reader = archive.entry_reader(index).await.expect("read entry");
|
||
let buf_reader = BufReader::new(entry_reader);
|
||
let mut lines = buf_reader.lines();
|
||
while let Some(line) = lines.next_line().await.expect("next line") {
|
||
tx.send(TextReaderMessage::NextLine(line))
|
||
.expect("send line");
|
||
}
|
||
tx.send(TextReaderMessage::EndOfFile(name))
|
||
.expect("send end");
|
||
}
|
||
|
||
println!("read done ✅");
|
||
}
|
||
|
||
/// convert text questions to json format
|
||
async fn questions_converter(
|
||
mut rx: UnboundedReceiver<TextReaderMessage>,
|
||
tx: UnboundedSender<FileText>,
|
||
) {
|
||
let mut parser = QuestionsParser::new();
|
||
while let Some(msg) = rx.recv().await {
|
||
match msg {
|
||
TextReaderMessage::NextLine(line) => {
|
||
let line = line.trim();
|
||
if line.is_empty() {
|
||
continue;
|
||
}
|
||
parser.parse_line(line);
|
||
}
|
||
TextReaderMessage::EndOfFile(name) => {
|
||
parser.finish();
|
||
let data_json = parser.get_parsed();
|
||
let text = data_json.pretty(2);
|
||
tx.send(FileText { name, text }).expect("send json");
|
||
parser = QuestionsParser::new();
|
||
}
|
||
}
|
||
}
|
||
println!("convert done ✅");
|
||
}
|
||
|
||
/// write json data to zip files
|
||
async fn zip_json_writer(mut rx: UnboundedReceiver<FileText>) {
|
||
let file = fs::File::create(OUTPUT_FILENAME)
|
||
.await
|
||
.expect("create file");
|
||
let mut buf = BufWriter::with_capacity(100 * 1024 * 1024, file);
|
||
let mut writer = ZipFileWriter::new(&mut buf);
|
||
|
||
while let Some(FileText { name, text: data }) = rx.recv().await {
|
||
// make output filename
|
||
let mut outfilename = PathBuf::from(name);
|
||
outfilename.set_extension("json");
|
||
let outfilename = outfilename.to_str().unwrap().to_string();
|
||
let opts = EntryOptions::new(outfilename, OUTPUT_COMPRESSION);
|
||
|
||
// write new zip entry
|
||
writer
|
||
.write_entry_whole(opts, data.as_bytes())
|
||
.await
|
||
.expect("write entry");
|
||
}
|
||
writer.close().await.expect("close writer");
|
||
buf.flush().await.expect("flush buffer");
|
||
|
||
println!("write done ✅");
|
||
}
|
||
|
||
#[tokio::main]
|
||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
// check output filename
|
||
match fs::metadata(OUTPUT_FILENAME).await {
|
||
Ok(x) if x.is_dir() => return Err("output file is a directory!".into()),
|
||
_ => (),
|
||
};
|
||
|
||
let (reader_tx, reader_rx) = mpsc::unbounded_channel::<TextReaderMessage>();
|
||
let (json_tx, json_rx) = mpsc::unbounded_channel::<FileText>();
|
||
|
||
tokio::try_join!(
|
||
tokio::spawn(zip_text_reader(reader_tx)),
|
||
tokio::spawn(questions_converter(reader_rx, json_tx)),
|
||
tokio::spawn(zip_json_writer(json_rx))
|
||
)?;
|
||
|
||
println!("all done ✅");
|
||
Ok(())
|
||
}
|