2022-09-12 15:59:35 +03:00
|
|
|
|
extern crate async_zip;
|
2019-07-26 12:24:25 +03:00
|
|
|
|
extern crate json;
|
2022-09-12 15:59:35 +03:00
|
|
|
|
extern crate tokio;
|
2019-07-26 12:24:25 +03:00
|
|
|
|
|
2022-09-12 15:59:35 +03:00
|
|
|
|
use async_zip::read::fs::ZipFileReader;
|
2022-09-12 22:34:17 +03:00
|
|
|
|
use async_zip::write::{EntryOptions, ZipFileWriter};
|
|
|
|
|
use async_zip::Compression;
|
2022-09-20 00:16:04 +03:00
|
|
|
|
use std::collections::VecDeque;
|
2019-07-26 12:24:25 +03:00
|
|
|
|
use std::path::PathBuf;
|
2022-08-25 16:12:47 +03:00
|
|
|
|
use std::str::FromStr;
|
2022-09-20 00:12:49 +03:00
|
|
|
|
use std::sync::Arc;
|
2022-09-20 00:16:04 +03:00
|
|
|
|
use tokio::fs;
|
2022-09-20 00:12:49 +03:00
|
|
|
|
use tokio::io::{AsyncBufReadExt, AsyncReadExt, BufReader};
|
|
|
|
|
use tokio::sync::Mutex;
|
2019-07-26 12:24:25 +03:00
|
|
|
|
|
2022-09-12 22:34:17 +03:00
|
|
|
|
// Source zip archive read by `data_reader`; expected to contain UTF-8 .txt files.
const INPUT_FILENAME: &str = "baza_utf8.zip";
// Destination zip archive created by `data_writer`; one .json entry per parsed .txt.
const OUTPUT_FILENAME: &str = "json.zip";
|
2019-07-26 23:04:41 +03:00
|
|
|
|
|
2019-07-26 12:24:25 +03:00
|
|
|
|
// Category of a keyword line in the source text format.
// Produced from the keyword literal via the `FromStr` impl below and used by
// `parse_file` to decide where accumulated content is flushed.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    // keyword whose content is discarded ("Мета:")
    Ignore,
    // keyword whose content goes into the file-level output object ("Чемпионат:", "Пакет:")
    Global,
    // keyword whose content applies to all following questions ("Тур:")
    QuestionPre,
    // keyword that starts a new question ("Вопрос ", "Вопрос:")
    QuestionStart,
    // keyword whose content belongs to the current question ("Ответ:", "Зачет:")
    QuestionContent,
    // any other keyword: content is stored into whatever scope is currently active
    CurrentScope,
}
|
|
|
|
|
|
|
|
|
|
// Destination for the content of a `CurrentScope` keyword:
// which JSON object in `Context` receives the value.
#[derive(Debug, Clone, Copy)]
enum DataScope {
    // top-level output object (`Context::data`)
    Global,
    // template object copied into every new question (`Context::cur_question_pre`)
    QuestionPre,
    // the question currently being read (`Context::cur_question`)
    QuestionContent,
}
|
|
|
|
|
|
|
|
|
|
// Mutable state of the line-oriented parser in `parse_file`.
struct Context {
    // global output value
    data: json::JsonValue,
    // temp questions array
    questions: json::JsonValue,
    // type of the keyword currently being read (None before the first keyword)
    cur_keyword_type: Option<KeywordType>,
    // temp question value
    cur_question: json::JsonValue,
    // temp value for pre'question fields (copied into each new question)
    cur_question_pre: json::JsonValue,
    // scope for data fields
    cur_scope: DataScope,
    // current json key
    cur_tag: String,
    // current json value: accumulated lines, joined with '\n' when flushed
    cur_content: Vec<String>,
    // need to push temp question value if true
    have_new_question: bool,
    // prev. keyword type
    last_keyword_type: Option<KeywordType>,
    // prev. json key (used to store accumulated content when a new keyword is read)
    last_tag: String,
}
|
|
|
|
|
|
2019-07-30 21:15:20 +03:00
|
|
|
|
// check questions before push
trait PushIfValid {
    // true when the value carries both the question text ("Вопрос") and the answer ("Ответ")
    fn is_valid(&self) -> bool;
    // append `value` to this JSON array only when `value.is_valid()` holds
    fn push_if_valid(&mut self, value: json::JsonValue);
}
|
|
|
|
|
|
|
|
|
|
impl PushIfValid for json::JsonValue {
    /// A question is complete when both the question text ("Вопрос")
    /// and the answer ("Ответ") keys are present.
    fn is_valid(&self) -> bool {
        ["Вопрос", "Ответ"].iter().all(|&key| self.has_key(key))
    }

    /// Append `value` to this JSON array; incomplete questions are skipped,
    /// and a failed push is silently ignored (best-effort, as before).
    fn push_if_valid(&mut self, value: json::JsonValue) {
        if !value.is_valid() {
            return;
        }
        let _ = self.push(value);
    }
}
|
|
|
|
|
|
2019-07-27 11:30:40 +03:00
|
|
|
|
impl Context {
|
2019-07-27 20:24:49 +03:00
|
|
|
|
fn new() -> Context {
|
|
|
|
|
Context {
|
|
|
|
|
data: json::JsonValue::new_object(),
|
|
|
|
|
questions: json::JsonValue::new_array(),
|
|
|
|
|
cur_keyword_type: None,
|
|
|
|
|
cur_question: json::JsonValue::new_object(),
|
|
|
|
|
cur_question_pre: json::JsonValue::new_object(),
|
|
|
|
|
cur_tag: String::new(),
|
|
|
|
|
cur_content: Vec::<String>::new(),
|
|
|
|
|
cur_scope: DataScope::Global,
|
|
|
|
|
have_new_question: false,
|
|
|
|
|
last_keyword_type: None,
|
|
|
|
|
last_tag: String::new(),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-25 16:12:47 +03:00
|
|
|
|
impl FromStr for KeywordType {
|
|
|
|
|
type Err = ();
|
|
|
|
|
|
|
|
|
|
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
|
2019-07-27 20:24:49 +03:00
|
|
|
|
use KeywordType::*;
|
2022-08-25 16:12:47 +03:00
|
|
|
|
Ok(match pattern {
|
2019-07-27 20:24:49 +03:00
|
|
|
|
"Мета:" => Ignore,
|
|
|
|
|
"Чемпионат:" | "Пакет:" => Global,
|
|
|
|
|
"Тур:" => QuestionPre,
|
2019-07-28 15:42:25 +03:00
|
|
|
|
"Вопрос " | "Вопрос:" => QuestionStart,
|
2019-07-27 20:24:49 +03:00
|
|
|
|
"Ответ:" | "Зачет:" => QuestionContent,
|
|
|
|
|
_ => CurrentScope,
|
|
|
|
|
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
|
|
|
|
|
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
|
2022-08-25 16:12:47 +03:00
|
|
|
|
})
|
2019-07-27 20:24:49 +03:00
|
|
|
|
}
|
2019-07-27 11:30:40 +03:00
|
|
|
|
}
|
|
|
|
|
|
2022-09-12 15:59:35 +03:00
|
|
|
|
/// Parse one plain-text question file into a JSON object.
///
/// The format is line-oriented: a line starting with a known keyword (and
/// ending with ':') opens a new field; all following non-keyword lines are
/// accumulated as that field's content until the next keyword appears.
/// Content is therefore flushed one keyword *late*, into `last_tag`, which
/// is why the previous keyword type drives the big `match` below.
///
/// Returns the file-level object with all parsed questions under "Вопросы".
async fn parse_file(
    entry_reader: impl AsyncReadExt + Unpin,
) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
    let buf_reader = BufReader::new(entry_reader);
    let mut lines = buf_reader.lines();

    // every keyword literal the format may contain (categorized via FromStr)
    let patterns = vec![
        "Чемпионат:",
        "Пакет:",
        "URL:",
        "Ссылка:",
        "Дата:",
        "Редактор:",
        "Обработан:",
        "Копирайт:",
        "Инфо:",
        "Тема:",
        "Вид:",
        "Тип:",
        "Тур:",
        "Мета:",
        "Вопрос ",
        "Вопрос:",
        "Ответ:",
        "Зачет:",
        "Источник:",
        "Рейтинг:",
        "Автор:",
        "Комментарий:",
        "Комментарии:",
    ];
    let mut context = Context::new();
    let mut ctx = &mut context;

    while let Some(line_r) = lines.next_line().await? {
        let line = line_r.trim();
        if line.is_empty() {
            continue;
        }
        let line = line.to_string();

        match patterns
            .iter() // find keyword
            // ends_with(':') also admits "Вопрос 12:" for the "Вопрос " prefix
            .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
        {
            Some(pattern) => {
                use KeywordType::*;

                // shift current keyword/tag into "last" before reading the new one;
                // the accumulated content belongs to the *previous* keyword
                ctx.last_keyword_type = ctx.cur_keyword_type;
                ctx.last_tag = ctx.cur_tag.clone();
                ctx.cur_keyword_type = Some(pattern.parse().unwrap());
                // tag is the keyword with spaces and ':' stripped, e.g. "Ответ"
                ctx.cur_tag = pattern.replace(' ', "").replace(':', "");

                // remember question id (e.g. "Вопрос 12" from "Вопрос 12:")
                if let Some(QuestionStart) = ctx.cur_keyword_type {
                    ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
                };

                // apply accumulated content when new keyword found
                match ctx.last_keyword_type {
                    Some(Global) => {
                        ctx.cur_scope = DataScope::Global;
                        ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
                    }
                    Some(QuestionPre) => {
                        ctx.cur_scope = DataScope::QuestionPre;
                        ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                    }
                    Some(QuestionStart) => {
                        ctx.cur_scope = DataScope::QuestionContent;
                        // store prev question before reading new
                        if ctx.have_new_question {
                            ctx.questions.push_if_valid(ctx.cur_question.clone());
                        }
                        // prepare to read new question data with cur_question_pre values
                        ctx.cur_question = ctx.cur_question_pre.clone();
                        ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                        ctx.have_new_question = true;
                    }
                    Some(QuestionContent) => {
                        ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                    }
                    Some(CurrentScope) => {
                        // match value to store data: unrecognized keywords land in
                        // whichever scope the last classifying keyword selected
                        let scope_data = match ctx.cur_scope {
                            DataScope::Global => &mut ctx.data,
                            DataScope::QuestionPre => &mut ctx.cur_question_pre,
                            DataScope::QuestionContent => &mut ctx.cur_question,
                        };
                        scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                    }
                    _ => (), //None or Ignore
                };
                // clear content
                ctx.cur_content.clear();
            }
            None => {
                // accumulate content if line is not a keyword
                ctx.cur_content.push(line);
            }
        }
    }

    // finish reading last question (its content was never flushed by a
    // following keyword, so use cur_tag rather than last_tag here)
    if ctx.have_new_question && !ctx.cur_content.is_empty() {
        ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
        ctx.questions.push_if_valid(ctx.cur_question.clone());
        ctx.have_new_question = false;
    }

    ctx.data["Вопросы"] = ctx.questions.clone();
    Ok(ctx.data.clone())
}
|
|
|
|
|
|
2022-09-20 00:12:49 +03:00
|
|
|
|
// One parsed file queued for writing into the output archive.
struct WriteQueueItem {
    // entry name from the source archive (extension replaced with .json on write)
    name: String,
    // pretty-printed JSON content of the parsed file
    data: String,
}
|
|
|
|
|
|
2022-09-20 00:12:49 +03:00
|
|
|
|
// Shared FIFO between `data_reader` (producer) and `data_writer` (consumer);
// a `None` element is the end-of-input sentinel pushed by the reader.
type WriteQueue = Arc<Mutex<VecDeque<Option<WriteQueueItem>>>>;
|
|
|
|
|
|
|
|
|
|
async fn data_reader(queue: WriteQueue) {
|
2019-07-27 20:24:49 +03:00
|
|
|
|
// open archive just to list files
|
2022-09-20 00:16:04 +03:00
|
|
|
|
let archive = ZipFileReader::new(String::from(INPUT_FILENAME))
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
2022-09-12 15:59:35 +03:00
|
|
|
|
|
2022-09-20 00:12:49 +03:00
|
|
|
|
let source_files = archive
|
2022-09-12 15:59:35 +03:00
|
|
|
|
.entries()
|
|
|
|
|
.iter()
|
|
|
|
|
.enumerate()
|
2022-09-12 22:34:17 +03:00
|
|
|
|
.filter(|(_, entry)| !entry.dir())
|
|
|
|
|
.filter(|(_, entry)| {
|
2019-07-27 20:24:49 +03:00
|
|
|
|
// skip files without "txt" extension
|
2022-09-12 22:34:17 +03:00
|
|
|
|
entry.name().ends_with(".txt")
|
2019-07-27 20:24:49 +03:00
|
|
|
|
})
|
2022-09-20 00:12:49 +03:00
|
|
|
|
.map(|(index, entry)| (index, entry.name().to_string()));
|
|
|
|
|
//
|
|
|
|
|
for (index, name) in source_files {
|
|
|
|
|
let entry_reader = archive.entry_reader(index).await.unwrap();
|
|
|
|
|
// parse file to json
|
|
|
|
|
let new_data = parse_file(entry_reader).await.unwrap();
|
|
|
|
|
// dump json to str
|
|
|
|
|
let data = new_data.pretty(2);
|
2022-09-20 00:16:04 +03:00
|
|
|
|
|
2022-09-20 00:12:49 +03:00
|
|
|
|
// add to queue
|
2022-09-20 00:16:04 +03:00
|
|
|
|
queue
|
|
|
|
|
.lock()
|
|
|
|
|
.await
|
|
|
|
|
.push_back(Some(WriteQueueItem { name, data }));
|
2022-09-20 00:12:49 +03:00
|
|
|
|
}
|
|
|
|
|
// mark queue as done for writer to exit loop
|
|
|
|
|
queue.lock().await.push_back(None);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Consumer half: pops parsed files off the shared queue and writes each one
/// as a Deflate-compressed `.json` entry of the output zip archive.
/// Keeps polling until the queue is empty *and* the `None` end marker from
/// the reader has been seen, then finalizes the archive.
async fn data_writer(queue: WriteQueue) {
    let mut file = fs::File::create(OUTPUT_FILENAME).await.unwrap();
    let mut writer = ZipFileWriter::new(&mut file);

    let mut is_reading_done = false;
    loop {
        let mut queue_locked = queue.lock().await;
        if queue_locked.is_empty() {
            // release the lock before sleeping so the reader can push
            drop(queue_locked);
            if is_reading_done {
                break;
            }
            // queue drained but reader still running: back off briefly and poll again
            tokio::time::sleep(tokio::time::Duration::from_micros(50)).await;
            continue;
        }

        let item = queue_locked.pop_front().unwrap();
        // unlock before the (potentially slow) zip write
        drop(queue_locked);
        match item {
            None => {
                // reader finished; loop once more to confirm the queue is empty
                is_reading_done = true;
            }
            Some(WriteQueueItem { name, data }) => {
                // make output filename: same path with a ".json" extension
                let mut outfilename = PathBuf::from(name);
                outfilename.set_extension("json");
                let outfilename = outfilename.to_str().unwrap().to_string();
                let opts = EntryOptions::new(outfilename, Compression::Deflate);

                // write new zip entry
                writer
                    .write_entry_whole(opts, data.as_bytes())
                    .await
                    .unwrap();
            }
        }
    }
    writer.close().await.unwrap();
}
|
2022-08-25 15:50:37 +03:00
|
|
|
|
|
2022-09-20 00:12:49 +03:00
|
|
|
|
#[tokio::main]
|
|
|
|
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
2022-09-12 22:34:17 +03:00
|
|
|
|
// check output filename
|
|
|
|
|
match fs::metadata(OUTPUT_FILENAME).await {
|
|
|
|
|
Ok(x) if x.is_dir() => return Err("output file is a directory!".into()),
|
2022-09-12 15:59:35 +03:00
|
|
|
|
_ => (),
|
2022-08-25 14:45:03 +03:00
|
|
|
|
};
|
2022-08-25 15:50:37 +03:00
|
|
|
|
|
2022-09-20 00:12:49 +03:00
|
|
|
|
let queue: WriteQueue = Arc::new(Mutex::new(VecDeque::with_capacity(40)));
|
|
|
|
|
let queue_r = Arc::clone(&queue);
|
|
|
|
|
let queue_w = Arc::clone(&queue);
|
|
|
|
|
|
|
|
|
|
let res = tokio::try_join!(
|
2022-09-20 00:16:04 +03:00
|
|
|
|
tokio::spawn(async move { data_reader(queue_r).await }),
|
|
|
|
|
tokio::spawn(async move { data_writer(queue_w).await })
|
2022-09-12 22:34:17 +03:00
|
|
|
|
);
|
2022-09-12 15:59:35 +03:00
|
|
|
|
|
2022-09-20 00:12:49 +03:00
|
|
|
|
res?;
|
2022-08-25 15:50:37 +03:00
|
|
|
|
|
2022-09-12 22:34:17 +03:00
|
|
|
|
println!("done ✅");
|
2019-07-27 20:24:49 +03:00
|
|
|
|
Ok(())
|
2019-07-25 12:02:25 +03:00
|
|
|
|
}
|