Skip to content

Commit e717a40

Browse files
committed
WIP pdf extractor
1 parent 6fc885e commit e717a40

File tree

8 files changed

+460
-23
lines changed

8 files changed

+460
-23
lines changed

Cargo.lock

Lines changed: 377 additions & 22 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
[workspace]
22
members = [
3+
"atomizer",
34
"server",
45
"cli",
56
"lib",

atomizer/Cargo.toml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[package]
2+
description = "Turn files into Atomic Data."
3+
edition = "2021"
4+
name = "atomizer"
5+
version = "0.1.0"
6+
7+
[dependencies]
8+
atomic_lib = {version = "0.34.0", path = "../lib"}
9+
# Keep in sync with the lopdf version that pdf-extract depends on
10+
lopdf = "0.26"
11+
mime_guess = "2.0.4"
12+
pdf-extract = {git = "https://github.com/Hessesian/pdf-extract"}

atomizer/src/file.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
use std::{collections::HashMap, error::Error};
2+
3+
use atomic_lib::resources::PropVals;
4+
5+
use crate::pdf;
6+
7+
/// A file loaded fully into memory, together with its guessed MIME type.
///
/// `Debug` is derived so values can be inspected with `{:?}` and so that
/// `Result<File, _>` supports helpers such as `unwrap_err`.
#[derive(Debug)]
pub struct File {
    /// Path the file was opened from, exactly as passed to `File::open`.
    filename: String,
    /// MIME type string, e.g. `application/pdf`.
    mime: String,
    /// Raw contents of the file.
    bytes: Vec<u8>,
}
12+
13+
impl File {
14+
pub fn open(filename: &str) -> Result<File, Box<dyn Error>> {
15+
let bytes = std::fs::read(filename)?;
16+
let mime = mime_guess::from_path(filename)
17+
.first_or_octet_stream()
18+
.to_string();
19+
20+
Ok(File {
21+
filename: filename.to_string(),
22+
mime,
23+
bytes,
24+
})
25+
}
26+
27+
/// Transforms an input file into an Atomic Data [Resource]
28+
pub fn atomize(&self) -> PropVals {
29+
match self.mime.as_str() {
30+
"application/pdf" => pdf::atomize(self),
31+
_ => HashMap::new(),
32+
}
33+
}
34+
35+
pub fn bytes(&self) -> &[u8] {
36+
&self.bytes
37+
}
38+
}

atomizer/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
mod file;
2+
mod pdf;
3+
4+
use atomic_lib::Resource;

atomizer/src/pdf.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
use atomic_lib::resources::PropVals;
2+
use lopdf::content;
3+
4+
/// Property key under which [`atomize`] stores the text extracted from a PDF.
const content_prop: &str = "content";
5+
6+
/// Extracts the plain text of a PDF and returns it as Atomic Data
/// property/value pairs.
///
/// The result holds a single entry: the extracted text as a
/// `Value::String`, stored under the `content_prop` key.
///
/// # Panics
///
/// Panics if `pdf_extract` cannot extract text from the file's bytes
/// (for example when the input is not a valid PDF).
pub fn atomize(file: &crate::file::File) -> PropVals {
    // `extract_text_mem` returns the text directly, so no separate output
    // buffer or `PlainTextOutput` is needed.
    let text = pdf_extract::extract_text_mem(file.bytes()).unwrap();

    let mut props = PropVals::new();
    props.insert(content_prop.into(), atomic_lib::Value::String(text));
    props
}
14+
15+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::file::File;

    /// Atomizing the bundled demo PDF must yield text that mentions
    /// "Atomic Data".
    #[test]
    fn load_pdf() {
        let pdf = File::open("./test/docs-demo.pdf").unwrap();
        let extracted = pdf.atomize();
        let text = extracted.get(content_prop).unwrap().to_string();
        assert!(text.contains("Atomic Data"));
    }
}

atomizer/test/docs-demo.pdf

194 KB
Binary file not shown.

lib/src/populate.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use crate::{
99
parse::ParseOpts,
1010
schema::{Class, Property},
1111
storelike::Query,
12-
urls, Resource, Storelike, Value,
12+
urls, Storelike, Value,
1313
};
1414

1515
/// Populates a store with some of the most fundamental Properties and Classes needed to bootstrap the whole.

0 commit comments

Comments
 (0)