1use lopdf::{Document, Object};
7use regex::Regex;
8use std::path::Path;
9use thiserror::Error;
10
11#[derive(Error, Debug)]
13pub enum ParserError {
14 #[error("PDF parsing error: {0}")]
16 Pdf(#[from] lopdf::Error),
17 #[error("IO error: {0}")]
19 Io(#[from] std::io::Error),
20 #[error("Failed to extract text from PDF")]
22 ExtractionFailed,
23}
24
/// A single timetable cell: one lesson on one day in one period.
///
/// Derives `PartialEq`/`Eq` so that whole lessons can be compared directly
/// (for example with `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Lesson {
    /// Subject text; `"Unknown"` when the cell contained no subject text.
    pub subject: String,
    /// Room code; `"Unknown"` when no room-like text was found in the cell.
    pub room: String,
    /// Teacher name including the title (e.g. "Mr ..."); `"Unknown"` when
    /// no teacher-like text was found in the cell.
    pub teacher: String,
    /// Class/set code; empty when none was found in the cell.
    pub class_code: String,
    /// Zero-based weekday index (0 = Monday ... 4 = Friday).
    pub day_index: usize,
    /// Period index: 0 for the "PD"/"Reg" row, 1-5 for lessons "L1"-"L5".
    pub period_index: usize,
}
41
42#[derive(Debug, Clone)]
44pub struct Week {
45 pub lessons: Vec<Lesson>,
47 pub week_name: String,
49 pub student_name: Option<String>,
51 pub form: Option<String>,
53}
54
/// One decoded text fragment together with the position that was being
/// tracked when its text-showing operator was encountered.
#[derive(Clone, Debug)]
struct TextItem {
    /// Horizontal position tracked for the fragment.
    x: f64,
    /// Vertical position tracked for the fragment.
    y: f64,
    /// Decoded text of the fragment.
    text: String,
}
62
63pub fn parse_pdf(path: &Path) -> Result<Vec<Week>, ParserError> {
99 let doc = Document::load(path)?;
100 let mut weeks = Vec::new();
101
102 for (page_num, page_id) in doc.get_pages() {
103 let text_items = extract_text_from_page(&doc, page_id)?;
104 if text_items.is_empty() {
105 continue;
106 }
107
108 let page_weeks = process_page_text(text_items, page_num);
109 weeks.extend(page_weeks);
110 }
111
112 Ok(weeks)
113}
114
115fn extract_text_from_page(
116 doc: &Document,
117 page_id: (u32, u16),
118) -> Result<Vec<TextItem>, ParserError> {
119 let content_bytes = doc.get_page_content(page_id)?;
120 let content = lopdf::content::Content::decode(&content_bytes)?;
121 let mut text_items = Vec::new();
122
123 let mut current_x = 0.0;
124 let mut current_y = 0.0;
125
126 for operation in content.operations.iter() {
127 match operation.operator.as_str() {
128 "BT" => {
129 current_x = 0.0;
130 current_y = 0.0;
131 }
132 "Tm" => {
133 if operation.operands.len() == 6 {
134 if let (Ok(e), Ok(f)) = (
135 operation.operands[4].as_float(),
136 operation.operands[5].as_float(),
137 ) {
138 current_x = e as f64;
139 current_y = f as f64;
140 }
141 }
142 }
143 "Td" | "TD" => {
144 if operation.operands.len() == 2 {
145 if let (Ok(tx), Ok(ty)) = (
146 operation.operands[0].as_float(),
147 operation.operands[1].as_float(),
148 ) {
149 current_x += tx as f64;
150 current_y += ty as f64;
151 }
152 }
153 }
154 "Tj" => {
155 if let Some(text) = decode_text_object(&operation.operands[0]) {
156 text_items.push(TextItem {
157 x: current_x,
158 y: current_y,
159 text: decode_bromcom_text(&text),
160 });
161 }
162 }
163 "TJ" => {
164 if let Ok(arr) = operation.operands[0].as_array() {
165 let mut full_text = String::new();
166 for item in arr {
167 if let Some(text) = decode_text_object(item) {
168 full_text.push_str(&text);
169 }
170 }
171 text_items.push(TextItem {
172 x: current_x,
173 y: current_y,
174 text: decode_bromcom_text(&full_text),
175 });
176 }
177 }
178 _ => {}
179 }
180 }
181
182 Ok(text_items)
183}
184
185fn decode_text_object(obj: &Object) -> Option<String> {
186 match obj {
187 Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(),
188 _ => None,
189 }
190}
191
/// Decodes shifted text: NUL characters are dropped and every remaining
/// character code is moved up by 29.
///
/// Each character is reduced to its low 8 bits before shifting, and the
/// addition wraps around at 256, so the result always consists of
/// characters in the range U+0000..=U+00FF.
fn decode_bromcom_text(text: &str) -> String {
    const SHIFT: u8 = 29;

    let mut decoded = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch == '\0' {
            continue;
        }
        // Intentional truncation to one byte, matching the wrapping shift.
        let shifted = (ch as u8).wrapping_add(SHIFT);
        decoded.push(char::from(shifted));
    }
    decoded
}
202
203fn process_page_text(items: Vec<TextItem>, _page_num: u32) -> Vec<Week> {
204 let mut weeks = Vec::new();
205
206 let week_regex = Regex::new(r"Week\s+(\d+)").unwrap();
207
208 let mut week_headers: Vec<(&TextItem, u32)> = items
210 .iter()
211 .filter_map(|i| {
212 week_regex
213 .captures(&i.text)
214 .map(|cap| (i, cap[1].parse::<u32>().unwrap_or(0)))
215 })
216 .collect();
217
218 week_headers.sort_by_key(|k| k.1);
220
221 if week_headers.is_empty() {
222 return weeks;
223 }
224
225 let y_increases_down = if week_headers.len() > 1 {
228 week_headers[1].0.y > week_headers[0].0.y
229 } else {
230 let header_y = week_headers[0].0.y;
232 let items_below_y_down = items.iter().filter(|i| i.y > header_y).count();
233 let items_below_y_up = items.iter().filter(|i| i.y < header_y).count();
234 items_below_y_down > items_below_y_up
235 };
236
237 for (i, (header, _week_num)) in week_headers.iter().enumerate() {
238 let start_y = header.y;
239 let end_y = if i + 1 < week_headers.len() {
240 week_headers[i + 1].0.y
241 } else if y_increases_down {
242 f64::MAX
243 } else {
244 0.0
245 };
246
247 let week_items: Vec<&TextItem> = items
264 .iter()
265 .filter(|item| {
266 if y_increases_down {
267 item.y >= start_y && item.y < end_y
268 } else {
269 item.y <= start_y && item.y > end_y
270 }
271 })
272 .collect();
273
274 let week_name = if let Some(mat) = week_regex.find(&header.text) {
276 mat.as_str().to_string()
277 } else {
278 "Unknown Week".to_string()
279 };
280
281 let lessons = parse_week_items(&week_items);
282
283 let (student_name, form) = extract_student_info(&week_items);
285
286 if !lessons.is_empty() {
287 weeks.push(Week {
288 lessons,
289 week_name,
290 student_name,
291 form,
292 });
293 }
294 }
295
296 weeks
297}
298
299fn parse_week_items(items: &[&TextItem]) -> Vec<Lesson> {
300 let mut lessons = Vec::new();
301
302 let days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"];
304 let mut day_cols: Vec<(usize, f64)> = Vec::new(); for (i, day) in days.iter().enumerate() {
307 if let Some(header) = items.iter().find(|item| {
308 item.text.trim().eq_ignore_ascii_case(day)
309 || item.text.to_lowercase().contains(&day.to_lowercase())
310 }) {
311 day_cols.push((i, header.x));
312 }
314 }
315
316 if day_cols.is_empty() {
317 return lessons;
322 }
323
324 let marker_map = [
329 ("PD", 0),
330 ("Reg", 0),
331 ("L1", 1),
332 ("1", 1),
333 ("L2", 2),
334 ("2", 2),
335 ("L3", 3),
336 ("3", 3),
337 ("L4", 4),
338 ("4", 4),
339 ("L4/", 4),
340 ("L5", 5),
341 ("5", 5),
342 ];
343
344 let mut period_rows: Vec<(usize, f64)> = Vec::new(); for (marker_text, period_idx) in marker_map.iter() {
347 let matching_items: Vec<&f64> = items
349 .iter()
350 .filter(|item| {
351 let text = item.text.trim();
352 text == *marker_text ||
353 (marker_text.len() == 2 && text.starts_with(marker_text))
355 })
356 .map(|item| &item.y)
357 .collect();
358
359 if !matching_items.is_empty() {
360 let avg_y: f64 =
362 matching_items.iter().cloned().sum::<f64>() / matching_items.len() as f64;
363 if !period_rows.iter().any(|(idx, _)| idx == period_idx) {
365 period_rows.push((*period_idx, avg_y));
366 }
367 }
368 }
369
370 let teacher_regex_filter = Regex::new(r"^(Mr|Ms|Mrs|Miss)\s+.*$").unwrap();
373 for (day_idx, day_x) in &day_cols {
374 for (period_idx, period_y) in &period_rows {
375 let main_items: Vec<&&TextItem> = items
383 .iter()
384 .filter(|item| {
385 (item.x - day_x).abs() < 45.0 &&
386 (item.y - period_y).abs() < 25.0 &&
387 !days.iter().any(|d| item.text.trim().eq_ignore_ascii_case(d)) &&
389 !marker_map.iter().any(|(m, _)| item.text.trim() == *m)
390 })
391 .collect();
392
393 let teacher_items: Vec<&&TextItem> = items
395 .iter()
396 .filter(|item| {
397 (item.x - day_x).abs() < 45.0 &&
398 item.y > *period_y && (item.y - period_y).abs() < 35.0 &&
400 teacher_regex_filter.is_match(item.text.trim())
401 })
402 .collect();
403
404 let mut cell_items: Vec<&&TextItem> = main_items;
406 cell_items.extend(teacher_items);
407
408 if !cell_items.is_empty() {
409 let lesson = parse_lesson_content(cell_items, *day_idx, *period_idx);
410 lessons.push(lesson);
411 }
412 }
413 }
414
415 lessons
416}
417
418fn parse_lesson_content(items: Vec<&&TextItem>, day_index: usize, period_index: usize) -> Lesson {
419 let mut sorted_items = items.clone();
421 sorted_items.sort_by(|a, b| {
422 a.y.partial_cmp(&b.y)
423 .unwrap_or(std::cmp::Ordering::Equal)
424 .then(a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal))
425 });
426
427 let mut subject_parts: Vec<String> = Vec::new();
428 let mut room = "Unknown".to_string();
429 let mut teacher = "Unknown".to_string();
430 let mut class_code = String::new();
431
432 let room_regex = Regex::new(r"^[A-Z]{2,3}\d+[A-Z]?$").unwrap(); let teacher_regex = Regex::new(r"^(Mr|Ms|Mrs|Miss)\s+.*$").unwrap();
434 let class_regex = Regex::new(r"^\d[A-Z].*$").unwrap(); let days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"];
436
437 let location_indicators = ["DEFAULT", "DS"];
439
440 for item in sorted_items {
441 let text = item.text.trim();
442 if text.is_empty() {
443 continue;
444 }
445
446 if days.iter().any(|d| text.eq_ignore_ascii_case(d)) {
448 continue;
449 }
450
451 if location_indicators.contains(&text) {
453 continue;
454 }
455
456 if room_regex.is_match(text) && room == "Unknown" {
457 room = text.to_string();
459 } else if teacher_regex.is_match(text) {
460 teacher = text.to_string();
461 } else if class_regex.is_match(text) {
462 class_code = text.to_string();
463 } else {
464 subject_parts.push(text.to_string());
466 }
467 }
468
469 let subject = if subject_parts.is_empty() {
471 "Unknown".to_string()
472 } else {
473 subject_parts.join(" ")
474 };
475
476 Lesson {
477 subject,
478 room,
479 teacher,
480 class_code,
481 day_index,
482 period_index,
483 }
484}
485
486fn extract_student_info(items: &[&TextItem]) -> (Option<String>, Option<String>) {
487 let form_in_parens_regex = Regex::new(r"^(.+?)\s*\(([0-9]{2,4}[A-Z0-9]*)\)$").unwrap();
494 let form_code_regex = Regex::new(r"^[0-9]{2,4}[A-Z0-9]*$").unwrap(); if items.is_empty() {
497 return (None, None);
498 }
499
500 let mut sorted = items.to_vec();
502 sorted.sort_by(|a, b| {
503 a.y.partial_cmp(&b.y)
504 .unwrap_or(std::cmp::Ordering::Equal)
505 .then(a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal))
506 });
507
508 let mut student_name = None;
509 let mut form = None;
510
511 let excluded = [
513 "Week",
514 "Term",
515 "Monday",
516 "Tuesday",
517 "Wednesday",
518 "Thursday",
519 "Friday",
520 "Page",
521 "of",
522 "Personal",
523 "Development",
524 "Intervention",
525 ];
526
527 for item in sorted.iter().take(50) {
529 let text = item.text.trim();
530
531 if let Some(cap) = form_in_parens_regex.captures(text) {
532 let name = cap[1].trim();
533 if name.len() > 3 {
534 return (Some(name.to_string()), Some(cap[2].to_string()));
535 }
536 }
537 }
538
539 for (i, item) in sorted.iter().take(50).enumerate() {
541 let text = item.text.trim();
542
543 if text.is_empty() || excluded.iter().any(|&e| text.contains(e)) {
544 continue;
545 }
546
547 if form.is_none() && form_code_regex.is_match(text) {
549 form = Some(text.to_string());
550
551 if student_name.is_none() {
553 for j in (0..i).rev() {
554 let prev = sorted[j];
555 let prev_text = prev.text.trim();
556
557 if (prev.y - item.y).abs() < 5.0
559 && prev_text.len() > 3
560 && prev_text.contains(' ')
561 && !excluded.iter().any(|&e| prev_text.contains(e))
562 && !prev_text.starts_with("Mr")
563 && !prev_text.starts_with("Ms")
564 && !prev_text.starts_with("Mrs")
565 && !prev_text.starts_with("Miss")
566 {
567 student_name = Some(prev_text.to_string());
568 break;
569 }
570 }
571 }
572 }
573
574 if student_name.is_some() && form.is_some() {
576 break;
577 }
578 }
579
580 (student_name, form)
581}
582
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a text item with the given position and text.
    fn text_at(x: f64, y: f64, text: &str) -> TextItem {
        TextItem {
            x,
            y,
            text: text.to_owned(),
        }
    }

    /// Feeds the fixture items to `parse_lesson_content` as one cell.
    fn lesson_from(fixture: &[TextItem], day: usize, period: usize) -> Lesson {
        let borrowed: Vec<&TextItem> = fixture.iter().collect();
        parse_lesson_content(borrowed.iter().collect(), day, period)
    }

    /// Feeds the fixture items to `extract_student_info`.
    fn student_info_from(fixture: &[TextItem]) -> (Option<String>, Option<String>) {
        let borrowed: Vec<&TextItem> = fixture.iter().collect();
        extract_student_info(&borrowed)
    }

    #[test]
    fn parse_lesson_with_room_and_teacher() {
        let cell = [
            text_at(100.0, 100.0, "Personal"),
            text_at(100.0, 110.0, "Development"),
            text_at(100.0, 120.0, "Intervention"),
            text_at(100.0, 130.0, "HU9"),
            text_at(100.0, 145.0, "Ms Test A"),
        ];

        let lesson = lesson_from(&cell, 0, 0);

        assert_eq!(lesson.subject, "Personal Development Intervention");
        assert_eq!(lesson.room, "HU9");
        assert_eq!(lesson.teacher, "Ms Test A");
    }

    #[test]
    fn parse_lesson_detects_classcode() {
        let cell = [
            text_at(100.0, 200.0, "Science"),
            text_at(100.0, 210.0, "8A1/Co"),
            text_at(100.0, 220.0, "Mr Test B"),
        ];

        let lesson = lesson_from(&cell, 1, 2);

        assert_eq!(lesson.subject, "Science");
        assert_eq!(lesson.class_code, "8A1/Co");
        assert_eq!(lesson.teacher, "Mr Test B");
    }

    #[test]
    fn extract_student_info_parens() {
        let (name, form) = student_info_from(&[text_at(10.0, 10.0, "Alex Testington (11XX)")]);

        assert_eq!(name.as_deref(), Some("Alex Testington"));
        assert_eq!(form.as_deref(), Some("11XX"));
    }

    #[test]
    fn extract_student_info_separate() {
        let (name, form) = student_info_from(&[
            text_at(10.0, 10.0, "Alex Testington"),
            text_at(10.0, 12.0, "11XX"),
        ]);

        assert_eq!(name.as_deref(), Some("Alex Testington"));
        assert_eq!(form.as_deref(), Some("11XX"));
    }

    #[test]
    fn extract_student_info_parens_numeric() {
        let (name, form) = student_info_from(&[text_at(10.0, 10.0, "Alex Testington (917)")]);

        assert_eq!(name.as_deref(), Some("Alex Testington"));
        assert_eq!(form.as_deref(), Some("917"));
    }

    #[test]
    fn extract_student_info_separate_alpha_num() {
        let (name, form) = student_info_from(&[
            text_at(10.0, 10.0, "Ann Example"),
            text_at(10.0, 13.0, "10A"),
        ]);

        assert_eq!(name.as_deref(), Some("Ann Example"));
        assert_eq!(form.as_deref(), Some("10A"));
    }
}