timetable_core/
parser.rs

//! PDF parsing for Bromcom timetables.
//!
//! This module extracts text with coordinates from Bromcom PDF files and reconstructs
//! the timetable grid structure using heuristics for day/period detection.
5
6use lopdf::{Document, Object};
7use regex::Regex;
8use std::path::Path;
9use thiserror::Error;
10
11/// Errors that can occur during PDF parsing.
12#[derive(Error, Debug)]
13pub enum ParserError {
14    /// PDF document parsing error
15    #[error("PDF parsing error: {0}")]
16    Pdf(#[from] lopdf::Error),
17    /// I/O error reading PDF file
18    #[error("IO error: {0}")]
19    Io(#[from] std::io::Error),
20    /// Failed to extract text from PDF
21    #[error("Failed to extract text from PDF")]
22    ExtractionFailed,
23}
24
/// A single lesson entry in the timetable.
///
/// One `Lesson` corresponds to one populated (day, period) cell of the grid.
/// Fields the parser could not identify hold the fallbacks noted below.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Lesson {
    /// Subject name (e.g., "Mathematics", "French"); `"Unknown"` when the
    /// cell contained no subject text.
    pub subject: String,
    /// Room code (e.g., "MA3", "SC8"); `"Unknown"` when no room code was found.
    pub room: String,
    /// Teacher name (e.g., "Ms Test A", "Mr Test B"); `"Unknown"` when no
    /// teacher was found.
    pub teacher: String,
    /// Class code (e.g., "8A1/Co"); empty when no class code was found.
    pub class_code: String,
    /// Day of week (0-4 for Monday-Friday; weekends not included)
    pub day_index: usize,
    /// Period index (0 = PD/Reg, 1 = L1, 2 = L2, etc.)
    pub period_index: usize,
}
41
42/// A week of timetable data containing multiple lessons.
43#[derive(Debug, Clone)]
44pub struct Week {
45    /// All lessons for this week
46    pub lessons: Vec<Lesson>,
47    /// Week identifier (e.g., "Week 1", "Week 2")
48    pub week_name: String,
49    /// Student name extracted from PDF (e.g., "Alex Testington")
50    pub student_name: Option<String>,
51    /// Form/class code (e.g., "11XX")
52    pub form: Option<String>,
53}
54
/// A piece of page text together with the position it was recorded at.
#[derive(Debug, Clone)]
struct TextItem {
    /// Horizontal coordinate of the text.
    x: f64,
    /// Vertical coordinate of the text.
    y: f64,
    /// The decoded text content.
    text: String,
}
62
63/// Parse a Bromcom PDF timetable file.
64///
65/// Extracts text with coordinates from each page and reconstructs the timetable grid
66/// by detecting week boundaries, day columns, and period rows.
67///
68/// # Arguments
69///
70/// * `path` - Path to the Bromcom PDF file
71///
72/// # Returns
73///
74/// A vector of [`Week`] structures, one for each week found in the PDF.
75///
76/// # Errors
77///
78/// Returns [`ParserError`] if:
79/// - The PDF file cannot be opened or read
80/// - The PDF structure is invalid
81/// - Text extraction fails
82///
83/// # Example
84///
85/// ```no_run
86/// use timetable_core::parser::parse_pdf;
87/// use std::path::Path;
88///
89/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
90/// let weeks = parse_pdf(Path::new("input/timetable.pdf"))?;
91/// println!("Found {} weeks", weeks.len());
92/// for week in weeks {
93///     println!("{}  has {} lessons", week.week_name, week.lessons.len());
94/// }
95/// # Ok(())
96/// # }
97/// ```
98pub fn parse_pdf(path: &Path) -> Result<Vec<Week>, ParserError> {
99    let doc = Document::load(path)?;
100    let mut weeks = Vec::new();
101
102    for (page_num, page_id) in doc.get_pages() {
103        let text_items = extract_text_from_page(&doc, page_id)?;
104        if text_items.is_empty() {
105            continue;
106        }
107
108        let page_weeks = process_page_text(text_items, page_num);
109        weeks.extend(page_weeks);
110    }
111
112    Ok(weeks)
113}
114
115fn extract_text_from_page(
116    doc: &Document,
117    page_id: (u32, u16),
118) -> Result<Vec<TextItem>, ParserError> {
119    let content_bytes = doc.get_page_content(page_id)?;
120    let content = lopdf::content::Content::decode(&content_bytes)?;
121    let mut text_items = Vec::new();
122
123    let mut current_x = 0.0;
124    let mut current_y = 0.0;
125
126    for operation in content.operations.iter() {
127        match operation.operator.as_str() {
128            "BT" => {
129                current_x = 0.0;
130                current_y = 0.0;
131            }
132            "Tm" => {
133                if operation.operands.len() == 6 {
134                    if let (Ok(e), Ok(f)) = (
135                        operation.operands[4].as_float(),
136                        operation.operands[5].as_float(),
137                    ) {
138                        current_x = e as f64;
139                        current_y = f as f64;
140                    }
141                }
142            }
143            "Td" | "TD" => {
144                if operation.operands.len() == 2 {
145                    if let (Ok(tx), Ok(ty)) = (
146                        operation.operands[0].as_float(),
147                        operation.operands[1].as_float(),
148                    ) {
149                        current_x += tx as f64;
150                        current_y += ty as f64;
151                    }
152                }
153            }
154            "Tj" => {
155                if let Some(text) = decode_text_object(&operation.operands[0]) {
156                    text_items.push(TextItem {
157                        x: current_x,
158                        y: current_y,
159                        text: decode_bromcom_text(&text),
160                    });
161                }
162            }
163            "TJ" => {
164                if let Ok(arr) = operation.operands[0].as_array() {
165                    let mut full_text = String::new();
166                    for item in arr {
167                        if let Some(text) = decode_text_object(item) {
168                            full_text.push_str(&text);
169                        }
170                    }
171                    text_items.push(TextItem {
172                        x: current_x,
173                        y: current_y,
174                        text: decode_bromcom_text(&full_text),
175                    });
176                }
177            }
178            _ => {}
179        }
180    }
181
182    Ok(text_items)
183}
184
185fn decode_text_object(obj: &Object) -> Option<String> {
186    match obj {
187        Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(),
188        _ => None,
189    }
190}
191
/// Decodes raw text taken from the PDF into readable characters.
///
/// NUL characters are dropped. Every other character is reduced to its low
/// byte, shifted up by 29 (wrapping on overflow), and the resulting byte is
/// turned back into a character.
fn decode_bromcom_text(text: &str) -> String {
    let mut decoded = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch == '\0' {
            continue;
        }
        let shifted = (ch as u8).wrapping_add(29);
        decoded.push(char::from(shifted));
    }
    decoded
}
202
203fn process_page_text(items: Vec<TextItem>, _page_num: u32) -> Vec<Week> {
204    let mut weeks = Vec::new();
205
206    let week_regex = Regex::new(r"Week\s+(\d+)").unwrap();
207
208    // Collect headers with their week number
209    let mut week_headers: Vec<(&TextItem, u32)> = items
210        .iter()
211        .filter_map(|i| {
212            week_regex
213                .captures(&i.text)
214                .map(|cap| (i, cap[1].parse::<u32>().unwrap_or(0)))
215        })
216        .collect();
217
218    // Sort by Week Number (Ascending) -> This ensures Top-to-Bottom order
219    week_headers.sort_by_key(|k| k.1);
220
221    if week_headers.is_empty() {
222        return weeks;
223    }
224
225    // Determine Y direction
226    // If we have multiple headers, we can check if Y increases or decreases
227    let y_increases_down = if week_headers.len() > 1 {
228        week_headers[1].0.y > week_headers[0].0.y
229    } else {
230        // Fallback: Check if most items are below or above the header
231        let header_y = week_headers[0].0.y;
232        let items_below_y_down = items.iter().filter(|i| i.y > header_y).count();
233        let items_below_y_up = items.iter().filter(|i| i.y < header_y).count();
234        items_below_y_down > items_below_y_up
235    };
236
237    for (i, (header, _week_num)) in week_headers.iter().enumerate() {
238        let start_y = header.y;
239        let end_y = if i + 1 < week_headers.len() {
240            week_headers[i + 1].0.y
241        } else if y_increases_down {
242            f64::MAX
243        } else {
244            0.0
245        };
246
247        // Define range [min, max]
248        // let (min_y, max_y) = if start_y < end_y {
249        //     (start_y, end_y)
250        // } else {
251        //     (end_y, start_y)
252        // };
253
254        // Filter items for this week
255        // We want items strictly between the headers (or header and page edge)
256        // But we must include the header line itself if we want to parse it?
257        // Actually we pass `week_items` to `parse_week_items`.
258        // `parse_week_items` needs the day headers which are usually below the Week header.
259
260        // If Y increases down: Header is at min_y. Content is > min_y and < max_y.
261        // If Y increases up: Header is at max_y. Content is < max_y and > min_y.
262
263        let week_items: Vec<&TextItem> = items
264            .iter()
265            .filter(|item| {
266                if y_increases_down {
267                    item.y >= start_y && item.y < end_y
268                } else {
269                    item.y <= start_y && item.y > end_y
270                }
271            })
272            .collect();
273
274        // Extract "Week X" from header
275        let week_name = if let Some(mat) = week_regex.find(&header.text) {
276            mat.as_str().to_string()
277        } else {
278            "Unknown Week".to_string()
279        };
280
281        let lessons = parse_week_items(&week_items);
282
283        // Try to extract student name and form from the page
284        let (student_name, form) = extract_student_info(&week_items);
285
286        if !lessons.is_empty() {
287            weeks.push(Week {
288                lessons,
289                week_name,
290                student_name,
291                form,
292            });
293        }
294    }
295
296    weeks
297}
298
299fn parse_week_items(items: &[&TextItem]) -> Vec<Lesson> {
300    let mut lessons = Vec::new();
301
302    // 1. Find Day Headers to establish X columns
303    let days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"];
304    let mut day_cols: Vec<(usize, f64)> = Vec::new(); // (day_index, x_center)
305
306    for (i, day) in days.iter().enumerate() {
307        if let Some(header) = items.iter().find(|item| {
308            item.text.trim().eq_ignore_ascii_case(day)
309                || item.text.to_lowercase().contains(&day.to_lowercase())
310        }) {
311            day_cols.push((i, header.x));
312            // println!("  Found Day: {} at X={}", day, header.x);
313        }
314    }
315
316    if day_cols.is_empty() {
317        // println!("  WARNING: No day headers found! Checking first few items:");
318        // for item in items.iter().take(10) {
319        //     println!("    '{}'", item.text);
320        // }
321        return lessons;
322    }
323
324    // 2. Find Period Rows (Y coordinates)
325    // We look for markers and group them by period index.
326    // Markers: L1..L5, PD.
327    // We map them to period indices 0..5 (PD=0, L1=1, L2=2, L3=3, L4=4, L5=5)
328    let marker_map = [
329        ("PD", 0),
330        ("Reg", 0),
331        ("L1", 1),
332        ("1", 1),
333        ("L2", 2),
334        ("2", 2),
335        ("L3", 3),
336        ("3", 3),
337        ("L4", 4),
338        ("4", 4),
339        ("L4/", 4),
340        ("L5", 5),
341        ("5", 5),
342    ];
343
344    let mut period_rows: Vec<(usize, f64)> = Vec::new(); // (period_index, y_center)
345
346    for (marker_text, period_idx) in marker_map.iter() {
347        // Find all items matching this marker
348        let matching_items: Vec<&f64> = items
349            .iter()
350            .filter(|item| {
351                let text = item.text.trim();
352                text == *marker_text ||
353                // Also match if text contains the marker (e.g., "PD" in larger text)
354                (marker_text.len() == 2 && text.starts_with(marker_text))
355            })
356            .map(|item| &item.y)
357            .collect();
358
359        if !matching_items.is_empty() {
360            // Average Y
361            let avg_y: f64 =
362                matching_items.iter().cloned().sum::<f64>() / matching_items.len() as f64;
363            // Only add if we haven't already added this period index
364            if !period_rows.iter().any(|(idx, _)| idx == period_idx) {
365                period_rows.push((*period_idx, avg_y));
366            }
367        }
368    }
369
370    // 3. Iterate Grid (Days x Periods)
371    // Pre-compile teacher regex so it's not recreated inside the inner loop
372    let teacher_regex_filter = Regex::new(r"^(Mr|Ms|Mrs|Miss)\s+.*$").unwrap();
373    for (day_idx, day_x) in &day_cols {
374        for (period_idx, period_y) in &period_rows {
375            // Define cell bounds
376            // We look for items near (day_x, period_y)
377            // For cell content (subject, room, class): Y +/- 25
378            // For teachers: Y tolerance needs to be larger (they're positioned below)
379            // So we'll use a two-pass approach
380
381            // First pass: get main cell items (subject, room, class code)
382            let main_items: Vec<&&TextItem> = items
383                .iter()
384                .filter(|item| {
385                    (item.x - day_x).abs() < 45.0 &&
386                    (item.y - period_y).abs() < 25.0 &&
387                    // Exclude markers and day headers
388                    !days.iter().any(|d| item.text.trim().eq_ignore_ascii_case(d)) &&
389                    !marker_map.iter().any(|(m, _)| item.text.trim() == *m)
390                })
391                .collect();
392
393            // Second pass: find teachers in a slightly wider Y range, but only below the period marker
394            let teacher_items: Vec<&&TextItem> = items
395                .iter()
396                .filter(|item| {
397                    (item.x - day_x).abs() < 45.0 &&
398                    item.y > *period_y && // Only below the period marker
399                    (item.y - period_y).abs() < 35.0 &&
400                    teacher_regex_filter.is_match(item.text.trim())
401                })
402                .collect();
403
404            // Combine both sets
405            let mut cell_items: Vec<&&TextItem> = main_items;
406            cell_items.extend(teacher_items);
407
408            if !cell_items.is_empty() {
409                let lesson = parse_lesson_content(cell_items, *day_idx, *period_idx);
410                lessons.push(lesson);
411            }
412        }
413    }
414
415    lessons
416}
417
418fn parse_lesson_content(items: Vec<&&TextItem>, day_index: usize, period_index: usize) -> Lesson {
419    // Sort by Y (top to bottom), then by X (left to right)
420    let mut sorted_items = items.clone();
421    sorted_items.sort_by(|a, b| {
422        a.y.partial_cmp(&b.y)
423            .unwrap_or(std::cmp::Ordering::Equal)
424            .then(a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal))
425    });
426
427    let mut subject_parts: Vec<String> = Vec::new();
428    let mut room = "Unknown".to_string();
429    let mut teacher = "Unknown".to_string();
430    let mut class_code = String::new();
431
432    let room_regex = Regex::new(r"^[A-Z]{2,3}\d+[A-Z]?$").unwrap(); // e.g. SC8, HU5, MA3 - strict format
433    let teacher_regex = Regex::new(r"^(Mr|Ms|Mrs|Miss)\s+.*$").unwrap();
434    let class_regex = Regex::new(r"^\d[A-Z].*$").unwrap(); // e.g. 8A1/Co
435    let days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday"];
436
437    // Words that are location indicators (not room codes, not part of subject)
438    let location_indicators = ["DEFAULT", "DS"];
439
440    for item in sorted_items {
441        let text = item.text.trim();
442        if text.is_empty() {
443            continue;
444        }
445
446        // Skip day names if they accidentally got included
447        if days.iter().any(|d| text.eq_ignore_ascii_case(d)) {
448            continue;
449        }
450
451        // Skip location indicator words (DEFAULT, DS)
452        if location_indicators.contains(&text) {
453            continue;
454        }
455
456        if room_regex.is_match(text) && room == "Unknown" {
457            // Only capture first room code found, excluding common false positives
458            room = text.to_string();
459        } else if teacher_regex.is_match(text) {
460            teacher = text.to_string();
461        } else if class_regex.is_match(text) {
462            class_code = text.to_string();
463        } else {
464            // Accumulate subject parts
465            subject_parts.push(text.to_string());
466        }
467    }
468
469    // Join subject parts with spaces
470    let subject = if subject_parts.is_empty() {
471        "Unknown".to_string()
472    } else {
473        subject_parts.join(" ")
474    };
475
476    Lesson {
477        subject,
478        room,
479        teacher,
480        class_code,
481        day_index,
482        period_index,
483    }
484}
485
486fn extract_student_info(items: &[&TextItem]) -> (Option<String>, Option<String>) {
487    // Look for student name and form code as separate items
488    // Name is typically "Firstname Lastname" and form is like "11RD" or "917"
489    // Accept common forms such as:
490    //  - Digits only: "917", "1017"
491    //  - Digits with letters: "11RD", "10A"
492    //  - 2-4 digit starting codes with optional letters/digits afterwards
493    let form_in_parens_regex = Regex::new(r"^(.+?)\s*\(([0-9]{2,4}[A-Z0-9]*)\)$").unwrap();
494    let form_code_regex = Regex::new(r"^[0-9]{2,4}[A-Z0-9]*$").unwrap(); // Like 11RD, 917, 1017, 10A
495
496    if items.is_empty() {
497        return (None, None);
498    }
499
500    // Sort by Y position (top to bottom), then X (left to right)
501    let mut sorted = items.to_vec();
502    sorted.sort_by(|a, b| {
503        a.y.partial_cmp(&b.y)
504            .unwrap_or(std::cmp::Ordering::Equal)
505            .then(a.x.partial_cmp(&b.x).unwrap_or(std::cmp::Ordering::Equal))
506    });
507
508    let mut student_name = None;
509    let mut form = None;
510
511    // Excluded words that are never student names
512    let excluded = [
513        "Week",
514        "Term",
515        "Monday",
516        "Tuesday",
517        "Wednesday",
518        "Thursday",
519        "Friday",
520        "Page",
521        "of",
522        "Personal",
523        "Development",
524        "Intervention",
525    ];
526
527    // First pass: look for combined "Name (Form)" pattern
528    for item in sorted.iter().take(50) {
529        let text = item.text.trim();
530
531        if let Some(cap) = form_in_parens_regex.captures(text) {
532            let name = cap[1].trim();
533            if name.len() > 3 {
534                return (Some(name.to_string()), Some(cap[2].to_string()));
535            }
536        }
537    }
538
539    // Second pass: look for separate name and form items near the top
540    for (i, item) in sorted.iter().take(50).enumerate() {
541        let text = item.text.trim();
542
543        if text.is_empty() || excluded.iter().any(|&e| text.contains(e)) {
544            continue;
545        }
546
547        // Check if this is a form code
548        if form.is_none() && form_code_regex.is_match(text) {
549            form = Some(text.to_string());
550
551            // Look for name nearby - check previous items on similar Y coordinate
552            if student_name.is_none() {
553                for j in (0..i).rev() {
554                    let prev = sorted[j];
555                    let prev_text = prev.text.trim();
556
557                    // Check if on similar Y (same line) and looks like a name
558                    if (prev.y - item.y).abs() < 5.0
559                        && prev_text.len() > 3
560                        && prev_text.contains(' ')
561                        && !excluded.iter().any(|&e| prev_text.contains(e))
562                        && !prev_text.starts_with("Mr")
563                        && !prev_text.starts_with("Ms")
564                        && !prev_text.starts_with("Mrs")
565                        && !prev_text.starts_with("Miss")
566                    {
567                        student_name = Some(prev_text.to_string());
568                        break;
569                    }
570                }
571            }
572        }
573
574        // If we found both, stop
575        if student_name.is_some() && form.is_some() {
576            break;
577        }
578    }
579
580    (student_name, form)
581}
582
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds owned text items from `(x, y, text)` rows.
    fn items_from(rows: &[(f64, f64, &str)]) -> Vec<TextItem> {
        rows.iter()
            .map(|&(x, y, text)| TextItem {
                x,
                y,
                text: text.to_string(),
            })
            .collect()
    }

    /// Runs `parse_lesson_content` over the given rows.
    fn lesson_from(rows: &[(f64, f64, &str)], day: usize, period: usize) -> Lesson {
        let owned = items_from(rows);
        let refs: Vec<&TextItem> = owned.iter().collect();
        parse_lesson_content(refs.iter().collect(), day, period)
    }

    /// Runs `extract_student_info` over the given rows.
    fn student_info_from(rows: &[(f64, f64, &str)]) -> (Option<String>, Option<String>) {
        let owned = items_from(rows);
        let refs: Vec<&TextItem> = owned.iter().collect();
        extract_student_info(&refs)
    }

    #[test]
    fn parse_lesson_with_room_and_teacher() {
        let lesson = lesson_from(
            &[
                (100.0, 100.0, "Personal"),
                (100.0, 110.0, "Development"),
                (100.0, 120.0, "Intervention"),
                (100.0, 130.0, "HU9"),
                (100.0, 145.0, "Ms Test A"),
            ],
            0,
            0,
        );

        assert_eq!(lesson.subject, "Personal Development Intervention");
        assert_eq!(lesson.room, "HU9");
        assert_eq!(lesson.teacher, "Ms Test A");
    }

    #[test]
    fn parse_lesson_detects_classcode() {
        let lesson = lesson_from(
            &[
                (100.0, 200.0, "Science"),
                (100.0, 210.0, "8A1/Co"),
                (100.0, 220.0, "Mr Test B"),
            ],
            1,
            2,
        );

        assert_eq!(lesson.subject, "Science");
        assert_eq!(lesson.class_code, "8A1/Co");
        assert_eq!(lesson.teacher, "Mr Test B");
    }

    #[test]
    fn extract_student_info_parens() {
        let (name, form) = student_info_from(&[(10.0, 10.0, "Alex Testington (11XX)")]);

        assert_eq!(name.as_deref(), Some("Alex Testington"));
        assert_eq!(form.as_deref(), Some("11XX"));
    }

    #[test]
    fn extract_student_info_separate() {
        let (name, form) =
            student_info_from(&[(10.0, 10.0, "Alex Testington"), (10.0, 12.0, "11XX")]);

        assert_eq!(name.as_deref(), Some("Alex Testington"));
        assert_eq!(form.as_deref(), Some("11XX"));
    }

    #[test]
    fn extract_student_info_parens_numeric() {
        let (name, form) = student_info_from(&[(10.0, 10.0, "Alex Testington (917)")]);

        assert_eq!(name.as_deref(), Some("Alex Testington"));
        assert_eq!(form.as_deref(), Some("917"));
    }

    #[test]
    fn extract_student_info_separate_alpha_num() {
        let (name, form) = student_info_from(&[(10.0, 10.0, "Ann Example"), (10.0, 13.0, "10A")]);

        assert_eq!(name.as_deref(), Some("Ann Example"));
        assert_eq!(form.as_deref(), Some("10A"));
    }
}