From 960f490bebe8cc0e764a2161bfe4bd39dd2a75a0 Mon Sep 17 00:00:00 2001 From: xdrm-brackets Date: Thu, 14 Sep 2017 16:57:42 +0200 Subject: [PATCH] Improved location read + tesseract --- build/service/CalendarExtractor.php | 11 ++--- build/service/Tesseract.php | 65 +++++++++++++++++++---------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/build/service/CalendarExtractor.php b/build/service/CalendarExtractor.php index 44c9fe2..4d8c7ed 100644 --- a/build/service/CalendarExtractor.php +++ b/build/service/CalendarExtractor.php @@ -266,7 +266,7 @@ /* (2) Manage copy error */ if( !$copied ) - throw new \Exception("Cannot clip image"); + return [ null, null ]; /* (3) Save to jpeg */ \imagesavealpha($clip, true); @@ -292,7 +292,7 @@ /* (2) Manage error */ }catch(\Exception $e){ - $read = [ 'unkown', 'unknown' ]; + $read = [ null, null ]; } @@ -430,9 +430,10 @@ $RAW .= "DTSTART:${start_t}\n"; $RAW .= "DTEND:${data[0]}\n"; $RAW .= "UID:$start_t-univ-pau-ics\n"; // required - $RAW .= "SUMMARY:${data[1]}\n"; - $RAW .= "LOCATION:${data[2]}\n"; - // $RAW .= "ATTACH;ENCODING=BASE64;VALUE=BINARY;FILENAME=att.jpg:${data[1]}\n"; + if( !is_null($data[1]) ) + $RAW .= "SUMMARY:${data[1]}\n"; + if( !is_null($data[2]) ) + $RAW .= "LOCATION:${data[2]}\n"; $RAW .= "CATEGORIES: UPPA Calendar\n"; $RAW .= "END:VEVENT\n"; } diff --git a/build/service/Tesseract.php b/build/service/Tesseract.php index d38d815..d96b643 100755 --- a/build/service/Tesseract.php +++ b/build/service/Tesseract.php @@ -48,37 +48,60 @@ public function read(){ /* [1] Record the text from the image - =========================================================*/ - /* (1) Process tesseract */ - $read = shell_exec("tesseract ".$this->fname." stdout -l fra --user-words ".__ROOT__."/config/edt.user-words -c language_model_penalty_non_freq_dict_word=0.1 -c language_model_penalty_non_dict_word=.15 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 2>/dev/null"); + =========================================================*/ { - /* (2) If empty */ - if( is_null($read) || !preg_match('@\n@g', $read) ) - throw new \Exception("Nothing read"); + /* (1) Process tesseract */ + $read = shell_exec("tesseract ".$this->fname." stdout -l fra --user-words ".__ROOT__."/config/edt.user-words -c language_model_penalty_non_freq_dict_word=0.1 -c language_model_penalty_non_dict_word=.15 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 2>/dev/null"); - /* (3) Split by lines */ - $by_line = explode("\n", $read); + // var_dump($read); - /* (4) Get first line (title) */ - $title = $by_line[0]; + /* (2) If empty */ + if( is_null($read) || !preg_match('@\n@m', $read) ) + throw new \Exception("Nothing read"); - /* (5) Get last non-empty line */ - for( $i = count($by_line)-1 ; $i > 0 ; $i-- ){ + /* (3) Split by lines */ + $by_line = explode("\n", $read); + $lines = []; - // {1} Check not empty // - if( empty($by_line[$i]) ) - continue; + /* (4) Remove empty lines */ + for( $i = 0 ; $i < count($by_line) ; $i++ ){ - // {2} Matches // - if( preg_match('@^amphi@i', $by_line[$i]) || // 'amphi A', 'amphi 600 droit' - preg_match('@^S\d+@i', $by_line[$i]) // 'S10', 'S22' - ) - return [ $title, $by_line[$i] ]; + if( !empty( trim($by_line[$i]) )) + $lines[] = $by_line[$i]; + + } + + /* (5) Manage if empty */ + if( count($lines) < 2 ) + throw new \Exception("Nothing read"); } - return [ $title, 'unknown' ]; + /* [2] Extract data + =========================================================*/ { + + /* (1) Get first non-empty line (title) */ + $title = $lines[0]; + + /* (2) Get last non-empty line */ + for( $i = count($lines)-1 ; $i > 0 ; $i-- ){ + + // Amphi ... // + if( preg_match('@^a[nm][bp][hl]i ?(.+)$@i', $lines[$i], $m) ) // 'amphi A', 'amphi 600 droit' + return [ $title, "Amphi ${m[1]}" ]; + + + // S... OR 5... // + if( preg_match('@^[S|5] ?(\d+)@i', $lines[$i], $m) ) // 'S10', 'S22' + return [ $title, "S. ${m[1]}" ]; + + } + + } + + + return [ $title, null ]; }