Path of the image
	*
	* @return instance New Tesseract
	*
	* @throws \Exception If the argument is not a string or does not point to an existing path
	*
	---------------------------------------------------------*/
	public function __construct($fname=null){

		/* [1] Check argument
		=========================================================*/
		/* (1) Check type */
		if( !is_string($fname) )
			throw new \Exception("Tesseract.__construct(<String>) expected but Tesseract.__construct(<".gettype($fname).">) received");

		/* (2) Check file validity */
		if( !file_exists($fname) )
			throw new \Exception("Tesseract.__construct(<String>) but '".$fname."' is not an existing file");


		/* [2] Store as attribute
		=========================================================*/
		$this->fname = $fname;

	}



	/* (2) Read the image file
	*
	* Runs the `tesseract` command-line tool on the stored image (French
	* language, alphanumeric whitelist, project user-words file) and extracts
	* two pieces of text from its output.
	*
	* @return read 	Two-element array [ title, location ]:
	*				- title    is the first non-blank line of the OCR output
	*				- location is "Amphi <x>" or "S. <x>" built from the first
	*				  line (starting at the 3rd non-blank one) that matches one
	*				  of the two patterns below, or the raw 3rd non-blank line
	*				  when none matches
	*
	* @throws \Exception "Nothing read" when tesseract produced no usable
	*                    output or fewer than 3 non-blank lines
	*
	---------------------------------------------------------*/
	public function read(){

		/* [1] Record the text from the image
		=========================================================*/
		/* (1) Build the command
		*      Both paths are shell-escaped: a file name containing spaces or
		*      shell metacharacters must reach tesseract as ONE literal argument.
		*      NOTE(review): escapeshellarg() may strip multibyte characters when
		*      LC_CTYPE is not a UTF-8 locale ; confirm the runtime locale if
		*      non-ASCII file names are expected. */
		$command = "tesseract ".escapeshellarg($this->fname)
			." stdout -l fra"
			." --user-words ".escapeshellarg(__ROOT__."/config/edt.user-words")
			." -c language_model_penalty_non_freq_dict_word=0.3"
			." -c language_model_penalty_non_dict_word=.25"
			." -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
			." 2>/dev/null";

		/* (2) Process tesseract */
		$read = shell_exec($command);

		/* (3) If empty
		*      shell_exec() gives null/false on failure or when nothing was
		*      printed ; an output without any line break is rejected as well */
		if( !is_string($read) || strpos($read, "\n") === false )
			throw new \Exception("Nothing read");

		/* (4) Split by lines and keep the non-blank ones (stored untrimmed) */
		$lines = [];

		foreach( explode("\n", $read) as $line ){

			// Compared against '' on purpose: empty('0') is true and would
			// silently drop a line whose whole content is "0"
			if( trim($line) !== '' )
				$lines[] = $line;

		}

		/* (5) Manage if empty (title + at least 2 more lines are required) */
		if( count($lines) < 3 )
			throw new \Exception("Nothing read");


		/* [2] Extract data
		=========================================================*/
		/* (1) Get first non-empty line (title) */
		$title = $lines[0];

		/* (2) Look for a room/amphitheatre line from the 3rd line onwards */
		foreach( array_slice($lines, 2) as $line ){

			// Amphi ... (pattern tolerates common OCR confusions, e.g. 'anphi', 'amblr')
			// 'amphi A', 'amphi 600 droit'
			if( preg_match('@^a[nm](?:[bp][hln])?[ir] ?(.+)$@i', $line, $m) )
				return [ $title, "Amphi {$m[1]}" ];

			// S... OR 5... ('S' is often recognised as '5')
			// 'S10', 'S22'
			if( preg_match('@^[S5] ?(.+)@i', $line, $m) )
				return [ $title, "S. {$m[1]}" ];

			// Any other line (e.g. 'Cours', 'CTD', 'TD', 'TP') matches neither
			// pattern and is simply skipped

		}

		/* (3) Nothing recognised: fall back to the raw 3rd non-empty line */
		return [ $title, $lines[2] ];

	}

}