2017-09-13 15:30:03 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace service;
|
|
|
|
|
|
|
|
class Tesseract{

    /* [1] Attributes
    =========================================================*/
    /* Path of the image to read ; set (and validated) by the constructor */
    private $fname = null;


    /* (1) Constructs the reader for a given image file
    *
    * @fname<String>                Path of the image
    *
    * @return instance<Tesseract>   New Tesseract
    *
    * @throws \Exception            If @fname is not a string, or if nothing
    *                               exists at that path
    *
    ---------------------------------------------------------*/
    public function __construct($fname=null){

        /* [1] Check argument
        =========================================================*/
        /* (1) Check type */
        if( !is_string($fname) )
            throw new \Exception("Tesseract.__construct(<String>) expected but Tesseract.__construct(<".gettype($fname).">) received");

        /* (2) Check file validity */
        if( !file_exists($fname) )
            throw new \Exception("Tesseract.__construct(<PATH>) but <PATH> is not valid");


        /* [2] Store as attribute
        =========================================================*/
        $this->fname = $fname;

    }


    /* (2) Read the image file
    *
    * Runs tesseract on the image, keeps the non-blank output lines, uses the
    * first one as the title, then scans the remaining lines from the last one
    * upwards for a location ('Amphi ...' or 'S...').
    *
    * @return read<Array>   [ title, location ] where location is
    *                       "Amphi <name>", "S. <number>", or null when no
    *                       line matches a known location pattern
    *
    * @throws \Exception    "Nothing read" when the OCR output holds fewer
    *                       than 2 non-blank lines
    *
    ---------------------------------------------------------*/
    public function read(){

        /* [1] Record the text from the image
        =========================================================*/
        $lines = $this->extractLines( $this->runTesseract() );

        /* We need the title plus at least one more line */
        if( count($lines) < 2 )
            throw new \Exception("Nothing read");


        /* [2] Extract data
        =========================================================*/
        /* (1) Get first non-empty line (title) */
        $title = $lines[0];

        /* (2) Search a location from the last line upwards ;
        *      index 0 is skipped on purpose: it is the title */
        for( $i = count($lines)-1 ; $i > 0 ; $i-- ){

            $location = $this->parseLocation($lines[$i]);

            if( !is_null($location) )
                return [ $title, $location ];

        }

        /* (3) No location found */
        return [ $title, null ];

    }


    /* (3) Runs the tesseract binary on the stored image
    *
    * @return output<String|null|false>   Raw shell_exec() result
    *
    ---------------------------------------------------------*/
    private function runTesseract(){

        /* Every path is passed through escapeshellarg() so that spaces or
        *  shell metacharacters in it cannot split or alter the command */
        $command = 'tesseract '.escapeshellarg($this->fname)
            .' stdout -l fra'
            .' --user-words '.escapeshellarg(__ROOT__.'/config/edt.user-words')
            .' -c language_model_penalty_non_freq_dict_word=0.3'
            .' -c language_model_penalty_non_dict_word=.25'
            .' -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
            .' 2>/dev/null';

        return shell_exec($command);

    }


    /* (4) Splits the raw OCR output into its non-blank lines
    *
    * @raw<mixed>           Raw output of the OCR command
    *
    * @return lines<Array>  Lines (left untrimmed) containing at least one
    *                       non-whitespace character ; empty array when @raw
    *                       is not a string
    *
    ---------------------------------------------------------*/
    private function extractLines($raw){

        /* shell_exec() gives no string when the command failed or printed nothing */
        if( !is_string($raw) )
            return [];

        $lines = [];

        foreach( explode("\n", $raw) as $line ){

            /* Strict comparison: empty() would also drop a line holding only "0" */
            if( trim($line) !== '' )
                $lines[] = $line;

        }

        return $lines;

    }


    /* (5) Tries to read a location from one OCR line
    *
    * @line<String>             Line to analyse
    *
    * @return location<String>  "Amphi <name>" or "S. <number>" ;
    *                           null if the line matches neither pattern
    *
    ---------------------------------------------------------*/
    private function parseLocation($line){

        /* Amphi ... (tolerates OCR misreads of the word 'amphi') ;
        *  group 1 is the optional 'ph' part, group 2 is the name itself */
        if( preg_match('@^a[nm]([bp][hln])?[ir] ?(.+)$@i', $line, $m) ) // 'amphi A', 'amphi 600 droit'
            return "Amphi {$m[2]}";

        /* S... OR 5... ('S' is often read as '5') */
        if( preg_match('@^[S5] ?(\d+)@i', $line, $m) ) // 'S10', 'S22'
            return "S. {$m[1]}";

        return null;

    }

}
|