univ-pau-ics/build/service/Tesseract.php

86 lines
2.2 KiB
PHP
Raw Normal View History

2017-09-13 15:30:03 +00:00
<?php
namespace service;
class Tesseract{

	/* [1] Attributes
	=========================================================*/
	private $fname = null; // path of the image handed to the `tesseract` command


	/* (1) Constructs and initialises a reader bound to an image file
	*
	* @fname<String>			Path of the image
	*
	* @return instance<Tesseract>	New Tesseract
	*
	* @throws \Exception		If @fname is not a string, or if no file exists at that path
	*
	---------------------------------------------------------*/
	public function __construct($fname=null){

		/* [1] Check argument
		=========================================================*/
		/* (1) Check type (the default `null` is rejected here on purpose) */
		if( !is_string($fname) )
			throw new \Exception("Tesseract.__construct(<String>) expected but Tesseract.__construct(<".gettype($fname).">) received");

		/* (2) Check file validity */
		if( !file_exists($fname) )
			throw new \Exception("Tesseract.__construct(<PATH>) but <PATH> is not valid");


		/* [2] Store as attribute
		=========================================================*/
		$this->fname = $fname;

	}


	/* (2) Builds the shell command that runs tesseract on the stored image
	*
	* Every path is passed through escapeshellarg() so that spaces or shell
	* metacharacters in a file name can neither break the command nor inject
	* additional commands.
	*
	* @return command<String>	The full command line (stderr discarded)
	*
	---------------------------------------------------------*/
	private function buildCommand(){

		return "tesseract ".escapeshellarg($this->fname)
			." stdout -l fra"
			." --user-words ".escapeshellarg(__ROOT__."/config/edt.user-words")
			." -c language_model_penalty_non_freq_dict_word=0.1"
			." -c language_model_penalty_non_dict_word=.15"
			." -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
			." 2>/dev/null";

	}


	/* (3) Read the image file
	*
	* @return read<Array>		[ title, last_match ] where `title` is the first line
	*							of the recognised text and `last_match` is the last
	*							non-empty line (title excluded) that starts with
	*							'amphi' or 'S<digits>' (case-insensitive), or the
	*							string 'unknown' when no line matches
	*
	* @throws \Exception		If tesseract produced no multi-line output
	*
	---------------------------------------------------------*/
	public function read(){

		/* [1] Record the text from the image
		=========================================================*/
		/* (1) Process tesseract */
		$read = shell_exec($this->buildCommand());

		/* (2) If empty: shell_exec() yields null/false on failure, and an output
		*      without any line break cannot hold both a title and another line */
		if( !is_string($read) || !preg_match('@\n@', $read) )
			throw new \Exception("Nothing read");

		/* (3) Split by lines */
		$by_line = explode("\n", $read);

		/* (4) Get first line (title) */
		$title = $by_line[0];


		/* [2] Search the last matching line
		=========================================================*/
		/* (1) Walk backwards ; index 0 is the title, hence `$i > 0` */
		for( $i = count($by_line)-1 ; $i > 0 ; $i-- ){

			// {1} Check not empty //
			if( empty($by_line[$i]) )
				continue;

			// {2} Matches //
			if( preg_match('@^amphi@i', $by_line[$i]) ||	// 'amphi A', 'amphi 600 droit'
				preg_match('@^S\d+@i', $by_line[$i])		// 'S10', 'S22'
			)
				return [ $title, $by_line[$i] ];

		}

		/* (2) Nothing matched */
		return [ $title, 'unknown' ];

	}

}