OCR v0.1
This commit is contained in:
parent
8da9dff2a9
commit
6464bd57c9
|
@ -39,11 +39,13 @@
|
|||
/* [2] Display file
|
||||
=========================================================*/
|
||||
/* (1) Headers */
|
||||
header('Content-Type: text/calendar; charset=utf-8');
|
||||
header('Content-Disposition: attachment; filename='.$this->diplome_id.'.ics');
|
||||
// header('Content-Type: text/calendar; charset=utf-8');
|
||||
// header('Content-Disposition: attachment; filename='.$this->diplome_id.'.ics');
|
||||
|
||||
/* (2) Body */
|
||||
echo "<pre>";
|
||||
readfile($file_name);
|
||||
echo "</pre>";
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
namespace service;
|
||||
|
||||
use \lightdb\core\lightdb;
|
||||
use \service\Tesseract;
|
||||
|
||||
|
||||
class CalendarExtractor{
|
||||
|
@ -197,7 +198,9 @@
|
|||
$this->event[$uid][$time] = [ $this->yToTime($day_n, $y) ];
|
||||
|
||||
// {6} Extract event's image //
|
||||
$this->event[$uid][$time][1] = $this->extractEvent("$time-$uid", [$col_x, $start_y+1], [$col_ind[$day_n+1]-1, $y]);
|
||||
$ev = $this->extractEvent("$time-$uid", [$col_x, $start_y+1], [$col_ind[$day_n+1]-1, $y]);
|
||||
$this->event[$uid][$time][1] = $ev[0];
|
||||
$this->event[$uid][$time][2] = $ev[1];
|
||||
|
||||
|
||||
}
|
||||
|
@ -240,42 +243,72 @@
|
|||
$link = __ROOT__."/tmp/$uid.jpeg";
|
||||
$width = $stop[0]-$start[0];
|
||||
$height = $stop[1]-$start[1];
|
||||
$resize_factor = 2;
|
||||
|
||||
/* [1] Get the right clip
|
||||
=========================================================*/ {
|
||||
|
||||
/* (1) Create clipped copy */
|
||||
$clip = \imagecreatetruecolor($width, $height);
|
||||
$clip = \imagecreatetruecolor($width*$resize_factor, $height*$resize_factor);
|
||||
|
||||
$copied = \imagecopyresized(
|
||||
$clip, // destin img
|
||||
$this->img_res, // source img
|
||||
0, // dest x
|
||||
0, // dest y
|
||||
$start[0], // src x
|
||||
$start[1], // src y
|
||||
$width, // dest w
|
||||
$height, // dest h
|
||||
$width, // src w
|
||||
$height // src h
|
||||
$clip, // destin img
|
||||
$this->img_res, // source img
|
||||
0, // dest x
|
||||
0, // dest y
|
||||
$start[0], // src x
|
||||
$start[1], // src y
|
||||
$width*$resize_factor, // dest w
|
||||
$height*$resize_factor, // dest h
|
||||
$width, // src w
|
||||
$height // src h
|
||||
);
|
||||
|
||||
/* (2) Manage copy error */
|
||||
if( !$copied )
|
||||
throw new \Exception("Cannot clip image");
|
||||
|
||||
/* (3) Save to base64 */
|
||||
/* (3) Save to jpeg */
|
||||
\imagesavealpha($clip, true);
|
||||
|
||||
ob_start();
|
||||
\imagejpeg($clip);
|
||||
$image_data = \base64_encode(ob_get_contents());
|
||||
ob_end_clean();
|
||||
// ob_start();
|
||||
\imagejpeg($clip, $link);
|
||||
// $image_data = \base64_encode(ob_get_contents());
|
||||
// ob_end_clean();
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* [2] Apply Tesseract
|
||||
=========================================================*/ {
|
||||
|
||||
/* (1) Load image with tesseract */
|
||||
try{
|
||||
|
||||
$tesseract = new Tesseract($link);
|
||||
$read = $tesseract->read();
|
||||
|
||||
/* (2) Manage error */
|
||||
}catch(\Exception $e){
|
||||
|
||||
$read = [ 'unkown', 'unknown' ];
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
return $image_data;
|
||||
|
||||
/* [3] End procedure
|
||||
=========================================================*/
|
||||
/* (1) Remove file */
|
||||
unlink($link);
|
||||
|
||||
/* (2) Return read value */
|
||||
return $read;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -389,11 +422,6 @@
|
|||
=========================================================*/
|
||||
foreach($this->event as $event_col=>$events){
|
||||
|
||||
$type = "unknown";
|
||||
|
||||
if( isset($col_assoc[$event_col]) )
|
||||
$type = $col_assoc[$event_col];
|
||||
|
||||
/* (2) For each event of each type
|
||||
---------------------------------------------------------*/
|
||||
foreach($events as $start_t=>$data){
|
||||
|
@ -402,8 +430,9 @@
|
|||
$RAW .= "DTSTART:${start_t}\n";
|
||||
$RAW .= "DTEND:${data[0]}\n";
|
||||
$RAW .= "UID:$start_t-univ-pau-ics\n"; // required
|
||||
$RAW .= "SUMMARY:$type\n";
|
||||
$RAW .= "ATTACH;ENCODING=BASE64;VALUE=BINARY;FILENAME=att.jpg:${data[1]}\n";
|
||||
$RAW .= "SUMMARY:${data[1]}\n";
|
||||
$RAW .= "LOCATION:${data[2]}\n";
|
||||
// $RAW .= "ATTACH;ENCODING=BASE64;VALUE=BINARY;FILENAME=att.jpg:${data[1]}\n";
|
||||
$RAW .= "CATEGORIES: UPPA Calendar\n";
|
||||
$RAW .= "END:VEVENT\n";
|
||||
}
|
||||
|
|
|
@ -6,86 +6,81 @@
|
|||
|
||||
/* [1] Attributes
|
||||
=========================================================*/
|
||||
private $filename = null;
|
||||
private $content = null;
|
||||
private $course = null;
|
||||
private $teacher = null;
|
||||
private $room = null;
|
||||
private $fname = null;
|
||||
|
||||
|
||||
/* (1) Builds a reader bound to an image file
*
* @fname<String>		Path of the image to read
*
* @throws \Exception	If @fname is not a string, or does not point to a readable regular file
*
---------------------------------------------------------*/
public function __construct($fname=null){

	/* [1] Check argument
	=========================================================*/ {

		/* (1) Check type (the `null` default is rejected here: a path is mandatory) */
		if( !is_string($fname) )
			throw new \Exception("Tesseract.__construct(<String>) expected but Tesseract.__construct(<".gettype($fname).">) received");

		/* (2) Check file validity: must be an existing, readable regular file (file_exists() alone would also accept a directory) */
		if( !is_file($fname) || !is_readable($fname) )
			throw new \Exception("Tesseract.__construct(<PATH>) but <PATH> is not valid");

	}


	/* [2] Store as attribute
	=========================================================*/
	$this->fname = $fname;

	// no `return`: the value returned by a constructor is discarded by `new`

}
|
||||
|
||||
|
||||
|
||||
/* (2) Read the image file
*
* Runs the `tesseract` command-line tool on the stored image path and
* extracts a title and a location from the recognised text.
*
* @return read<Array>	[ title, location ] where
*						- title    is the first line of the recognised text
*						- location is the last line (title excluded) starting with
*						           'amphi' or 'S<digits>' (case-insensitive),
*						           or 'unknown' when no line matches
*
* @throws \Exception	If tesseract produced no usable output
*
---------------------------------------------------------*/
public function read(){

	/* [1] Record the text from the image
	=========================================================*/
	/* (1) Build the command
	*      Both paths are shell-escaped so that spaces or shell metacharacters
	*      in them cannot break (or inject into) the command line */
	$command = 'tesseract '.escapeshellarg($this->fname).' stdout -l fra'
		.' --user-words '.escapeshellarg(__ROOT__.'/config/edt.user-words')
		.' -c language_model_penalty_non_freq_dict_word=0.1'
		.' -c language_model_penalty_non_dict_word=.15'
		.' -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
		.' 2>/dev/null';

	/* (2) Process tesseract */
	$read = shell_exec($command);

	/* (3) If empty
	*      shell_exec() gives null (or false) when the command fails or prints
	*      nothing; an all-blank output carries no text either */
	if( !is_string($read) || trim($read) === '' )
		throw new \Exception("Nothing read");


	/* [2] Extract title and location
	=========================================================*/
	/* (1) Split by lines */
	$by_line = explode("\n", $read);

	/* (2) Get first line (title) */
	$title = $by_line[0];

	/* (3) Search the location from the bottom up (index 0 is the title, so it is never a candidate) */
	for( $i = count($by_line)-1 ; $i > 0 ; $i-- ){

		// {1} Check not empty //
		if( empty($by_line[$i]) )
			continue;

		// {2} Matches //
		if( preg_match('@^amphi@i', $by_line[$i]) || // 'amphi A', 'amphi 600 droit'
			preg_match('@^S\d+@i', $by_line[$i])     // 'S10', 'S22'
		)
			return [ $title, $by_line[$i] ];

	}

	/* (4) No line looks like a location */
	return [ $title, 'unknown' ];

}
|
||||
|
||||
/* (3) Return the text read by the Tesseract OCR
|
||||
*
|
||||
* @return $this->content
|
||||
*
|
||||
---------------------------------------------------------*/
|
||||
public function getContent() {
|
||||
return $this->content;
|
||||
}
|
||||
|
||||
/* (4) Return the course read by the Tesseract OCR
|
||||
*
|
||||
* @return $this->course
|
||||
*
|
||||
---------------------------------------------------------*/
|
||||
public function getCourse() {
|
||||
return $this->course;
|
||||
}
|
||||
|
||||
/* (5) Return the teacher read by the Tesseract OCR
|
||||
*
|
||||
* @return $this->teacher
|
||||
*
|
||||
---------------------------------------------------------*/
|
||||
public function getTeacher() {
|
||||
return $this->teacher;
|
||||
}
|
||||
|
||||
/* (6) Return the room read by the Tesseract OCR
|
||||
*
|
||||
* @return $this->room
|
||||
*
|
||||
---------------------------------------------------------*/
|
||||
public function getRoom() {
|
||||
return $this->room;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
intro
|
||||
introduction
|
||||
diversite
|
||||
evolution
|
||||
mathematiques
|
||||
general
|
||||
Gestion
|
||||
gestion
|
||||
Molécules
|
||||
cellulaire
|
||||
biologie
|
||||
physique
|
||||
amphi
|
||||
CTD
|
||||
droit
|
||||
gauche
|
||||
grammaire
|
||||
methodologie
|
||||
Cours
|
|
@ -1,4 +1,5 @@
|
|||
#!/bin/bash

# PHP GD extension: provides the image functions (imagecreatetruecolor, imagecopyresized, imagejpeg)
sudo apt-get install php7.0-gd;

# Restart Apache once after installing the PHP extension
# NOTE(review): presumably needed so PHP loads the new extension — confirm for your setup
sudo service apache2 restart;

# Tesseract OCR command-line tool and its French language data (invoked with `-l fra`)
sudo apt-get install tesseract-ocr tesseract-ocr-fra;
|
Loading…
Reference in New Issue