Improved tesseract dictionary accuracy + fixed 'ami' to 'amphi'

This commit is contained in:
xdrm-brackets 2017-09-15 08:49:13 +02:00
parent ea77b64c32
commit 5dae2ef258
2 changed files with 14 additions and 5 deletions

View File

@ -51,9 +51,7 @@
=========================================================*/ { =========================================================*/ {
/* (1) Process tesseract */ /* (1) Process tesseract */
$read = shell_exec("tesseract ".$this->fname." stdout -l fra --user-words ".__ROOT__."/config/edt.user-words -c language_model_penalty_non_freq_dict_word=0.1 -c language_model_penalty_non_dict_word=.15 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 2>/dev/null"); $read = shell_exec("tesseract ".$this->fname." stdout -l fra --user-words ".__ROOT__."/config/edt.user-words -c language_model_penalty_non_freq_dict_word=0.3 -c language_model_penalty_non_dict_word=.25 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 2>/dev/null");
// var_dump($read);
/* (2) If empty */ /* (2) If empty */
if( is_null($read) || !preg_match('@\n@m', $read) ) if( is_null($read) || !preg_match('@\n@m', $read) )
@ -88,7 +86,7 @@
for( $i = count($lines)-1 ; $i > 0 ; $i-- ){ for( $i = count($lines)-1 ; $i > 0 ; $i-- ){
// Amphi ... // // Amphi ... //
if( preg_match('@^a[nm][bp][hl]i ?(.+)$@i', $lines[$i], $m) ) // 'amphi A', 'amphi 600 droit' if( preg_match('@^a[nm]([bp][hln])?[ir] ?(.+)$@i', $lines[$i], $m) ) // 'amphi A', 'amphi 600 droit'
return [ $title, "Amphi ${m[1]}" ]; return [ $title, "Amphi ${m[1]}" ];

View File

@ -16,4 +16,15 @@ droit
gauche gauche
grammaire grammaire
methodologie methodologie
Cours Cours
Modélisation
Systèmes
concurrents
synthèse
d'image
3D
LV1
LV2
Génie
logiciel
Interface