1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
# Copyright 1999-2019 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
EAPI=7
MY_PN="tesseract-ocr"
LANGPACKV="4.00"
URI_PREFIX="https://github.com/${MY_PN}/tessdata/raw/${LANGPACKV}/"
inherit autotools toolchain-funcs
DESCRIPTION="An OCR Engine, orginally developed at HP, now open source."
HOMEPAGE="https://github.com/tesseract-ocr"
SRC_URI="https://github.com/${MY_PN}/${PN}/archive/${PV}.tar.gz -> ${P}.tar.gz
${URI_PREFIX}eng.traineddata -> eng.traineddata-${LANGPACKV}
math? ( ${URI_PREFIX}equ.traineddata -> equ.traineddata-${LANGPACKV} )
osd? ( ${URI_PREFIX}osd.traineddata -> osd.traineddata-${LANGPACKV} )"
LICENSE="Apache-2.0"
SLOT="0"
KEYWORDS="~alpha ~amd64 ~arm ~arm64 ~mips ~ppc ~ppc64 ~sparc ~x86"
IUSE="doc jpeg math opencl openmp osd png static-libs tiff training webp"
# List of supported Gentoo linguas and their upstream mapping
# https://github.com/tesseract-ocr/tesseract/wiki/Data-Files
# "old" variants were regrouped in the matching modern locale
LANGUAGES="af:afr am:amh ar:ara as:asm az:aze,aze_cyrl be:bel bn:ben bo:bod bs:bos bg:bul ca:cat cs:ces zh:chi_sim,chi_tra cy:cym da:dan de:deu,frk dz:dzo el:ell,grc en:enm eo:epo et:est eu:eus fa:fas fi:fin fr:fra,frm ga:gle gl:glg gu:guj he:heb hi:hin hr:hrv hu:hun id:ind is:isl it:ita,ita_old ja:jpn kn:kan ka:kat,kat_old kk:kaz km:khm ky:kir ko:kor ku:kur lo:lao la:lat lv:lav lt:lit ml:mal mr:mar mk:mkd ms:msa my:mya ne:nep nl:nld no:nor or:ori pa:pan pl:pol pt:por ro:ron ru:rus sa:san si:sin sk:slk sl:slv es:spa,spa_old sq:sqi sr:srp,srp_latn sw:swa sv:swe syc:syr ta:tam te:tel tg:tgk tl:tgl th:tha tr:tur ug:uig uk:ukr uz:uzb,uzb_cyrl vi:vie"
# Missing matches:
# ceb Cebuano
# chr Cherokee
# hat Haitian; Haitian Creole
# iku Inuktitut
# jav Javanese
# mlt Maltese
# pus Pushto; Pashto
# tir Tigrinya
# urd Urdu
# yid Yiddish
# l10n_en provides the additional data:
# enm English, Middle (1100-1500)
for lang in ${LANGUAGES}; do
gentoo_lang=${lang%:*}
tess_langs=${lang#*:}
for tess_lang in ${tess_langs//,/ }; do
SRC_URI+=" l10n_${gentoo_lang}? ( ${URI_PREFIX}${tess_lang}.traineddata -> ${tess_lang}.traineddata-${LANGPACKV} )"
done
IUSE+=" l10n_${gentoo_lang}"
done
# With opencl USE=tiff is necessary in leptonica
RDEPEND=">=media-libs/leptonica-1.74:=[zlib,tiff?,jpeg?,png?,webp?]
opencl? (
virtual/opencl
media-libs/tiff:0=
media-libs/leptonica:=[tiff]
)
training? (
dev-libs/icu:=
x11-libs/pango:=
x11-libs/cairo:=
)"
DEPEND="${RDEPEND}
app-text/asciidoc
app-text/docbook-xsl-stylesheets
dev-libs/libxslt
doc? ( app-doc/doxygen )"
DOCS=( AUTHORS ChangeLog README.md )
PATCHES=(
"${FILESDIR}"/${P}-manpages.patch
)
pkg_pretend() {
[[ ${MERGE_TYPE} != binary ]] && use openmp && tc-check-openmp
}
pkg_setup() {
[[ ${MERGE_TYPE} != binary ]] && use openmp && tc-check-openmp
}
src_unpack() {
unpack ${P}.tar.gz
for file in ${A}; do
if [[ "${file}" == *traineddata* ]]; then
cp "${DISTDIR}/${file}" "${S}/tessdata/${file%-*}" || die
fi
done
}
src_prepare() {
default
eautoreconf
}
src_configure() {
# scrollview disabled for now, see bug #686944
local myeconfargs=(
--enable-shared
--disable-graphics
$(use_enable opencl)
$(use_enable openmp)
$(use_enable static-libs static)
)
econf "${myeconfargs[@]}"
}
src_compile() {
default
use doc && emake doc
use training && emake training
}
src_install() {
use doc && HTML_DOCS=( doc/html/. )
default
find "${D}" -name '*.la' -type f -delete || die
if use training; then
emake DESTDIR="${D}" training-install
fi
insinto /usr/share/tessdata
doins tessdata/*traineddata* # language files
}
|