# Copyright 2020 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=7

# NOTE(review): check-reqs presumably consumes the CHECKREQS_DISK_* values
# assigned after SRC_URI is assembled -- confirm against the eclass docs.
inherit check-reqs

DESCRIPTION="Data files for NLTK"
HOMEPAGE="https://www.nltk.org/nltk_data/"

# At least some of the data files have poorly documented licenses, so the
# package as a whole is marked all-rights-reserved.
# TODO: create a USE flag for a free-ish subset
LICENSE="all-rights-reserved"
SLOT="0"
KEYWORDS="amd64 x86"
# extra: additionally fetch and unpack the PACKAGES_UNPACK_EXTRA archives.
IUSE="extra"
RESTRICT="bindist mirror"

# src_unpack calls unpack on .zip distfiles.
BDEPEND="app-arch/unzip"

# Data packages, as CATEGORY/NAME, that are kept as .zip archives:
# src_install copies each distfile to /usr/share/nltk_data/CATEGORY/NAME.zip
# and src_unpack never extracts them.
PACKAGES_ZIP=(
	# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=0]' -v @subdir -o "/" -v @id -n - | sort
	corpora/comtrans
	corpora/conll2007
	corpora/jeita
	corpora/knbc
	corpora/machado
	corpora/masc_tagged
	corpora/nombank.1.0
	corpora/panlex_swadesh
	corpora/propbank
	corpora/reuters
	corpora/semcor
	corpora/universal_treebanks_v20
	sentiment/vader_lexicon
	stemmers/snowball_data
)

# Data packages, as CATEGORY/NAME, that are always fetched and that
# src_unpack extracts into ${S}/CATEGORY/.
PACKAGES_UNPACK=(
	# wget -O - https://www.nltk.org/nltk_data/ | xml sel -t -m '//package[@unzip=1]' -v @subdir -o "/" -v @id -n - | sort
	corpora/abc
	corpora/alpino
	corpora/brown
	corpora/cess_cat
	corpora/cess_esp
	corpora/chat80
	corpora/city_database
	corpora/cmudict
	corpora/comparative_sentences
	corpora/conll2000
	corpora/conll2002
	corpora/crubadan
	corpora/dependency_treebank
	corpora/dolch
	corpora/europarl_raw
	corpora/floresta
	corpora/framenet_v15
	corpora/framenet_v17
	corpora/gazetteers
	corpora/genesis
	corpora/gutenberg
	corpora/ieer
	corpora/inaugural
	corpora/indian
	corpora/lin_thesaurus
	corpora/mac_morpho
	corpora/movie_reviews
	corpora/mte_teip5
	corpora/names
	corpora/nonbreaking_prefixes
	corpora/nps_chat
	corpora/omw
	corpora/opinion_lexicon
	corpora/pl196x
	corpora/ppattach
	corpora/product_reviews_1
	corpora/product_reviews_2
	corpora/pros_cons
	corpora/ptb
	corpora/qc
	corpora/rte
	corpora/senseval
	corpora/sentence_polarity
	corpora/sentiwordnet
	corpora/shakespeare
	corpora/sinica_treebank
	corpora/state_union
	corpora/stopwords
	corpora/subjectivity
	corpora/swadesh
	corpora/switchboard
	corpora/timit
	corpora/toolbox
	corpora/treebank
	corpora/twitter_samples
	corpora/udhr
	corpora/udhr2
	corpora/verbnet
	corpora/webtext
	corpora/wordnet
	corpora/wordnet_ic
	corpora/words
	grammars/book_grammars
	grammars/large_grammars
	grammars/sample_grammars
	misc/perluniprops
	models/bllip_wsj_no_aux
	models/moses_sample
	models/wmt15_eval
	models/word2vec_sample
	stemmers/porter_test
	stemmers/rslp
	taggers/averaged_perceptron_tagger
	taggers/averaged_perceptron_tagger_ru
	taggers/universal_tagset
	tokenizers/punkt
)

# Data packages, as CATEGORY/NAME, that are fetched and extracted only
# when USE=extra is enabled (see the "extra? ( ... )" group in SRC_URI
# and the "use extra" check in src_unpack).
# NOTE(review): how this list was generated is not recorded here -- confirm
# its source before updating it.
PACKAGES_UNPACK_EXTRA=(
	chunkers/maxent_ne_chunker
	corpora/biocreative_ppi
	corpora/brown_tei
	corpora/kimmo
	corpora/paradigms
	corpora/pe08
	corpora/pil
	corpora/problem_reports
	corpora/smultron
	corpora/unicode_samples
	corpora/verbnet3
	corpora/ycoe
	grammars/basque_grammars
	grammars/spanish_grammars
	help/tagsets
	misc/mwa_ppdb
	taggers/maxent_treebank_pos_tagger
)

# Append one SRC_URI entry per argument.
#
# Each argument is a CATEGORY/NAME package path. The archive is fetched from
# the nltk_data gh-pages tree and renamed to nltk-NAME-${PV}.zip, where NAME
# is the argument with everything up to the first "/" removed.
# Calling it with no arguments leaves SRC_URI untouched.
add_data() {
	local entry
	for entry in "$@"; do
		# The leading whitespace mirrors the usual multi-line SRC_URI layout.
		SRC_URI+=$'\n\t\t\t'"https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/${entry}.zip"
		SRC_URI+=$'\n\t\t\t\t'"-> nltk-${entry#*/}-${PV}.zip"
	done
}

# Unconditional distfiles: archives installed as-is plus archives that get
# unpacked.
add_data "${PACKAGES_ZIP[@]}" "${PACKAGES_UNPACK[@]}"

# Optional distfiles, wrapped in a USE-conditional group.
SRC_URI+=$'\n\t'"extra? ("
add_data "${PACKAGES_UNPACK_EXTRA[@]}"
SRC_URI+=$'\n\t'")"

# Disk space requirements; the build directory is given the same figure as
# the installed tree.
CHECKREQS_DISK_USR="3G"
CHECKREQS_DISK_BUILD="${CHECKREQS_DISK_USR}"

# Extract every archive in PACKAGES_UNPACK (plus PACKAGES_UNPACK_EXTRA when
# USE=extra is enabled) into ${S}/CATEGORY/. Archives in PACKAGES_ZIP are
# not touched here.
src_unpack() {
	local -a archives=( "${PACKAGES_UNPACK[@]}" )
	if use extra; then
		archives+=( "${PACKAGES_UNPACK_EXTRA[@]}" )
	fi

	local entry category name
	for entry in "${archives[@]}"; do
		category=${entry%/*}
		name=${entry#*/}

		# unpack extracts into the current directory, so change into the
		# category directory first.
		mkdir -p "${S}/${category}" || die
		cd "${S}/${category}" || die
		unpack "nltk-${name}-${PV}.zip"
	done
}

# Install the data tree under /usr/share/nltk_data: move everything in the
# current directory into the image, then add each PACKAGES_ZIP distfile as
# CATEGORY/NAME.zip without extracting it.
src_install() {
	local data_root=/usr/share/nltk_data

	dodir "${data_root}"
	mv * "${ED}${data_root}/" || die

	local entry
	for entry in "${PACKAGES_ZIP[@]}"; do
		# ${entry%/*} is the category, ${entry#*/} the package name.
		insinto "${data_root}/${entry%/*}"
		newins "${DISTDIR}/nltk-${entry#*/}-${PV}.zip" "${entry#*/}.zip"
	done
}