forked from dtuggener/CharSplit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsetup.py
54 lines (49 loc) · 2.19 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#! /usr/bin/env python
#
# Copyright (C) 2018-2022 Dmytro Samchuk
# Package metadata passed to the setup() call at the bottom of this script.

# One-line summary of the distribution.
DESCRIPTION = "CharSplit - An *ngram*-based compound splitter for German"

# Long description; written with Markdown markup (emphasis, bold, inline link).
# The leading backslash suppresses the newline right after the opening quotes.
LONG_DESCRIPTION = """\
Splits a German compound into its body and head, e.g.
> Autobahnraststätte -> Autobahn - Raststätte
Implementation of the method described in the appendix of the thesis:
Tuggener, Don (2016). *Incremental Coreference Resolution for German.* University of Zurich, Faculty of Arts.
**TL;DR**: The method calculates probabilities of ngrams occurring at the beginning, end and in the middle of words and identifies the most likely position for a split.
The method achieves ~95% accuracy for head detection on the [Germanet compound test set](http://www.sfs.uni-tuebingen.de/lsd/compounds.shtml).
A model is provided, trained on 10 Mio German nouns from newspaper text.
"""

# Distribution name.
DISTNAME = 'charsplit'

# Used for both the author and the maintainer fields in setup().
MAINTAINER = 'Dmytro Samchuk'
# NOTE(review): this value looks like an e-mail-obfuscation placeholder rather
# than a real address -- confirm and replace before publishing.
MAINTAINER_EMAIL = '[email protected]'

URL = 'https://github.com/Codealist/CharSplit'
# Free-text license name passed to setup(license=...).
LICENSE = 'GNU GPL-3.0'
DOWNLOAD_URL = 'https://github.com/Codealist/CharSplit/releases'
VERSION = '1.3.5.1'
try:
from setuptools import setup
_has_setuptools = True
except ImportError:
from distutils.core import setup
# Run the packaging command only when this file is executed as a script,
# so that importing it has no side effects.
if __name__ == "__main__":
    setup(
        name=DISTNAME,
        # The same person is recorded as both author and maintainer.
        author=MAINTAINER,
        author_email=MAINTAINER_EMAIL,
        maintainer=MAINTAINER,
        maintainer_email=MAINTAINER_EMAIL,
        description=DESCRIPTION,
        long_description=LONG_DESCRIPTION,
        # LONG_DESCRIPTION uses Markdown markup; declare it so package
        # indexes render it as Markdown instead of the default format.
        long_description_content_type='text/markdown',
        license=LICENSE,
        url=URL,
        version=VERSION,
        download_url=DOWNLOAD_URL,
        install_requires=[],
        packages=['charsplit', 'charsplit.pretrained'],
        classifiers=[
            'Intended Audience :: Science/Research',
            'Programming Language :: Python :: 3.4',
            'Programming Language :: Python :: 3.5',
            'Programming Language :: Python :: 3.6',
            # Classifiers must match the registered trove list exactly, so
            # the license entry is spelled out rather than built from the
            # free-text LICENSE string.
            'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
            'Topic :: Text Processing :: Linguistic',
            'Operating System :: POSIX',
            'Operating System :: Unix',
            'Operating System :: MacOS',
        ],
    )