# -*- coding: utf-8 -*-
from setuptools import setup

# Packaging metadata consumed by the setup() call at the bottom of this file.

# The root package namespace ('') is looked up under the "src" directory.
package_dir = {'': 'src'}

# Python packages shipped in the distribution.
packages = ['webarticlecurator']

# Glob patterns of data files to bundle; the '' key applies to every package.
package_data = {'': ['*']}

# Runtime dependencies, each pinned to a compatible version range.
install_requires = [
    'beautifulsoup4>=4.9.0,<5.0.0',
    'chardet>=4.0.0,<5.0.0',
    'lxml>=4.5.0,<5.0.0',
    'mplogger>=1.0.0,<2.0.0',
    'pyyaml>=6.0.0,<7.0.0',
    'ratelimit>=2.2.1,<3.0.0',
    'requests>=2.26.0,<3.0.0',
    'urllib3>=1.26.7,<2.0.0',
    'warcio>=1.7.0,<2.0.0',
    'yamale>=4.0.2,<5.0.0',
]

# Optional dependency groups; both extras currently pull in the same
# requirement, but each gets its own list object.
_newspaper3k_requirement = 'newspaper3k>=0.2.8,<0.3.0'
extras_require = {
    'full': [_newspaper3k_requirement],
    'newspaper3k': [_newspaper3k_requirement],
}

# Command-line launcher: `webarticlecurator` -> webarticlecurator.__main__:main
entry_points = {
    'console_scripts': [
        'webarticlecurator = webarticlecurator.__main__:main',
    ],
}

# Keyword arguments handed to setuptools.setup().
# The package layout, dependency and entry-point values are the module-level
# variables defined earlier in this file.
setup_kwargs = {
    'name': 'webarticlecurator',
    'version': '1.10.1',
    'description': 'A crawler program to download content from portals (news, forums, blogs) and convert it to the desired output format according to the configuration.',
    # The long description is Markdown text inlined as one string literal;
    # its format is declared via 'long_description_content_type' below.
    'long_description': '# Web Article Curator\n\nA crawler program which can be used for downloading the content of portals (news, forums, blogs) and converting it to the desired output format, in accordance with the configuration.\n\n## Requirements\n\n- Python 3.8+\n- (optional for corpus converter if installed as `webarticlecurator[newspaper3k]`) for Newspaper3k, the installation of the following packages must precede the installation of this program: python3-dev libxml2-dev libxslt-dev libjpeg-dev zlib1g-dev libpng12-dev\n\n## Install\n\n### pip\n\n`pip3 install webarticlecurator`\n\nThe following extras can be installed:\n\n- Newspaper3k: `newspaper3k`\n- All the above: `full`\n\nE.g. `pip3 install webarticlecurator[full]`\n\n### Manual\n\n[_Poetry_](https://python-poetry.org/) and (optionally) [_GNU Make_](https://www.gnu.org/software/make/) are required.\n\n1. `git clone https://github.com/ELTE-DH/webarticlecurator.git`\n2. Run `make`\n\nOn Windows or without Make (after cloning the repository):\n\n1. `poetry install --no-root`\n2. `poetry build`\n3. `poetry run pip install --upgrade dist/*.whl` (the correct filename must be specified on Windows)\n\nTo install extras run: `poetry install -E [NAME OF THE EXTRA TO INSTALL]`\n\n## Usage\n\nThe program can be used in multiple ways:\n\n- Crawling (see the options below): `python3 -m webarticlecurator crawl CONFIGURATION [parameters]`\n- Listing URLs in a previously created WARC file: `python3 -m webarticlecurator listurls -s SOURCE_WARC`\n- Validating a previously created WARC file (with [warcio](https://github.com/webrecorder/warcio)): `python3 -m webarticlecurator validate -s SOURCE_WARC`\n- Sampling a previously created WARC file based on a list of URLs (one URL per line, URLs not present in the source archive are downloaded if `--offline` is False. If `--negative` is specified all URLs are sampled except ones from the list): `python3 -m webarticlecurator sample -s SOURCE_WARC -i selected_urls.txt TARGET_WARC --offline True/False --negative True/False`\n- Printing the content of the selected URLs into an empty directory: `python3 -m webarticlecurator cat -s SOURCE_WARC -i selected_urls.txt TARGET_DIR`\n- Downloading a single URL (for testing purposes): `python3 -m webarticlecurator download SOURCE_URL TARGET_WARC`\n- Check URLs in the extracted article urls of an archive warc (for debugging a portal): `python3 -m webarticlecurator checkurls -s SOURCE_WARC -i selected_urls.txt -d TARGET_DIR CONFIGURATION`\n\n# Configuration schema\n\nThe configuration is divided into three levels. On the first two levels, YAML is used for the configuration format with schema checks.\nThe third level of configuration uses Python functions.\n\n## Crawl-level configuration\n\nIt specifies the configuration for the current crawling process with the following fields:\n\n- `schema`: The filename pointing to the schema of the portal to be crawled (second level configuration)\n- `output_corpus` (optional): The desired filename of the output corpus (default: no output corpus)\n- `log_file_archive` (optional): The log file for the archive crawler (default: log is not saved)\n- `log_file_articles` (optional): The log file for the article crawler (default: log is not saved)\n- `new_problematic_archive_urls` (optional): The file where the problematic archive URLs should be written (default: URLs are not saved)\n- `new_problematic_urls` (optional): The file where the problematic article URLs should be written (default: URLs are not saved)\n- `new_good_archive_urls` (optional): The file where the newly downloaded, good archive URLs should be written (default: URLs are not saved)\n- `new_good_urls` (optional): The file where the newly downloaded, good article URLs should be written (default: URLs are not saved)\n- `date_from` (optional): The inclusive minimal date of the required articles in ISO 8601 format, YYYY-MM-DD (default: from the schema of the portal if applies)\n- `date_until` (optional): The inclusive maximal date of the required articles in ISO 8601 format, YYYY-MM-DD (default: yesterday if applies)\n\n## Site schemas\n\nThe following parameters must be filled in the case of every portal:\n\n- `site_name`: A friendly name for the portal\n- `new_article_url_threshold`: The minimal amount of new URLs required on an archive page (e.g. the archive pages slid due to new articles in case of an active portal)\n\nPython functions:\n\n- `portal_specific_exctractor_functions_file`: The filename pointing to the python file which contains the required extractor functions\n- `extract_next_page_url_fun` (it can be NULL): The name of the function to be imported from the `portal_specific_exctractor_functions_file` to extract the "next page URL"\n- `extract_article_urls_from_page_fun`: The name of the function to be imported from the `portal_specific_exctractor_functions_file` to extract the article URLs from the archive page\n- `extract_article_urls_from_page_plus_fun`: The name of the function to be imported from the `portal_specific_exctractor_functions_file` to extract the article URLs from the archive page with metadata from the portal\'s archive (for `checkurls` mode)\n- `next_page_of_article_fun` (it can be NULL): The name of the function to be imported from the `portal_specific_exctractor_functions_file` if there are multipage articles. This function extracts the "next page URL" for the rest of the pages in a multipage article. (It must be used with `MultiPageArticleConverter` or similar as `corpus_converter` to work!)\n- `corpus_converter_file`: The filename pointing to the python file which contains the required corpus extractor class\n- `corpus_converter`: The name of the class to be imported from the `corpus_converter_file`. The default is to do nothing (`dummy-converter`).\n\nBoolean features to describe the site:\n\n- `next_url_by_pagenum`: Use page numbering for pagination of the archive, e.g. infinite scrolling (false means no pages or pages handled by `extract_next_page_url_fun`)\n- `infinite_scrolling`: The crawler increment page numbers until the first page with zero article urls\n- `archive_page_urls_by_date`: Group the archive page URLs by their dates\n- `go_reverse_in_archive`: Go reverse (backwards in time) in the archive by date (when the earliest article is not known)\n- `verify_request`: Suppress complaining about invalid HTTPS certificates\n- `ignore_archive_cache`: Ignore archive cache (for those portals which only use pagination)\n- `stop_on_empty_archive_page` (optional): Stop archive crawling if no articles extracted from page (default: false)\n- `stop_on_taboo_set` (optional): Stop archive crawling when one or more URLs in `taboo_article_urls` list is specified (default: false)\n\nColumn definitions:\n\nIn the `columns` dictionary, the following features can be set for each column (defined with a friendly name):\n\n- `date_first_article` (optional): The date of the first article on the portal/column (also used for archive crawling)\n- `date_last_article` (optional): The date of the last article on the portal/column (also used for archive crawling)\n- `initial_pagenum` (optional): The initial page number which could be omitted (an empty string if not set, else it should be `min_pagenum` - 1)\n- `min_pagenum` (optional): The "first" page number to increment (e.g. initial_pagenum + 1 = min_pagenum <= max_pagenum if not a single page column where only initial_pagenum must be specified, min_pagenum and max_pagenum must be omitted)\n- `max_pagenum` (optional): The upper bound of the number of pages for safety or for stop criteria\n- `archive_url_format`: The schema for the archive URL of the portal/column (supply `#year`, `#month`, `#day` and\n `#next-year`, `#next-month`, `#next-day` tags which have to be replaced with the actual field of date, and\n `#pagenum` with the actual page number during the crawling)\n- `max_tries` (optional): How many times should we try to download an archive URL? (default: 1)\n\nNote: One can iterate the archive by months or years by omitting `#day` (`#next-day`) or `#month` (`#next-month`) \n\n## Site-specific extractors\n\nThere are maximum three types of extractors to be included for each portal.\nSee the examples in the `configs` directory for further information and `DummyConverter` for the converter API.\n\n## Command line parameters\n\nThe first two command-line parameters should be `crawl` and the filename pointing to the configuration file of the current crawl. These can be followed by some optional parameters:\n\n- `--old-archive-warc OLD_ARCHIVE_WARC`: Existing WARC archives of the portal\'s archive (use them as cache)\n- `--archive-warc ARCHIVE_WARC`: New WARC archive of the portal\'s archive (copy all cached pages if `--old-archive-warc` is specified)\n- `--old-articles-warc OLD_ARTICLES_WARC`: Existing WARC archives of the portal\'s archive (use them as cache)\n- `--articles-warc ARTICLES_WARC`: New WARC archive of the portal\'s archive (copy all cached pages if `--old-archive-warc` is specified)\n- `--archive-just-cache [ARCHIVE_JUST_CACHE]`: Use only cached pages (no output WARC file): `--old-archive-warc` must be specified!\n- `--articles-just-cache [ARTICLES_JUST_CACHE]`: Use only cached pages (no output WARC file): `--old-articles-warc` must be specified!\n- `--debug-news-archive [DEBUG_NEWS_ARCHIVE]`: Set DEBUG logging on NewsArchiveCrawler and print the number of extracted URLs per page\n- `--strict [STRICT]`: Set strict-mode in WARCReader to enable validation\n- `--crawler-name CRAWLER_NAME`: The name of the crawler for the WARC info record\n- `--user-agent USER_AGENT`: The User-Agent string to use in headers while downloading\n- `--no-overwrite-warc`: Do not overwrite `--{archive,articles}-warc` if needed\n- `--cumulative-error-threshold CUMULATIVE_ERROR_THRESHOLD`: The sum of download errors before giving up\n- `--known-bad-urls KNOWN_BAD_URLS`: Known bad URLs to be excluded from download (filename, one URL per line)\n- `--known-article-urls KNOWN_ARTICLE_URLS`: Known article URLs to mark the desired end of the archive (filename, one URL per line)\n- `--max-no-of-calls-in-period MAX_NO_OF_CALLS_IN_PERIOD`: Limit the number of HTTP requests per period\n- `--limit-period LIMIT_PERIOD`: Limit the period of HTTP requests (in seconds), see also `--max-no-of-calls-in-period`\n- `--proxy-url PROXY_URL`: SOCKS Proxy URL to use, e.g. socks5h://localhost:9050\n- `--allow-cookies [ALLOW_COOKIES]`: Allow session cookies\n- `--stay-offline [STAY_OFFLINE]`: Do not download but write output WARC (see `--just-cache` when no output WARC file is needed)\n- `--archive`: Crawl only the portal\'s archive\n- `--articles`: Crawl articles (and optionally use cached WARC for the portal\'s archive), DEFAULT behaviour\n- `--corpus`: Use `--old-articles-warc` to create a corpus (no crawling, equals to `--archive-just-cache` and `--articles-just-cache`)\n\n# Licence\n\nThis project is licensed under the terms of the GNU LGPL 3.0 license.\n\n# Acknowledgement\n\nThis software is the direct continuation of [corpusbuilder](https://github.com/ppke-nlpg/corpusbuilder).\nThe authors gratefully acknowledge the groundbreaking work of all pioneers who inspired this program.\nSpecial thanks go to Tibor Kákonyi who put the initial implementation under the GNU LGPL 3.0 license and allowing us to continue his work.\n\n# References\n\nThe DOI of the code is: https://doi.org/10.5281/zenodo.3755323\n\nIf you use this program, please cite the following paper:\n\n[__The ELTE.DH Pilot Corpus – Creating a Handcrafted Gigaword Web Corpus with Metadata__ Balázs Indig, Árpád Knap, Zsófia Sárközi-Lindner, Mária Timári, Gábor Palkó _In the Proceedings of the 12th Web as Corpus Workshop (WAC XII)_, pages 33-41 Marseille, France 2020](https://www.aclweb.org/anthology/2020.wac-1.5.pdf)\n\n```\n@inproceedings{indig-etal-2020-elte,\n    title = "The {ELTE}.{DH} Pilot Corpus {--} Creating a Handcrafted {G}igaword Web Corpus with Metadata",\n    author = {Indig, Bal{\\\'a}zs  and\n      Knap, {\\\'A}rp{\\\'a}d  and\n      S{\\\'a}rk{\\"o}zi-Lindner, Zs{\\\'o}fia  and\n      Tim{\\\'a}ri, M{\\\'a}ria  and\n      Palk{\\\'o}, G{\\\'a}bor},\n    booktitle = "Proceedings of the 12th Web as Corpus Workshop",\n    month = may,\n    year = "2020",\n    address = "Marseille, France",\n    publisher = "European Language Resources Association",\n    url = "https://www.aclweb.org/anthology/2020.wac-1.5",\n    pages = "33--41",\n    abstract = "In this article, we present the method we used to create a middle-sized corpus using targeted web crawling. Our corpus contains news portal articles along with their metadata, that can be useful for diverse audiences, ranging from digital humanists to NLP users. The method presented in this paper applies rule-based components that allow the curation of the text and the metadata content. The curated data can thereon serve as a reference for various tasks and measurements. We designed our workflow to encourage modification and customisation. Our concept can also be applied to other genres of portals by using the discovered patterns in the architecture of the portals. We found that for a systematic creation or extension of a similar corpus, our method provides superior accuracy and ease of use compared to The Wayback Machine, while requiring minimal manpower and computational resources. Reproducing the corpus is possible if changes are introduced to the text-extraction process. The standard TEI format and Schema.org encoded metadata is used for the output format, but we stress that placing the corpus in a digital repository system is recommended in order to be able to define semantic relations between the segments and to add rich annotation.",\n    language = "English",\n    ISBN = "979-10-95546-68-9",\n}\n```\n',
    # Without an explicit content type, package indexes treat the long
    # description as reStructuredText and the Markdown above renders badly.
    'long_description_content_type': 'text/markdown',
    'author': 'dlazesz',
    'author_email': None,
    'maintainer': None,
    'maintainer_email': None,
    'url': 'https://github.com/ELTE-DH/WebArticleCurator',
    'package_dir': package_dir,
    'packages': packages,
    'package_data': package_data,
    'install_requires': install_requires,
    'extras_require': extras_require,
    'entry_points': entry_points,
    'python_requires': '>=3.8,<4.0',
}


# Module-level call: runs whenever this script is executed by a build tool.
setup(**setup_kwargs)
