From e6842b3a5b851ad4e13534040ada641c2dc13c06 Mon Sep 17 00:00:00 2001
From: Gallenkamp <bax1489@uni-hamburg.de>
Date: Fri, 10 Jan 2020 16:36:27 +0100
Subject: [PATCH] Split scraping categories, add per-category export pages, extend method and tool seed data
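
Split the single "scraping" software category into five more specific
categories and render a dedicated wiki page for every software category.
Renumber the method hierarchy to make room for new text-extraction and
tracking methods, and add several document-, platform- and browser-based
scraping tools. The per-category pages are written alongside the existing
SoftwareToolsList.asciidoc whenever the index view runs.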
---
app.py | 746 ++++++++++++++-----
templates/export/softwares.jinja2 | 40 +-
templates/export/softwares_categories.jinja2 | 32 +
3 files changed, 592 insertions(+), 226 deletions(-)
create mode 100644 templates/export/softwares_categories.jinja2
diff --git a/app.py b/app.py
index d6dcfea..3286eec 100644
--- a/app.py
+++ b/app.py
@@ -223,27 +223,34 @@ def index():
standalonesoftwareincategory = []
serversoftwareincategory = []
libsoftwareincategory = []
- software_categorys = SoftwareCategory.query.all()
-
- # standalone software
- for software_category in software_categorys:
- softwares = Software.query.filter((Software.softwarecategory_id == software_category.id) & (Software.architecture == "stand-alone application"))
- standalonesoftwareincategory.append((software_category, softwares))
- # server software
- for software_category in software_categorys:
- softwares = Software.query.filter((Software.softwarecategory_id == software_category.id) & (Software.architecture == "server application"))
- serversoftwareincategory.append((software_category, softwares))
- # other software
- for software_category in software_categorys:
- softwares_r = Software.query.filter((Software.softwarecategory_id == software_category.id) & ((Software.architecture != "stand-alone application") & (Software.architecture != "server application")) & Software.programminglanguages.any(Programminglanguage.name=="R"))
- softwares_py = Software.query.filter((Software.softwarecategory_id == software_category.id) & ((Software.architecture != "stand-alone application") & (Software.architecture != "server application")) & Software.programminglanguages.any(name="Python"))
- softwares_misc = Software.query.filter((Software.softwarecategory_id == software_category.id) & ((Software.architecture != "stand-alone application") & (Software.architecture != "server application")) & (~ (Software.programminglanguages.any(name="Python") | Software.programminglanguages.any(name="R"))))
- libsoftwareincategory.append((software_category, softwares_r, softwares_py,softwares_misc))
+ software_categories = SoftwareCategory.query.all()
- # Generate tools overview page
+ # Generate categories overview page
with open('templates/export/softwares.jinja2', "r", encoding="utf-8") as file_:
template = Template(file_.read())
- template.stream(standalonesoftwareincategory=standalonesoftwareincategory,serversoftwareincategory=serversoftwareincategory,libsoftwareincategory=libsoftwareincategory).dump('../digitale_Methoden.wiki/SoftwareToolsList.asciidoc', encoding='utf-8')
+ template.stream(software_categories=software_categories).dump(
+ '../digitale_Methoden.wiki/SoftwareToolsList.asciidoc', encoding='utf-8')
+
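+    # Render one wiki page per software category, splitting its tools into
+    # stand-alone applications, server applications, and libraries.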
+ for software_category in software_categories:
+ # standalone software
+ standalonesoftwareincategory = Software.query.filter((Software.softwarecategory_id == software_category.id) & (Software.architecture == "stand-alone application"))
+ # server software
+ serversoftwareincategory = Software.query.filter((Software.softwarecategory_id == software_category.id) & (Software.architecture == "server application"))
+        # other software
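+        # Library-type tools are grouped by implementation language: R,
+        # Python, and everything else (~ negates the language filters).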
+        softwares_r = Software.query.filter((Software.softwarecategory_id == software_category.id) & ((Software.architecture != "stand-alone application") & (Software.architecture != "server application")) & Software.programminglanguages.any(name="R"))
+        softwares_py = Software.query.filter((Software.softwarecategory_id == software_category.id) & ((Software.architecture != "stand-alone application") & (Software.architecture != "server application")) & Software.programminglanguages.any(name="Python"))
+        softwares_misc = Software.query.filter((Software.softwarecategory_id == software_category.id) & ((Software.architecture != "stand-alone application") & (Software.architecture != "server application")) & ~(Software.programminglanguages.any(name="Python") | Software.programminglanguages.any(name="R")))
+ libsoftwareincategory = (softwares_r, softwares_py, softwares_misc)
+        # Generate the overview page for the tools in this category
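+        # The category name, stripped of spaces and slashes, doubles as the
+        # output file name.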
+ with open('templates/export/softwares_categories.jinja2', "r", encoding="utf-8") as file_:
+ template = Template(file_.read())
+            template.stream(software_category_name=software_category.name,
+                            software_category_short_description=software_category.short_description,
+                            standalonesoftwareincategory=standalonesoftwareincategory,
+                            serversoftwareincategory=serversoftwareincategory,
+                            libsoftwareincategory=libsoftwareincategory).dump(
+ '../digitale_Methoden.wiki/SoftwareCategory_' + software_category.name.replace(' ', '').replace('/', '') + '.asciidoc', encoding='utf-8')
# Generate methods overview page
hierarchy = db.session.query(Method, literal(0).label('level')).filter(Method.parent_id == null()) \
@@ -288,7 +295,7 @@ def index():
class AdvancedSoftwareView(sqla.ModelView):
column_sortable_list = ('name', ('license', ("license.name", "license.version")) , ('softwarecategory', 'softwarecategory.name'), 'lastchanged', 'architecture', )
- column_list = ('name', 'license', 'softwarecategory', 'links', 'architecture','programminglanguages','price', )
+ column_list = ('name', 'license', 'softwarecategory', 'links', 'architecture', 'programminglanguages', 'price', )
inline_models = (Link,)
column_hide_backrefs = False
page_size = 100
@@ -369,272 +376,300 @@ In this project a more inclusive conception of digital methods is assumed: the u
description="Transfer between different formats in order to unify and handle vacancies.",
parent=method2)
db.session.add(method3)
- m_text_preprocessing = Method(id=6,
+ m_algotextex = Method(id=6,
+ name="algoritmic text extraction",
+ description="Automated text extraction from markup language. See: (<<Günther_etal2014>>,117)",
+ parent=method2)
+ db.session.add(m_algotextex)
+ method3 = Method(id=7,
+ name="body text extraction",
+ description="Wrangling on structure based features of webpages. See: (<<Günther_etal2014>>,118)",
+ parent=m_algotextex)
+ db.session.add(method3)
+ method3 = Method(id=8,
+ name="boilerpipe",
+ description="More local approuch than body text extraction (<<Günther_etal2014>>,118)",
+ parent=m_algotextex)
+ db.session.add(method3)
+ method3 = Method(id=9,
+ name="jusText",
+ description="A heuristic based boilerplate removal tool See: (<<Pomikálek2011>>)",
+ parent=m_algotextex)
+ db.session.add(method3)
+ m_text_preprocessing = Method(id=10,
name="text preprocessing",
description="Some text preprocessing tasks in natuaral language processing.",
parent=method1)
db.session.add(m_text_preprocessing)
- m_tokenization = Method(id=7,
+ m_tokenization = Method(id=11,
name="tokenization",
description="Identify words in character input sequence.",
parent=m_text_preprocessing)
db.session.add(m_tokenization)
- m_stopword_removal = Method(id=8,
+ m_stopword_removal = Method(id=12,
name="stop-word removal",
description="Removing high-frequency words like pronoums, determiners or prepositions.",
parent=m_text_preprocessing)
db.session.add(m_stopword_removal)
- m_stemming = Method(id=9,
+ m_stemming = Method(id=13,
name="stemming",
description="Identify common stems on a syntactical level.",
parent=m_text_preprocessing)
db.session.add(m_stemming)
- m_word_sentence_segmentation = Method(id=10,
+ m_word_sentence_segmentation = Method(id=14,
name="word/sentence segmentation",
description="Separate a chunk of continuous text into separate words/sentences.",
parent=m_text_preprocessing)
db.session.add(m_word_sentence_segmentation)
- m_pos_tagging = Method(id=11,
+ m_pos_tagging = Method(id=15,
name="part-of-speech(POS)-tagging",
description="Identify the part of speech for words.",
parent=m_text_preprocessing)
db.session.add(m_pos_tagging)
- m_dependency_parsing = Method(id=12,
+ m_dependency_parsing = Method(id=16,
name="dependency parsing",
description="Create corresponding syntactic, semantic or morphologic trees from input text.",
parent=m_text_preprocessing)
db.session.add(m_dependency_parsing)
- m_syntactic_parsing = Method(id=13,
+ m_syntactic_parsing = Method(id=17,
name="syntactic parsing",
description="Create syntactic trees from input text using mostly unsupervised learning on manually annotated treebanks (<<Ignatow_etal2017>>,61).",
parent=m_dependency_parsing)
db.session.add(m_syntactic_parsing)
- m_word_sense_disambiguation = Method(id=14,
+ m_word_sense_disambiguation = Method(id=18,
name="word-sense disambiguation",
description="Recognizing context-sensetive meaning of words.",
parent=m_text_preprocessing)
db.session.add(m_word_sense_disambiguation)
- method2 = Method(id=15, name="information extraction",
+ method2 = Method(id=19, name="information extraction",
description="Extract factual information(e.g. people, places or situations) in free text.",
parent=method1)
db.session.add(method3)
- method3 = Method(id=16,
+ method3 = Method(id=20,
name="(named-)entity-recognition/resolution/extraction/tagging",
description="Identify instances of specific (pre-)defined types(e.g place, name or color) in text.",
parent=method2)
db.session.add(method3)
- method3 = Method(id=17,
+ method3 = Method(id=21,
name="relation extraction",
description="Extract relationships between entities.",
parent=method3)
db.session.add(method3)
- method2 = Method(id=18,name="information retrieval",
+ method2 = Method(id=22,name="information retrieval",
description="Retrieve relevant informations in response to the information requests.",
parent=method1)
db.session.add(method2)
- m_indexing = Method(id=19,name="indexing",
+ m_indexing = Method(id=23,name="indexing",
description="'organize data in such a way that it can be easily retrieved later on'(<<Ignatow_etal2017>>,137)",
parent=method2)
db.session.add(m_indexing)
- m_search_query = Method(id=20,name="searching/querying",
+ m_search_query = Method(id=24,name="searching/querying",
description="'take information requests in the form of queries and return relevant documents'(<<Ignatow_etal2017>>,137). There are different models in order to estimate the similarity between records and the search queries (e.g. boolean, vector space or a probabilistic model)(ibid).",
parent=method2)
db.session.add(m_search_query)
- method2 = Method(id=21,name="statistical analysis",
+ method2 = Method(id=25,name="statistical analysis",
description="",
parent=method1)
db.session.add(method2)
- m_frequency_analysis = Method(id=22,name="frequency analysis",
+ m_frequency_analysis = Method(id=26,name="frequency analysis",
description="Descriptiv statistical analysis by using specific text abundances.",
parent=method2)
db.session.add(m_frequency_analysis)
- m_w_dict_frequencies = Method(id=23,name="word frequencies/dictionary analysis",
+ m_w_dict_frequencies = Method(id=27,name="word frequencies/dictionary analysis",
description="Analyse statistical significant occurence of words/word-groups. Can also be combined with meta-data (e.g. creation time of document).",
parent=m_frequency_analysis)
db.session.add(m_w_dict_frequencies)
- m_co_occurence = Method(id=24,name="co-occurence analysis",
+ m_co_occurence = Method(id=28,name="co-occurence analysis",
description="Analyse statistical significant co-occurence of words in different contextual units.",
parent=m_frequency_analysis)
db.session.add(m_co_occurence)
- m_context_volatility = Method(id=25,name="context volatility",
+ m_context_volatility = Method(id=29,name="context volatility",
description="'Analyse contextual change for certain words over a period of time.'(<<Niekler_etal2018>>,1316)",
parent=m_frequency_analysis)
db.session.add(m_context_volatility)
- method3 = Method(id=26,name="classification/machine learning",
+ method3 = Method(id=30,name="classification/statistical learning",
description="Various techniques to (semi-)automatically identify specific classes. ",
parent=method2)
db.session.add(method3)
- m_supervised_classification = Method(id=27,name="supervised classification",
+ m_supervised_classification = Method(id=31,name="supervised classification",
description="Use given training examples in order to classify certain entities.",
parent=method3)
db.session.add(m_supervised_classification)
- method4 = Method(id=28,name="latent semantic analysis",
+ method4 = Method(id=32,name="latent semantic analysis",
description="'The basic idea of latent semantic analysis (LSA) is, that text do have a higher order (=latent semantic) structure which, however, is obscured by word usage (e.g. through the use of synonyms or polysemy). By using conceptual indices that are derived statistically via a truncated singular value decomposition (a two-mode factor analysis) over a given document-term matrix, this variability problem can be overcome.'(link:https://cran.r-project.org/web/packages/lsa/lsa.pdf[CRAN-R])",
parent=method3)
db.session.add(method4)
- method4 = Method(id=29,name="topic modeling",
+ method4 = Method(id=33,name="topic modeling",
description="Probabilistic models to infer semantic clusters. See especially <<Papilloud_etal2018>>.",
parent=method3)
db.session.add(method4)
- lda = Method(id=30,name="latent dirichlet allocation",
+ lda = Method(id=34,name="latent dirichlet allocation",
description="""'The application of LDA is based on three nested concepts: the text collection to be modelled is referred to as the corpus; one item within the corpus is a document, with words within a document called terms.(...) +
The aim of the LDA algorithm is to model a comprehensive representation of the corpus by inferring latent content variables, called topics. Regarding the level of analysis, topics are heuristically located on an intermediate level between the corpus and the documents and can be imagined as content-related categories, or clusters. (...) Since topics are hidden in the first place, no information about them is directly observable in the data. The LDA algorithm solves this problem by inferring topics from recurring patterns of word occurrence in documents.'(<<Maier_etal2018>>,94)""",
parent=method4)
db.session.add(lda)
- nmf = Method(id=31,name="non-negative-matrix-factorization",
+ nmf = Method(id=35,name="non-negative-matrix-factorization",
description="Inclusion of non-negative constraint.",
parent=method4)
db.session.add(nmf)
- stm = Method(id=32,name="structural topic modeling",
+ stm = Method(id=36,name="structural topic modeling",
description="Inclusion of meta-data. Refer especially to <<roberts2013>>.",
parent=method4)
db.session.add(stm)
- sa = Method(id=33,name="sentiment analysis",
+ sa = Method(id=37,name="sentiment analysis",
description="'Subjectivity and sentiment analysis focuses on the automatic identification of private states, such as opinions, emotions, sentiments, evaluations, beliefs, and speculations in natural language. While subjectivity classification labels text as either subjective or objective, sentiment classification adds an additional level of granularity, by further classifying subjective text as either positive, negative, or neutral.' (<<Ignatow_etal2017>> pp. 148)",
parent=method3)
db.session.add(sa)
- method4 = Method(id=34,name="automated narrative, argumentative structures, irony, metaphor detection/extraction",
+ method4 = Method(id=38,name="automated narrative, argumentative structures, irony, metaphor detection/extraction",
description="For automated narrative methapor analysis see (<<Ignatow_etal2017>>, 89-106. For argumentative structures(Task: Retrieving sentential arguments for any given controversial topic) <<Stab_etal2018>> .Refer for a current overview <<Cabrio2018>>.",
parent=method3)
db.session.add(method4)
- method3 = Method(id=35,name="network analysis/modeling",
+ method3 = Method(id=39,name="network analysis/modeling",
description="Generate networks out of text/relationships between text.",
parent=method2)
db.session.add(method3)
- method4 = Method(id=36, name="knowledge graph construction",
+ method4 = Method(id=40, name="knowledge graph construction",
description="Modelling entities and their relationships.",
parent=method3)
db.session.add(method4)
- method2 = Method(id=37,name="data visualization",
+ method2 = Method(id=41,name="data visualization",
description="Visualize the mined informations.",
parent=method1)
db.session.add(method2)
- method3 = Method(id=38,name="word relationships",
+ method3 = Method(id=42,name="word relationships",
description="",
parent=method2)
db.session.add(method3)
- method3 = Method(id=39,name="networks",
+ method3 = Method(id=43,name="networks",
description="",
parent=method2)
db.session.add(method3)
- method3 = Method(id=40,name="geo-referenced",
+ method3 = Method(id=44,name="geo-referenced",
description="",
parent=method2)
db.session.add(method3)
- method3 = Method(id=41,name="dynamic visualizations",
+ method3 = Method(id=45,name="dynamic visualizations",
description="Visualizations with user interaction or animations.",
parent=method2)
db.session.add(method3)
- method1 = Method(id=42,name="research practice",
+ method1 = Method(id=46,name="digital research practice in social or economic sciences",
description="",
parent=method)
db.session.add(method1)
- m_automated_data_collection = Method(id=43,
+ m_automated_data_collection = Method(id=47,
name="automated data collection",
description="""In principal there are multiple possible data sources in a data mining process. A basic distinction in relevance to automated data collection can be drawn between connected devices(internet, intranets) or unconnected devices(sensors, etc.). +
Furthermore the server-client-model is the established communication paradigms for connected devices. In order to obtain data either from server or client there exists three different interfaces: log files, apis and user interfaces which constitute the available procedures <<Jünger2018>>.""",
parent=method1)
db.session.add(m_automated_data_collection)
- m_collect_log = Method(id=44,
+ m_collect_log = Method(id=48,
name="collect log-data",
description="Collect log data which occur during providing the (web-)service or the information processing.",
parent=m_automated_data_collection)
db.session.add(m_collect_log)
- m_parsing = Method(id=45,
+ m_parsing = Method(id=49,
name="parsing from api",
description="Parse structured data from via a documented REST-API.",
parent=m_automated_data_collection)
db.session.add(m_parsing)
- m_scraping = Method(id=46,
+ m_scraping = Method(id=50,
name="scraping",
description="Automatically parse unstructured or semi-structured data from a normal website (⇒ web-scraping) or service.",
parent=m_automated_data_collection)
db.session.add(m_scraping)
- m_scraping_stat = Method(id=47,
+ m_scraping_stat = Method(id=51,
name="scraping (static content)",
description="Automatically parse data from static HTML websites.",
parent=m_scraping)
db.session.add(m_scraping_stat)
- m_scraping_dyn = Method(id=48,
+ m_scraping_dyn = Method(id=52,
name="scraping (dynamic content)",
description="Automatically parse dynamic content (HTML5/Javascript,) ⇒ sometimes requires mimicking user-interaction.",
parent=m_scraping)
db.session.add(m_scraping_dyn)
- m_crawling = Method(id=49,
+ m_crawling = Method(id=53,
name="crawling",
description="Collect websites with an initial set of webpages by following contained links <<Ignatow_etal2017>>.",
parent=m_automated_data_collection)
db.session.add(m_crawling)
-
- method2 = Method(id=50,name="digital research design",
+ m_tracking = Method(id=54,
+ name="tracking",
+ description="User-informed passive data collection.",
+ parent=m_automated_data_collection)
+ db.session.add(m_tracking)
+ m_esm_ema = Method(id=55,name="ecological momentary assessments (EMA)/Experience Sampling Method (ESM)",
+ description="EMA and ESM are mostly equivalent. EMA focusses on medical questions or measurements in a natural environment; ESM more on subjective Questions in the real life. Four characteristics: 1) data collection in natural environments 2) Focussing on near events/impressions/actions 3) questions triggered randomly or event-based 4) multiple questions over a certain period of time [Citation after Stone and Shiffmann 1994] (<<Salganik2018>>,109)",
+ parent=m_tracking)
+ db.session.add(m_esm_ema)
+ method2 = Method(id=56,name="blended reading/text mining",
+ description="Application of data mining methods on texts in social research. Refer <<Stulpe_etal2016>> for a detailed explanation.",
+ parent=method1)
+ db.session.add(method2)
+ method2 = Method(id=57,name="new forms of digital research design",
description="New possibilities in surveys or data aquisition techniques.",
parent=method1)
db.session.add(method2)
- m_esm_ema = Method(id=51,name="ecological momentary assessments (EMA)/Experience Sampling Method (ESM)",
- description="Mostly equivalent. EMA focusses on medical questions or measurements in a natural environment; ESM more on subjective Questions in the real life. Four characteristics: 1) data collection in natural environments 2) Focussing on near events/impressions/actions 3) questions triggered randomly or event-based 4) multiple questions over a certain period of time [Citation after Stone and Shiffmann 1994] (<<Salganik2018>>,109)",
- parent=method2)
- db.session.add(m_esm_ema)
- method3 = Method(id=52,name="wiki surveys",
+ method3 = Method(id=58,name="wiki surveys",
description="Guide open-answer questions with user feedback. Refer also (<<Salganik2018>>,111)",
parent=method2)
db.session.add(method3)
- m_online_experiment = Method(id=53,name="Online experiments",
+ m_online_experiment = Method(id=59,name="online experiments",
description="Synchronous or asynchronous online experiments.",
parent=method2)
db.session.add(m_online_experiment)
- method3 = Method(id=54,name="survey data linked to big data sources",
+ method3 = Method(id=60,name="survey data linked to big data sources",
description="",
parent=method2)
db.session.add(method3)
- method4 = Method(id=55,name="enriched asking",
+ method4 = Method(id=61,name="enriched asking",
description="'In enriched asking, survey data build context around a big data source that contains some important measurements but lacks others.'(<<Salganik2018>>,118)",
parent=method3)
db.session.add(method4)
- method4 = Method(id=56,name="amplified asking",
+ method4 = Method(id=62,name="amplified asking",
description="'Amplified asking using a predictive model to combine survey data from few people with a big data source from many people.'(<<Salganik2018>>,122)",
parent=method3)
db.session.add(method4)
- method2 = Method(id=57,name="collaborative work",
+ method2 = Method(id=63,name="collaborative work",
description="",
parent=method1)
db.session.add(method2)
- method3 = Method(id=58,name="open call projects",
+ method3 = Method(id=64,name="open call projects",
description="(e.g. annotation).",
parent=method2)
db.session.add(method3)
- method3 = Method(id=59,name="distributed data collection",
+ method3 = Method(id=65,name="distributed data collection",
description="",
parent=method2)
db.session.add(method3)
- method2 = Method(id=60,name="digital communication",
+ method2 = Method(id=66,name="digital communication",
description="",
parent=method1)
db.session.add(method2)
- method2 = Method(id=61,name="digital data/phenomena as reasearch-objective",
+ method2 = Method(id=67,name="digital data/phenomena as reasearch-objective",
description="",
parent=method1)
db.session.add(method2)
- m_statistical_modeling = Method(id=62,name="statistical modeling",
+ m_statistical_modeling = Method(id=68,name="statistical modeling",
description="",
parent=method1)
db.session.add(m_statistical_modeling)
- m_regression_analysis = Method(id=63,name="regression analysis",
+ m_regression_analysis = Method(id=69,name="regression analysis",
description="",
parent=m_statistical_modeling)
db.session.add(m_regression_analysis)
- m_time_series_analysis = Method(id=64,name="time-series analysis",
+ m_time_series_analysis = Method(id=70,name="time-series analysis",
description="",
parent=m_statistical_modeling)
db.session.add(m_time_series_analysis)
- m_nowcasting = Method(id=65,name="forecasting/nowcasting",
+ m_nowcasting = Method(id=71,name="forecasting/nowcasting",
description="Using methods to predict the future for estimation of current values. (Example: predict influenza epidemiology combining CDC Data and Google Trends(<<Salganik2018>>,46–50)).",
parent=m_time_series_analysis)
db.session.add(m_nowcasting)
- m_social_complexity = Method(id=66,name="social complexity modeling/ social simulation",
+ m_social_complexity = Method(id=72,name="social complexity modeling/ social simulation",
description="",
parent=method1)
db.session.add(m_social_complexity)
- m_agent_based_modeling = Method(id=67,name="agent-based modeling",
+ m_agent_based_modeling = Method(id=73,name="agent-based modeling",
description="",
parent=m_social_complexity)
db.session.add(m_agent_based_modeling)
@@ -681,6 +716,14 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
reference = Reference(name="Stulpe_etal2016",
cited="Stulpe, A., & Lemke, M. (2016). Blended Reading. In Text Mining in den Sozialwissenschaften (pp. 17–61). Springer.")
db.session.add(reference)
+ reference = Reference(name="Günther_etal2014",
+ cited="Günther E., Scharkow M. (2014). Automatisierte Datenbereinigung bei Inhalts- und Linkanalysen von Online-Nachrichten, in: K. Sommer, M. Wettstein, W. Wirth, & J. Matthes (Hrsg.): Automatisierung der Inhaltsanalyse, Köln: Halem, 111 - 126")
+ db.session.add(reference)
+ reference = Reference(name="Pomikálek2011",
+ cited="J. Pomikálek (2011). Removing Boilerplate and Duplicate Content from Web Corpora. PhD thesis, Masaryk University, Brno, 2011.")
+ db.session.add(reference)
+
+
lic_unknown = License(name="Unknown")
lic_bsd = License(name="BSD")
@@ -719,15 +762,20 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
prol_perl = Programminglanguage(name="Perl")
cat_tracking = SoftwareCategory(name="user-consented tracking", short_description="Collection of sensor data on (mobile) devices in accordance with data protection laws.")
- cat_scraping = SoftwareCategory(name="scraping", short_description="Tools in the area of web-scraping")
+
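+    # The former single "scraping" category is split into five more specific ones: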
+ cat_scraping = SoftwareCategory(name="general crawling/scraping", short_description="Tools in the area of web-crawling and scraping.")
+ cat_scraping_social_platforms = SoftwareCategory(name="scraping social platforms", short_description="Tools for specific social (media) platforms.")
+ cat_scraping_doc = SoftwareCategory(name="scraping documents", short_description="Tools for extracting semantic content out of documents (e.g pdfs, ...)")
+ cat_scraping_browser_based = SoftwareCategory(name="crawling/scraping by automating web browsers", short_description="Scraping with remote controlled browser-engines.")
+ cat_scraping_html = SoftwareCategory(name="scraping websites", short_description="Tools for extraction of text body in html websites.")
+
cat_int = SoftwareCategory(name="tools for corpus linguistics/text mining/(semi-)automated text analysis", short_description="Integrated platforms for corpus analysis and processing.")
cat_qda = SoftwareCategory(name="computer assisted/aided qualitative data analysis software (CAQDAS)", short_description="assist with qualitative research such as transcription analysis, coding and text interpretation, recursive abstraction, content analysis, discourse analysis, grounded theory methodology, etc.")
cat_tm = SoftwareCategory(name="natuaral language processing(NLP)", short_description="")
cat_senti = SoftwareCategory(name="sentiment analysis", short_description="")
cat_topic = SoftwareCategory(name="topic-models", short_description="")
cat_visu = SoftwareCategory(name="visualization", short_description="")
- cat_kollab_anno = SoftwareCategory(name="collaborative annotation", short_description="")
- cat_kollab_write = SoftwareCategory(name="collaborative writing", short_description="")
+ cat_kollab_anno = SoftwareCategory(name="collaborative annotation/writing", short_description="")
cat_stat = SoftwareCategory(name="statistical software", short_description="software that helps calcualting with specific statistical models")
cat_repo = SoftwareCategory(name="research data archiving", short_description="")
cat_now = SoftwareCategory(name="nowcasting", short_description="")
@@ -742,82 +790,89 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
cat_eye = SoftwareCategory(name="(remote) eye tracking")
cat_misc = SoftwareCategory(name="miscellaneous", short_description="")
- tool = Software(name="AWARE",
- short_description="'AWARE is an Android framework dedicated to instrument, infer, log and share mobile context information, for application developers, researchers and smartphone users. AWARE captures hardware-, software-, and human-based data. The data is then analyzed using AWARE plugins. They transform data into information you can understand.' link:http://www.awareframework.com/what-is-aware/[Source, visited: 27.02.2019]",
+ tool = Software(name="Scrapy",
+ short_description="",
developer="",
maintainer="",
- softwarecategory=cat_tracking,
+ softwarecategory=cat_scraping,
architecture="framework",
- license=lic_apache2,
- programminglanguages=[prol_java],
+ license=lic_bsd,
+ programminglanguages=[prol_py],
price="0")
db.session.add(tool)
- db.session.add(Link(software=tool, type="website", url="http://www.awareframework.com/", comment=""))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/denzilferreira/aware-client", comment="android"))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/tetujin/aware-client-ios", comment="iOS"))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/tetujin/aware-client-osx", comment="OSX"))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/denzilferreira/aware-server", comment="server"))
+ db.session.add(Link(software=tool, type="website", url="https://scrapy.org/", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/scrapy/scrapy", comment=""))
- tool = Software(name="MEILI",
- short_description="",
- developer="Adrian C. Prelipcean",
- maintainer="Adrian C. Prelipcean",
- softwarecategory=cat_tracking,
- architecture="framework",
+ tool = Software(name="Beautiful Soup",
+ short_description="'Beautiful Soup is a Python library designed for quick turnaround projects like screen-scraping. Three features make it powerful: Beautiful Soup provides a few simple methods and Pythonic idioms for navigating, searching, and modifying a parse tree: a toolkit for dissecting a document and extracting what you need. It doesn't take much code to write an application;Beautiful Soup automatically converts incoming documents to Unicode and outgoing documents to UTF-8. You don't have to think about encodings, unless the document doesn't specify an encoding and Beautiful Soup can't detect one. Then you just have to specify the original encoding.;Beautiful Soup sits on top of popular Python parsers like lxml and html5lib, allowing you to try out different parsing strategies or trade speed for flexibility.'link:https://www.crummy.com/software/BeautifulSoup/[Retrieved 22.03.2019]",
+ developer="Leonard Richardson",
+ maintainer="Leonard Richardson",
+ softwarecategory=cat_scraping,
+ architecture="library",
+ license=lic_mit,
+ programminglanguages=[prol_py],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://www.crummy.com/software/BeautifulSoup/", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/scrapy/scrapy", comment=""))
+
+ tool = Software(name="Rcrawler",
+ short_description="'RCrawler is a contributed R package for domain-based web crawling and content scraping. As the first implementation of a parallel web crawler in the R environment, RCrawler can crawl, parse, store pages, extract contents, and produce data that can be directly employed for web content mining applications.' link:https://www.sciencedirect.com/science/article/pii/S2352711017300110[Retrieved 22.03.2019]",
+ developer="Salim Khalil",
+ maintainer="Salim Khalil",
+ softwarecategory=cat_scraping,
+ architecture="library",
license=lic_gpl3,
- programminglanguages=[prol_java],
- price="0",
- recommandedcitation="")
+ programminglanguages=[prol_r],
+ price="0")
db.session.add(tool)
- db.session.add(Link(software=tool, type="website", url="http://adrianprelipcean.github.io/", comment="dev"))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/Badger-MEILI",
- comment="group"))
+ db.session.add(Link(software=tool, type="website", url="https://www.sciencedirect.com/science/article/pii/S2352711017300110", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/salimk/Rcrawler", comment=""))
- tool = Software(name="Passive Data Kit",
+ tool = Software(name="Robobrowser",
short_description="",
- developer="Chris Karr",
- maintainer="Chris Karr",
- softwarecategory=cat_tracking,
- architecture="framework",
- license=lic_apache2,
- programminglanguages=[prol_py,prol_java],
- languages=[lang_en],
- operatingsystems=[os_ix,os_droid,os_ios],
- currentversion="",
- lastchanged=datetime.datetime.strptime('26022019', '%d%m%Y').date(),
- methods=[m_esm_ema],
+ developer="Joshua Carp",
+ maintainer="Joshua Carp",
+ softwarecategory=cat_scraping,
+ architecture="library",
+ license=lic_mit,
+ programminglanguages=[prol_py],
price="0")
db.session.add(tool)
- db.session.add(Link(software=tool, type="website", url="https://passivedatakit.org/", comment=""))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/audaciouscode/PassiveDataKit-Django", comment="djangoserver"))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/audaciouscode/PassiveDataKit-Android", comment="android"))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/audaciouscode/PassiveDataKit-iOS", comment="iOS"))
+ db.session.add(Link(software=tool, type="website", url="https://robobrowser.readthedocs.io/en/latest/readme.html", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/jmcarp/robobrowser", comment=""))
- tool = Software(name="Web Historian(CE)",
- short_description="Chrome browser extension designed to integrate web browsing history data collection into research projects collecting other types of data from participants (e.g. surveys, in-depth interviews, experiments). It uses client-side D3 visualizations to inform participants about the data being collected during the informed consent process. It allows participants to delete specific browsing data or opt-out of browsing data collection. It directs participants to an online survey once they have reviewed their data and made a choice of whether to participate. It has been used with Qualtrics surveys, but any survey that accepts data from a URL will work. It works with the open source Passive Data Kit (PDK) as the backend for data collection. To successfully upload, you need to fill in the address of your PDK server in the js/app/config.js file.",
- developer="Ericka Menchen-Trevino and Chris Karr",
- maintainer="Ericka Menchen-Trevino and Chris Karr",
- softwarecategory=cat_tracking,
- architecture="plugin",
+ tool = Software(name="rvest",
+ short_description="",
+ developer="Hadley Wickham",
+ maintainer="Hadley Wickham",
+ softwarecategory=cat_scraping,
+ architecture="library",
license=lic_gpl3,
- programminglanguages=[prol_js],
- languages=[lang_en],
- operatingsystems=[os_browser],
- currentversion="e06b3e174f9668f5c62f30a9bedde223023e0bca",
- lastchanged=datetime.datetime.strptime('18022019', '%d%m%Y').date(),
- methods=[m_esm_ema],
- price="0",
- recommandedcitation="Menchen-Trevino, E., & Karr, C. (2018). Web Historian - Community Edition. Zenodo. https://doi.org/10.5281/zenodo.1322782")
+ programminglanguages=[prol_r],
+ price="0")
db.session.add(tool)
- db.session.add(Link(software=tool, type="website", url="https://doi.org/10.5281/zenodo.1322782", comment="doi"))
- db.session.add(Link(software=tool, type="website", url="http://www.webhistorian.org", comment=""))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/WebHistorian/community", comment=""))
+ db.session.add(Link(software=tool, type="website", url="https://cran.r-project.org/web/packages/rvest/index.html", comment="cran"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/tidyverse/rvest", comment=""))
+
+ tool = Software(name="facepager",
+ short_description="",
+ developer="Jakob Jünger and Till Keyling",
+ maintainer="Jakob Jünger",
+ softwarecategory=cat_scraping_social_platforms,
+ architecture="stand-alone application",
+ license=lic_mit,
+ programminglanguages=[prol_py],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="wiki", url="https://github.com/strohne/Facepager", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/strohne/Facepager", comment=""))
tool = Software(name="TWINT",
short_description="TWINT (Twitter Intelligence Tool) 'Formerly known as Tweep, Twint is an advanced Twitter scraping tool written in Python that allows for scraping Tweets from Twitter profiles without using Twitter's API.' link:https://github.com/twintproject/twint[Retrieved 07.03.2019]",
developer="Cody Zacharias",
maintainer="Cody Zacharias",
- softwarecategory=cat_scraping,
+ softwarecategory=cat_scraping_social_platforms,
architecture="library",
license=lic_mit,
programminglanguages=[prol_py],
@@ -830,7 +885,7 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
short_description="'This repository contains an R script as well as an interactive Jupyter notebook to demonstrate how to automatically collect, format, and explore YouTube comments, including the emojis they contain. The script and notebook showcase the following steps: Getting access to the YouTube API Extracting comments for a video Formatting the comments & extracting emojis Basic sentiment analysis for text & emojis' link:https://github.com/JuKo007/YouTubeComments[Retrieved 07.03.2019]",
developer="Kohne, J., Breuer, J., & Mohseni, M. R.",
maintainer="Kohne, J., Breuer, J., & Mohseni, M. R.",
- softwarecategory=cat_scraping,
+ softwarecategory=cat_scraping_social_platforms,
architecture="library",
license=lic_unknown,
programminglanguages=[prol_r],
@@ -840,63 +895,200 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
db.session.add(Link(software=tool, type="website", url="https://osf.io/hqsxe/", comment=""))
db.session.add(Link(software=tool, type="repository", url="https://github.com/JuKo007/YouTubeComments", comment=""))
- tool = Software(name="facepager",
- short_description="",
- developer="Jakob Jünger and Till Keyling",
- maintainer="Jakob Jünger",
- softwarecategory=cat_scraping,
- architecture="stand-alone application",
- license=lic_mit,
+ tool = Software(name="youtube-transcript-scraper",
+ short_description="Since YouTube does not provide automatically generated transcripts via its API and normal scraping does not work with YT's ajaxy interface, this script uses browser automation to click through the YouTube web interface and download the transcript file.",
+ developer="Bernhard Rieder",
+ maintainer="Bernhard Rieder",
+ softwarecategory=cat_scraping_social_platforms,
+ architecture="library",
+ license=lic_unknown,
programminglanguages=[prol_py],
- price="0")
+ price="0",
+ recommandedcitation="")
db.session.add(tool)
- db.session.add(Link(software=tool, type="wiki", url="https://github.com/strohne/Facepager", comment=""))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/strohne/Facepager", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/bernorieder/youtube-transcript-scraper", comment=""))
- tool = Software(name="Scrapy",
+ tool = Software(name="Grobid",
+ short_description="GROBID is a machine learning library for extracting, parsing and re-structuring raw documents such as PDF into structured XML/TEI encoded documents with a particular focus on technical and scientific publications.",
+ developer="Patrice Lopez",
+ maintainer="Patrice Lopez",
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+ license=lic_apache2,
+ programminglanguages=[prol_java],
+ price="0",
+ recommandedcitation="@misc{GROBID, title = {GROBID}, howpublished = {\\url{https://github.com/kermitt2/grobid}}, publisher = {GitHub}, year = {2008 --- 2019}, archivePrefix = {swh}, eprint = {1:dir:6a298c1b2008913d62e01e5bc967510500f80710}}")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="http://cloud.science-miner.com/grobid/", comment="demo"))
+ db.session.add(Link(software=tool, type="website", url="https://grobid.readthedocs.io/en/latest/", comment="documentation"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/kermitt2/grobid", comment=""))
+
+ tool = Software(name="LexisNexisTools",
+ short_description="This package provides functions to read files manually downloaded from ‘LexisNexis’ and comes with a few other features I think come in handy while working with data from the popular newspaper archive.",
+ developer="Johannes Gruber",
+ maintainer="Johannes Gruber",
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+ license=lic_gpl3,
+ programminglanguages=[prol_r],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://cran.r-project.org/web/packages/LexisNexisTools/index.html", comment="cran"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/JBGruber/LexisNexisTools", comment=""))
+
+ tool = Software(name="news_extract",
+ short_description="news_extract allows the output of the NexisUni and Factiva databases to be imported into Python. Note, you must export your documents manually first! This module does not scrape the databases directly; rather, it extracts articles and associated metadata from pre-exported output files.",
+ developer="Deen Freelon",
+ maintainer="Deen Freelon",
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+ license=lic_bsd,
+ programminglanguages=[prol_py],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/dfreelon/news_extract", comment=""))
+
+ tool = Software(name="pdf2xml",
+ short_description="convert PDF files to XML. This script heavily relies on Apache Tika and pdftotext for the extraction of text and the conversion to XML. It tries to combine information from both tools and different conversion modes:",
+ developer="Jörg Tiedemann",
+ maintainer="Jörg Tiedemann",
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+ license=lic_gpl3,
+ programminglanguages=[prol_perl, prol_java],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="repository", url="https://bitbucket.org/tiedemann/pdf2xml/src/master/", comment=""))
+
+ tool = Software(name="PdfAct",
+ short_description="A basic tool that extracts the structure from the PDF files of scientific articles.",
+ developer="Claudius Korzen",
+ maintainer="Claudius Korzen",
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+ license=lic_apache2,
+ programminglanguages=[prol_java],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/ad-freiburg/pdfact", comment=""))
+
+ tool = Software(name="pdfalto",
+ short_description="pdfalto is a command line executable for parsing PDF files and producing structured XML representations of the PDF content in ALTO format.",
+ developer="Hervé Déjean (XRCE)",
+ maintainer="Patrice Lopez",
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+ license=lic_gpl2,
+ programminglanguages=[prol_c],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/kermitt2/pdfalto", comment=""))
+
+ tool = Software(name="PDFBox",
+ short_description="The Apache PDFBox® library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents.",
+ developer="",
+ maintainer="",
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+ license=lic_apache2,
+ programminglanguages=[prol_java],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://pdfbox.apache.org/", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://svn.apache.org/viewvc/pdfbox/", comment=""))
+
+ tool = Software(name="PDFMiner.six",
short_description="",
developer="",
maintainer="",
- softwarecategory=cat_scraping,
- architecture="framework",
- license=lic_bsd,
- programminglanguages=[prol_py],
- price="0")
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+                    license=lic_mit,
+                    programminglanguages=[prol_py],
+ price="0",
+ recommandedcitation="")
db.session.add(tool)
- db.session.add(Link(software=tool, type="website", url="https://scrapy.org/", comment=""))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/scrapy/scrapy", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/pdfminer/pdfminer.six", comment=""))
- tool = Software(name="Beautiful Soup",
- short_description="'Beautiful Soup is a Python library designed for quick turnaround projects like screen-scraping. Three features make it powerful: Beautiful Soup provides a few simple methods and Pythonic idioms for navigating, searching, and modifying a parse tree: a toolkit for dissecting a document and extracting what you need. It doesn't take much code to write an application;Beautiful Soup automatically converts incoming documents to Unicode and outgoing documents to UTF-8. You don't have to think about encodings, unless the document doesn't specify an encoding and Beautiful Soup can't detect one. Then you just have to specify the original encoding.;Beautiful Soup sits on top of popular Python parsers like lxml and html5lib, allowing you to try out different parsing strategies or trade speed for flexibility.'link:https://www.crummy.com/software/BeautifulSoup/[Retrieved 22.03.2019]",
- developer="Leonard Richardson",
- maintainer="Leonard Richardson",
- softwarecategory=cat_scraping,
+ tool = Software(name="poppler",
+ short_description="A library for rendering PDF files, and examining or modifying their structure.",
+ developer="Derek Noonburg and others",
+ maintainer="",
+ softwarecategory=cat_scraping_doc,
architecture="library",
- license=lic_mit,
- programminglanguages=[prol_py],
+ license=lic_gpl3,
+ programminglanguages=[prol_c],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://poppler.freedesktop.org/", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://gitlab.freedesktop.org/poppler/poppler", comment=""))
+
+ tool = Software(name="readtext",
+ short_description="readtext is a one-function package that does exactly what it says on the tin: It reads files containing text, along with any associated document-level metadata, which we call “docvars”, for document variables. Plain text files do not have docvars, but other forms such as .csv, .tab, .xml, and .json files usually do.",
+ developer="Kenneth Benoit [aut, cre, cph], Adam Obeng [aut], Kohei Watanabe [ctb], Akitaka Matsuo [ctb], Paul Nulty [ctb], Stefan Müller [ctb]",
+ maintainer="Kenneth Benoit",
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+ license=lic_gpl3,
+ programminglanguages=[prol_r],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://readtext.quanteda.io/reference/readtext.html", comment=""))
+ db.session.add(Link(software=tool, type="website", url="https://cran.r-project.org/package=readtext", comment="cran"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/quanteda/readtext", comment=""))
+
+ tool = Software(name="xpdf",
+ short_description="Xpdf is a free PDF viewer and toolkit, including a text extractor, image converter, HTML converter, and more. Most of the tools are available as open source.",
+ developer="Derek Noonburg",
+ maintainer="Derek Noonburg",
+ softwarecategory=cat_scraping_doc,
+ architecture="library",
+ license=lic_gpl3,
+ programminglanguages=[prol_c],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="repository", url="http://www.xpdfreader.com/about.html", comment=""))
+
+ tool = Software(name="Puppeteer",
+ short_description="Puppeteer is a Node library which provides a high-level API to control Chrome or Chromium over the DevTools Protocol. Puppeteer runs headless by default, but can be configured to run full (non-headless) Chrome or Chromium.",
+ developer="Chrome DevTools team",
+ maintainer="Chrome DevTools team",
+ softwarecategory=cat_scraping_browser_based,
+ architecture="library",
+ license=lic_apache2,
+ programminglanguages=[prol_js],
price="0")
db.session.add(tool)
- db.session.add(Link(software=tool, type="website", url="https://www.crummy.com/software/BeautifulSoup/", comment=""))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/scrapy/scrapy", comment=""))
+ db.session.add(Link(software=tool, type="website", url="https://pptr.dev/", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/GoogleChrome/puppeteer", comment=""))
- tool = Software(name="Robobrowser",
- short_description="",
- developer="Joshua Carp",
- maintainer="Joshua Carp",
- softwarecategory=cat_scraping,
+ tool = Software(name="Pyppeteer",
+ short_description="Unofficial Python port of puppeteer JavaScript (headless) chrome/chromium browser automation library.",
+ developer="",
+ maintainer="",
+ softwarecategory=cat_scraping_browser_based,
architecture="library",
license=lic_mit,
programminglanguages=[prol_py],
price="0")
db.session.add(tool)
- db.session.add(Link(software=tool, type="website", url="https://robobrowser.readthedocs.io/en/latest/readme.html", comment=""))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/jmcarp/robobrowser", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/miyakogi/pyppeteer", comment=""))
+ db.session.add(Link(software=tool, type="website", url="https://miyakogi.github.io/pyppeteer/", comment="documentation"))
tool = Software(name="RSelenium",
short_description="",
developer="John Harrison",
maintainer="Ju Yeong Kim",
- softwarecategory=cat_scraping,
+ softwarecategory=cat_scraping_browser_based,
architecture="library",
license=lic_agpl3,
programminglanguages=[prol_r],
@@ -904,18 +1096,168 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
db.session.add(tool)
db.session.add(Link(software=tool, type="repository", url="https://github.com/ropensci/RSelenium", comment=""))
- tool = Software(name="rvest",
- short_description="",
- developer="Hadley Wickham",
- maintainer="Hadley Wickham",
- softwarecategory=cat_scraping,
+ tool = Software(name="Selenium",
+ short_description="Selenium is an umbrella project encapsulating a variety of tools and libraries enabling web browser automation. Selenium specifically provides infrastructure for the W3C WebDriver specification — a platform and language-neutral coding interface compatible with all major web browsers.",
+ developer="Jason Huggins",
+ maintainer="",
+ softwarecategory=cat_scraping_browser_based,
+ architecture="library",
+ license=lic_apache2,
+ programminglanguages=[prol_java],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://docs.seleniumhq.org", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/SeleniumHQ/selenium", comment=""))
+
+ tool = Software(name="splash",
+ short_description="Splash is a javascript rendering service with an HTTP API. It's a lightweight browser with an HTTP API, implemented in Python 3 using Twisted and QT5.",
+ developer="Scrapinghub",
+ maintainer="Scrapinghub",
+ softwarecategory=cat_scraping_browser_based,
+ architecture="library",
+ license=lic_bsd,
+ programminglanguages=[prol_py],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://splash.readthedocs.io/en/stable/", comment="documentation"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/scrapinghub/splash", comment=""))
+
+ tool = Software(name="boilerpipeR",
+ short_description="Generic Extraction of main text content from HTML files; removal of ads, sidebars and headers using the boilerpipe (http://code.google.com/p/boilerpipe/) Java library. The extraction heuristics from boilerpipe show a robust performance for a wide range of web site templates.",
+ developer="C. Kohlschütter,P.Fankhauser,W.Nejdl",
+ maintainer="Mario Annau",
+ softwarecategory=cat_scraping_html,
architecture="library",
license=lic_gpl3,
programminglanguages=[prol_r],
price="0")
db.session.add(tool)
- db.session.add(Link(software=tool, type="website", url="https://cran.r-project.org/web/packages/rvest/index.html", comment="cran"))
- db.session.add(Link(software=tool, type="repository", url="https://github.com/tidyverse/rvest", comment=""))
+ db.session.add(Link(software=tool, type="website", url="https://cran.r-project.org/web/packages/boilerpipeR/index.html", comment="cran"))
+
+ tool = Software(name="goose3",
+ short_description="'Goose was originally an article extractor written in Java that has most recently (Aug2011) been converted to a scala project. This is a complete rewrite in Python. The aim of the software is to take any news article or article-type web page and not only extract what is the main body of the article but also all meta data and most probable image candidate.'",
+ developer="Xavier Grangier",
+ maintainer="Xavier Grangier",
+ softwarecategory=cat_scraping_html,
+ architecture="library",
+ license=lic_apache2,
+ programminglanguages=[prol_py],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/goose3/goose3", comment=""))
+
+ tool = Software(name="jusText",
+ short_description="'jusText is a tool for removing boilerplate content, such as navigation links, headers, and footers from HTML pages. It is designed to preserve mainly text containing full sentences and it is therefore well suited for creating linguistic resources such as Web corpora.'",
+ developer="Jan Pomikálek",
+ maintainer="",
+ softwarecategory=cat_scraping_html,
+ architecture="library",
+ license=lic_bsd,
+ programminglanguages=[prol_py],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="http://corpus.tools/wiki/Justext", comment=""))
+
+ tool = Software(name="newspaper3k",
+ short_description="Newspaper delivers Instapaper style article extraction.",
+ developer="Lucas Ou-Yang",
+ maintainer="Lucas Ou-Yang",
+ softwarecategory=cat_scraping_html,
+ architecture="library",
+ license=lic_mit,
+ programminglanguages=[prol_py],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://newspaper.readthedocs.io/en/latest/", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/codelucas/newspaper", comment=""))
+
+ tool = Software(name="python-boilerpipe",
+ short_description="A python wrapper for Boilerpipe, an excellent Java library for boilerplate removal and fulltext extraction from HTML pages.",
+ developer="",
+ maintainer="",
+ softwarecategory=cat_scraping_html,
+ architecture="library",
+ license=lic_apache2,
+ programminglanguages=[prol_py],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/misja/python-boilerpipe", comment=""))
+
+ tool = Software(name="AWARE",
+ short_description="'AWARE is an Android framework dedicated to instrument, infer, log and share mobile context information, for application developers, researchers and smartphone users. AWARE captures hardware-, software-, and human-based data. The data is then analyzed using AWARE plugins. They transform data into information you can understand.' link:http://www.awareframework.com/what-is-aware/[Source, visited: 27.02.2019]",
+ developer="",
+ maintainer="",
+ softwarecategory=cat_tracking,
+ architecture="framework",
+ license=lic_apache2,
+ programminglanguages=[prol_java],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="http://www.awareframework.com/", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/denzilferreira/aware-client", comment="android"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/tetujin/aware-client-ios", comment="iOS"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/tetujin/aware-client-osx", comment="OSX"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/denzilferreira/aware-server", comment="server"))
+
+ tool = Software(name="Filter-Bubble-Web-Historian",
+ short_description="Part of this project makes use of the Web Historian browser extension, developed by Ericka Menchen-Trevino. To configure this extension, the files contained in this repository can be combined with the Web Historian repository, which is added as a submodule.",
+ developer="Ericka Menchen-Trevino and Chris Karr",
+ maintainer="Laurens Bogaardt",
+ softwarecategory=cat_tracking,
+ architecture="plugin",
+ license=lic_gpl3,
+ programminglanguages=[prol_py, prol_js],
+ languages=[lang_en],
+ operatingsystems=[os_browser],
+ currentversion="1.0",
+ lastchanged=datetime.datetime.strptime('13092019', '%d%m%Y').date(),
+ methods=[m_esm_ema],
+ price="0",
+ recommandedcitation="")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="http://ccs.amsterdam/projects/jeds/", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/Filter-Bubble", comment="all"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/WebHistorian/community", comment=""))
+
+ tool = Software(name="Passive Data Kit",
+ short_description="",
+ developer="Chris Karr",
+ maintainer="Chris Karr",
+ softwarecategory=cat_tracking,
+ architecture="framework",
+ license=lic_apache2,
+ programminglanguages=[prol_py, prol_java],
+ languages=[lang_en],
+ operatingsystems=[os_ix, os_droid, os_ios],
+ currentversion="",
+ lastchanged=datetime.datetime.strptime('26022019', '%d%m%Y').date(),
+ methods=[m_esm_ema],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://passivedatakit.org/", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/audaciouscode/PassiveDataKit-Django", comment="djangoserver"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/audaciouscode/PassiveDataKit-Android", comment="android"))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/audaciouscode/PassiveDataKit-iOS", comment="iOS"))
+
+ tool = Software(name="Web Historian(CE)",
+ short_description="Chrome browser extension designed to integrate web browsing history data collection into research projects collecting other types of data from participants (e.g. surveys, in-depth interviews, experiments). It uses client-side D3 visualizations to inform participants about the data being collected during the informed consent process. It allows participants to delete specific browsing data or opt-out of browsing data collection. It directs participants to an online survey once they have reviewed their data and made a choice of whether to participate. It has been used with Qualtrics surveys, but any survey that accepts data from a URL will work. It works with the open source Passive Data Kit (PDK) as the backend for data collection. To successfully upload, you need to fill in the address of your PDK server in the js/app/config.js file.",
+ developer="Ericka Menchen-Trevino and Chris Karr",
+ maintainer="Ericka Menchen-Trevino and Chris Karr",
+ softwarecategory=cat_tracking,
+ architecture="plugin",
+ license=lic_gpl3,
+ programminglanguages=[prol_py, prol_js],
+ languages=[lang_en],
+ operatingsystems=[os_browser],
+ currentversion="e06b3e174f9668f5c62f30a9bedde223023e0bca",
+ lastchanged=datetime.datetime.strptime('18022019', '%d%m%Y').date(),
+ methods=[m_esm_ema],
+ price="0",
+ recommandedcitation="Menchen-Trevino, E., & Karr, C. (2018). Web Historian - Community Edition. Zenodo. https://doi.org/10.5281/zenodo.1322782")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://doi.org/10.5281/zenodo.1322782", comment="doi"))
+ db.session.add(Link(software=tool, type="website", url="http://www.webhistorian.org", comment=""))
+ db.session.add(Link(software=tool, type="repository", url="https://github.com/WebHistorian/community", comment=""))
tool = Software(name="AmCAT",
short_description="'The Amsterdam Content Analysis Toolkit (AmCAT) is an open source infrastructure that makes it easy to do large-scale automatic and manual content analysis (text analysis) for the social sciences and humanities.'",
@@ -1158,6 +1500,19 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
db.session.add(Link(software=tool, type="website", url="https://pypi.org/project/gensim/", comment=""))
db.session.add(Link(software=tool, type="repository", url="", comment=""))
+ tool = Software(name="koRpus",
+ short_description="'A set of tools to analyze texts.'",
+ developer="Meik Michalke, Earl Brown, Alberto Mirisola, Alexandre Brulet, Laura Hauser",
+ maintainer="Meik Michalke",
+ softwarecategory=cat_tm,
+ architecture="library",
+ license=lic_gpl3,
+ programminglanguages=[prol_r],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://reaktanz.de/?c=hacking&s=koRpus", comment=""))
+ db.session.add(Link(software=tool, type="website", url="https://cran.r-project.org/web/packages/koRpus/index.html", comment="cran"))
+
tool = Software(name="NLTK",
short_description="'NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active discussion forum.'",
developer="",
@@ -1261,7 +1616,18 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
db.session.add(tool)
db.session.add(Link(software=tool, type="website", url="http://tm.r-forge.r-project.org/", comment=""))
db.session.add(Link(software=tool, type="website", url="https://cran.r-project.org/package=tm", comment="cran"))
- db.session.add(Link(software=tool, type="repository", url="", comment=""))
+
+ tool = Software(name="tosca",
+ short_description="'A framework for statistical analysis in content analysis. In addition to a pipeline for preprocessing text corpora and linking to the latent Dirichlet allocation from the 'lda' package, plots are offered for the descriptive analysis of text corpora and topic models. In addition, an implementation of Chang's intruder words and intruder topics is provided.'",
+ developer="Lars Koppers, Jonas Rieger, Karin Boczek, Gerret von Nordheim",
+ maintainer="Lars Koppers",
+ softwarecategory=cat_tm,
+ architecture="library",
+ license=lic_gpl3,
+ programminglanguages=[prol_r],
+ price="0")
+ db.session.add(tool)
+ db.session.add(Link(software=tool, type="website", url="https://cran.r-project.org/web/packages/tosca/index.html", comment="cran"))
tool = Software(name="xtas",
short_description="the eXtensible Text Analysis Suite(xtas) 'is a collection of natural language processing and text mining tools, brought together in a single software package with built-in distributed computing and support for the Elasticsearch document store.'",
@@ -1412,7 +1778,7 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
short_description="",
developer="",
maintainer="",
- softwarecategory=cat_kollab_write,
+ softwarecategory=cat_kollab_anno,
architecture="server application",
license=lic_agpl3,
programminglanguages=[prol_py,prol_js])
@@ -1602,7 +1968,7 @@ The aim of the LDA algorithm is to model a comprehensive representation of the c
license=lic_unknown,
programminglanguages=[prol_java])
db.session.add(tool)
- db.session.add(Link(software=tool, type="website", url="", comment=""))
+ db.session.add(Link(software=tool, type="website", url="https://exmaralda.org/de/", comment=""))
db.session.add(Link(software=tool, type="repository", url="https://github.com/EXMARaLDA/exmaralda", comment=""))
tool = Software(name="tesseract",
diff --git a/templates/export/softwares.jinja2 b/templates/export/softwares.jinja2
index 1b7aa26..f3a1b6e 100644
--- a/templates/export/softwares.jinja2
+++ b/templates/export/softwares.jinja2
@@ -4,41 +4,9 @@
:sectnums:
:sectnumlevels: 8
-== stand-alone applications
-{% for softwarecategory in standalonesoftwareincategory %}{% if softwarecategory[1].count() > 0 %}
-=== {{ softwarecategory[0].name }}
-_{{softwarecategory[0].short_description}}_
-{% for software in softwarecategory[1] %}
-link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
-{% endfor %}{% endif %}{% endfor %}
+{% for software_category in software_categories %}
+=== link:SoftwareCategory_{{ software_category.name.replace(' ', '').replace('/', '') }}[{{ software_category.name }}]
+_{{software_category.short_description}}_
-== server applications
-{% for softwarecategory in serversoftwareincategory %}{% if softwarecategory[1].count() > 0 %}
-=== {{ softwarecategory[0].name }}
-_{{softwarecategory[0].short_description}}_
-{% for software in softwarecategory[1] %}
-link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
-{% endfor %}{% endif %}{% endfor %}
+{% endfor %}
-== programming-frameworks/libraries etc.
-=== R
-{% for softwarecategory in libsoftwareincategory %}{% if softwarecategory[1].count() > 0 %}
-==== {{ softwarecategory[0].name }}
-_{{softwarecategory[0].short_description}}_
-{% for software in softwarecategory[1] %}
-link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
-{% endfor %}{% endif %}{% endfor %}
-=== Python
-{% for softwarecategory in libsoftwareincategory %}{% if softwarecategory[2].count() > 0 %}
-==== {{ softwarecategory[0].name }}
-_{{softwarecategory[0].short_description}}_
-{% for software in softwarecategory[2] %}
-link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
-{% endfor %}{% endif %}{% endfor %}
-=== Others
-{% for softwarecategory in libsoftwareincategory %}{% if softwarecategory[3].count() > 0 %}
-==== {{ softwarecategory[0].name }}
-_{{softwarecategory[0].short_description}}_
-{% for software in softwarecategory[3] %}
-link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
-{% endfor %}{% endif %}{% endfor %}
diff --git a/templates/export/softwares_categories.jinja2 b/templates/export/softwares_categories.jinja2
new file mode 100644
index 0000000..e9a81cb
--- /dev/null
+++ b/templates/export/softwares_categories.jinja2
@@ -0,0 +1,32 @@
+:toc:
+:toclevels: 8
+:toc-title: Software category {{software_category_name}} (work in progress: this list is preliminary and will be updated continuously)
+:sectnums:
+:sectnumlevels: 8
+
+== {{software_category_name}}
+_{{software_category_short_description}}_
+
+{% if standalonesoftwareincategory.count() > 0 %}=== stand-alone applications
+{% for software in standalonesoftwareincategory %}
+link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
+{% endfor %}{% endif %}
+
+{% if serversoftwareincategory.count() > 0 %}=== server applications
+{% for software in serversoftwareincategory %}
+link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
+{% endfor %}{% endif %}
+
+{% if libsoftwareincategory[0].count() > 0 or libsoftwareincategory[1].count() > 0 or libsoftwareincategory[2].count() > 0 %}=== programming-frameworks/libraries etc.
+{% if libsoftwareincategory[0].count() > 0 %}==== R
+{% for software in libsoftwareincategory[0] %}
+link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
+{% endfor %}{% endif %}
+{% if libsoftwareincategory[1].count() > 0 %}==== Python
+{% for software in libsoftwareincategory[1] %}
+link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
+{% endfor %}{% endif %}
+{% if libsoftwareincategory[2].count() > 0 %}==== Others
+{% for software in libsoftwareincategory[2] %}
+link:Tool_{{ software.name.replace(' ', '').replace('/', '') }}[{{ software.name }}] ({% for link in software.links %}link:{{link.url}}[{{link.type}}{{'-'+link.comment if link.comment else '' }}] {% endfor %}):: {{software.short_description}} < {% if software.currentversion %}{{software.currentversion}}{% endif %} | {{software.license}} | {{software.architecture}} | {{ software.programminglanguages |map(attribute='name')|join(', ') }} | {{ software.languages |map(attribute='name')|join(', ') }}>
+{% endfor %}{% endif %}{% endif %}
--
GitLab