From 7b8730bc6f6e630346904dea0c4677c15d6fe757 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gro=C3=9Fe=2C=20Judith?= <judith.grosse@studium.uni-hamburg.de>
Date: Sun, 14 Nov 2021 16:58:50 +0000
Subject: [PATCH] Main

---
 input/input_fj.py                        | 144 ------------------
 verarbeitung/Processing.py               | 117 ++++++++++++++
 .../__pycache__/input_fj.cpython-38.pyc  | Bin 0 -> 4341 bytes
 .../__pycache__/json_demo.cpython-38.pyc | Bin 0 -> 750 bytes
 verarbeitung/json_demo.py                |  33 ++++
 5 files changed, 150 insertions(+), 144 deletions(-)
 delete mode 100755 input/input_fj.py
 create mode 100644 verarbeitung/Processing.py
 create mode 100644 verarbeitung/__pycache__/input_fj.cpython-38.pyc
 create mode 100644 verarbeitung/__pycache__/json_demo.cpython-38.pyc
 create mode 100644 verarbeitung/json_demo.py

diff --git a/input/input_fj.py b/input/input_fj.py
deleted file mode 100755
index 00bb012..0000000
--- a/input/input_fj.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-"""
-Functions for information retrieval of articles from the ACS journal JCIM
-
-"""
-
-__author__ = "Florian Jochens"
-__email__ = "fj@andaco.de"
-__status__ = "Production"
-#__copyright__ = ""
-#__credits__ = ["", "", "", ""]
-#__license__ = ""
-#__version__ = ""
-#__maintainer__ = ""
-
-from bs4 import BeautifulSoup as bs
-import requests as req
-import sys
-from pathlib import Path
-
-class Publication:
-    #_registry = []
-    _citations = []
-
-    def __init__(self, title, publication_date, contributors, doi_url,
-                 subjects, num_citations):
-        #self._registry.append(self)
-        self.title = title
-        self.publication_date = publication_date
-        self.contributors = contributors
-        self.doi_url = doi_url
-        self.subjects = subjects
-        self.num_citations = num_citations
-
-class Citation:
-    def __init__(self, title, journal, contributors, doi_url):
-        self.title = title
-        self.journal = journal
-        self.contributors = contributors
-        self.doi_url = doi_url
-
-def get_article_info(soup):
-    header = soup.find('div', class_ = 'article_header-left pull-left')
-    article_title = header.find('span', class_ = 'hlFld-Title').text
-    publication_date = header.find('span', class_ = 'pub-date-value').text
-    for link in header.find('div', class_ = 'article_header-doiurl'):
-        doi_url = link.get('href')
-    subs = header.find('div', class_ = 'article_header-taxonomy')
-    subjects = []
-    for sub in subs.find_all('a'):
-        subjects.append(sub.get('title'))
-    cons = header.find('ul', class_ = 'loa')
-    contributors = []
-    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
-        contributors.append(con.text)
-    numc = header.find('div', class_ = 'articleMetrics_count')
-    if not numc.a:
-        num_citations = 0
-    else:
-        num_citations = numc.a.text
-
-    pub = Publication(article_title, publication_date, contributors, doi_url,
-                      subjects, num_citations)
-    return pub
-
-def get_download_url():
-    export = soup.find('div', class_ = 'cit-download-dropdown_content')
-    url = 'https://pubs.acs.org'
-    for link in export.find_all('a'):
-        if link.get('title') == 'Citation and references':
-            url += link.get('href')
-    print(url)
-    return url
-
-def download(url): # Download citation and references file
-    if url.find('='):
-        filename = url.rsplit('=', 1)[1]
-        path = Path(('./files/' + filename))
-        if path.is_file():
-            print("File already exists")
-        else:
-            print("File does not exist")
-
-def get_citation_info(pub, num_citations, soup):
-    pub._citations = []
-    details = soup.find('ol', class_ = 'cited-content_cbyCitation')
-    titles = []
-    for title in details.find_all('span',
-            class_ = 'cited-content_cbyCitation_article-title'):
-        titles.append(title.text.replace('.', ''))
-    journal_names = []
-    for name in details.find_all('span',
-            class_ = 'cited-content_cbyCitation_journal-name'):
-        journal_names.append(name.text)
-    doi_urls = []
-    for url in details.find_all('a'):
-        doi_urls.append(url.get('href'))
-    contributors = []
-    for contrib in details.find_all('span',
-            class_ = 'cited-content_cbyCitation_article-contributors'):
-        contributors.append(contrib.text)
-    for i in range(0, int(num_citations)):
-        pub._citations.append(Citation(titles[i], journal_names[i],
-                                       contributors[i], doi_urls[i]))
-
-def print_pub_info(pub):
-    print(f'''Article title: {pub.title}
-Publication date: {pub.publication_date}
-DOI-URL: {pub.doi_url}
-
-Subjects:''')
-    print(*(pub.subjects), sep = ", ")
-    print('\nContributors:')
-    print(*(pub.contributors), sep = ", ")
-
-    if int(pub.num_citations) > 0:
-        if int(pub.num_citations) == 1:
-            print(f'\nThis publication is cited by the following publication:\n')
-        else:
-            print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
-        for citation in pub._citations:
-            print(f'''
-    Title: {citation.title}
-    Journal: {citation.journal}
-    Contributors: {citation.contributors}
-    DOI-URL: {citation.doi_url}
-    ''')
-    else:
-        print('\nThis publication is not cited by any other publication.')
-
-def input(url):
-    html_text = req.get(url).text
-    soup = bs(html_text, 'html.parser')
-
-    pub = get_article_info(soup)
-    if int(pub.num_citations) > 0:
-        get_citation_info(pub, int(pub.num_citations), soup)
-    return pub
-
-#if len(sys.argv) != 2:
-#    sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
-#    exit(1)
-#url = sys.argv[1]
-#pub = input(url)
-#print_pub_info(pub)
diff --git a/verarbeitung/Processing.py b/verarbeitung/Processing.py
new file mode 100644
index 0000000..bfa533a
--- /dev/null
+++ b/verarbeitung/Processing.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  3 16:54:43 2021
+
+@author: Malte Schokolowski
+"""
+
+from bs4 import BeautifulSoup as bs
+import requests as req
+import sys
+from pathlib import Path
+from input_fj import input
+from json_demo import output_to_json
+
+
+def process_main(doi_input_array, depth):
+    # Error handling: the input array is empty.
+    if (len(doi_input_array) == 0):
+        print("Error, no input data")
+        return ([], [])
+
+    # Error handling: a negative number was passed as the search depth.
+    if (depth < 0):
+        print("Error, depth of search must not be negative")
+        return ([], [])
+
+    # Create an empty array for the nodes and an empty array for the edges.
+    global nodes, edges
+    nodes = []
+    edges = []
+
+    # Insert every publication from the input array into the node array (nodes).
+    # Iterate over a copy, because removing duplicates from a list while
+    # iterating over it would skip elements.
+    for pub_doi in list(doi_input_array):
+        pub = input(pub_doi)
+        not_in_nodes = True
+        for node in nodes:
+            if (pub.doi_url == node.doi_url):
+                not_in_nodes = False
+                break
+        if (not_in_nodes):
+            nodes.append(pub)
+        else:
+            doi_input_array.remove(pub_doi)
+
+    process_rec_depth(doi_input_array, 0, depth)
+
+    output_to_json(nodes, edges)
+    return (nodes, edges)
+
+
+def process_rec_depth(array, depth, depth_max):
+    # The depth is increased by 1 on every recursive call.
+    depth += 1
+
+    # Create a class object for every publication in the input array.
+    for pub_doi in array:
+        pub = input(pub_doi)
+
+        # For every citation stored in the publication's class instance,
+        # check whether it already exists as a node.
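+        # Note that input() can only fill _citations for pages it is able
+        # to scrape, which at the moment means pubs.acs.org article pages.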
+        for citation in pub._citations:
+
+            # If the citation does not yet exist in the node array (nodes) AND the
+            # maximum depth has not been reached, store it as a node in the node
+            # array. In addition, store the connection to the publication as a
+            # pair in the edge array (edges).
+            not_in_nodes = True
+            for node in nodes:
+                if (citation.doi_url == node.doi_url):
+                    not_in_nodes = False
+                    break
+            if (not_in_nodes):
+                if (depth <= depth_max):
+                    nodes.append(citation)
+                    edges.append([pub.doi_url, citation.doi_url])
+
+            # If the citation already exists in the node array, only store the
+            # connection to the publication as a pair in the edge array (edges).
+            else:
+                edges.append([pub.doi_url, citation.doi_url])
+
+        # If the maximum depth has not been reached yet, write all citations of
+        # this publication into an array and call the function again with it.
+        if (depth < depth_max):
+            cit_arr = []
+            for citation in pub._citations:
+
+                # For now, only citations with "acs" in the URL are kept, because
+                # we cannot extract the information from other sources yet.
+                if ("acs" in citation.doi_url):
+                    cit_arr.append(citation.doi_url)
+
+            # Recursive call of the function.
+            process_rec_depth(cit_arr, depth, depth_max)
+
+
+# Test run, because the module is not yet connected to the input component.
+arr = []
+arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
+arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
+arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
+#arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
+
+#arr.append('https://doi.org/10.1021/ci700007b')
+#arr.append('https://doi.org/10.1021/acs.jcim.5b00292')
+#url = sys.argv[1]
+#arr.append(url)
+
+nodes, edges = process_main(arr, 1)
+
+print("Nodes:\n")
+for node in nodes:
+    print(node.title, "\n")
+print("\nEdges:\n")
+for edge in edges:
+    print(edge, "\n")
\ No newline at end of file
diff --git a/verarbeitung/__pycache__/input_fj.cpython-38.pyc b/verarbeitung/__pycache__/input_fj.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb7f56fc9742c5d19e2fff3d15c51dc4d59e1b1a
GIT binary patch
literal 4341
[base85 payload omitted: compiled __pycache__ artifact]

literal 0
HcmV?d00001

diff --git a/verarbeitung/__pycache__/json_demo.cpython-38.pyc b/verarbeitung/__pycache__/json_demo.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..519d14366f17ebf12bfdc3f6ae2e985ad9d6d229
GIT binary patch
literal 750
[base85 payload omitted: compiled __pycache__ artifact]

literal 0
HcmV?d00001

diff --git a/verarbeitung/json_demo.py b/verarbeitung/json_demo.py
new file mode 100644
index 0000000..77ce148
--- /dev/null
+++ b/verarbeitung/json_demo.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+import json
+
+def output_to_json(V, E):
+    # Pack the node objects and edge pairs into a node-link dictionary
+    # and dump it as JSON.
+    list_of_node_dicts = list()
+    list_of_edge_dicts = list()
+    dict_of_all = dict()
+    for node in V:
+        new_dict = dict()
+        new_dict["name"] = node.title
+        new_dict["author"] = node.contributors
+        new_dict["year"] = node.publication_date
+        new_dict["doi"] = node.doi_url
+        list_of_node_dicts.append(new_dict)
+    for edge in E:
+        new_dict_2 = dict()
+        new_dict_2["source"] = edge[0]
+        new_dict_2["target"] = edge[1]
+        list_of_edge_dicts.append(new_dict_2)
+    dict_of_all["nodes"] = list_of_node_dicts
+    dict_of_all["links"] = list_of_edge_dicts
+    #return(dict_of_all)
+    with open('json_text.txt', 'w') as outfile:
+        json.dump(dict_of_all, outfile)
+
+
+#knoten = ["doi1", "doi2", "doi3"]
+#kanten = [[1,2],[3,4],[5,6]]
+#output_to_json(knoten,kanten)
--
GitLab
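
As a quick way to exercise the new export path without network access, here is a minimal sketch; StubPublication is a hypothetical stand-in for the Publication/Citation objects that input_fj.input builds, and the edge pair follows Processing.py's [pub.doi_url, citation.doi_url] convention:

#!/usr/bin/env python3
# Minimal offline check of json_demo.output_to_json (assumes json_demo.py
# is on the import path). StubPublication is a hypothetical stand-in for
# the objects that input_fj.input builds.
import json
from json_demo import output_to_json

class StubPublication:
    def __init__(self, title, contributors, publication_date, doi_url):
        self.title = title
        self.contributors = contributors
        self.publication_date = publication_date
        self.doi_url = doi_url

nodes = [
    StubPublication("Paper A", ["Author One"], "2020", "doi/a"),
    StubPublication("Paper B", ["Author Two"], "2021", "doi/b"),
]
# One edge, in Processing.py's [pub.doi_url, citation.doi_url] order.
edges = [["doi/a", "doi/b"]]

output_to_json(nodes, edges)

# output_to_json writes json_text.txt; the expected node-link shape is
# {"nodes": [...], "links": [{"source": "doi/a", "target": "doi/b"}]}.
with open('json_text.txt') as infile:
    print(json.load(infile)["links"])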