From 7b8730bc6f6e630346904dea0c4677c15d6fe757 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gro=C3=9Fe=2C=20Judith?=
 <judith.grosse@studium.uni-hamburg.de>
Date: Sun, 14 Nov 2021 16:58:50 +0000
Subject: [PATCH] Main

---
 input/input_fj.py                             | 144 ------------------
 verarbeitung/Processing.py                    | 117 ++++++++++++++
 .../__pycache__/input_fj.cpython-38.pyc       | Bin 0 -> 4341 bytes
 .../__pycache__/json_demo.cpython-38.pyc      | Bin 0 -> 750 bytes
 verarbeitung/json_demo.py                     |  33 ++++
 5 files changed, 150 insertions(+), 144 deletions(-)
 delete mode 100755 input/input_fj.py
 create mode 100644 verarbeitung/Processing.py
 create mode 100644 verarbeitung/__pycache__/input_fj.cpython-38.pyc
 create mode 100644 verarbeitung/__pycache__/json_demo.cpython-38.pyc
 create mode 100644 verarbeitung/json_demo.py

diff --git a/input/input_fj.py b/input/input_fj.py
deleted file mode 100755
index 00bb012..0000000
--- a/input/input_fj.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-"""
-Functions for information retrieval of articles from the ACS journal JCIM
-
-"""
-
-__author__ = "Florian Jochens"
-__email__ = "fj@andaco.de"
-__status__ = "Production"
-#__copyright__ = ""
-#__credits__ = ["", "", "", ""]
-#__license__ = ""
-#__version__ = ""
-#__maintainer__ = ""
-
-from bs4 import BeautifulSoup as bs
-import requests as req
-import sys  
-from pathlib import Path
-
-class Publication:
-    #_registry = []
-    _citations = []
-    
-    def __init__(self, title, publication_date, contributors, doi_url, 
-                 subjects, num_citations):
-        #self._registry.append(self)
-        self.title = title
-        self.publication_date = publication_date
-        self.contributors = contributors
-        self.doi_url = doi_url
-        self.subjects = subjects
-        self.num_citations = num_citations
-
-class Citation:
-    def __init__(self, title, journal, contributors, doi_url):
-        self.title = title
-        self.journal = journal
-        self.contributors = contributors
-        self.doi_url = doi_url
-
-def get_article_info(soup):
-    header = soup.find('div', class_ = 'article_header-left pull-left')
-    article_title = header.find('span', class_ = 'hlFld-Title').text
-    publication_date = header.find('span', class_ = 'pub-date-value').text
-    for link in header.find('div', class_ = 'article_header-doiurl'):
-        doi_url = link.get('href')
-    subs = header.find('div', class_ = 'article_header-taxonomy')
-    subjects = []
-    for sub in subs.find_all('a'):
-        subjects.append(sub.get('title'))
-    cons = header.find('ul', class_ = 'loa')
-    contributors = []
-    for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'):
-        contributors.append(con.text)
-    numc = header.find('div', class_ = 'articleMetrics_count')
-    if not numc.a:
-        num_citations = 0
-    else:
-        num_citations = numc.a.text
-
-    pub = Publication(article_title, publication_date, contributors, doi_url,
-                      subjects, num_citations)
-    return pub
-
-def get_download_url():
-    export = soup.find('div', class_ = 'cit-download-dropdown_content')
-    url = 'https://pubs.acs.org'
-    for link in export.find_all('a'):
-        if link.get('title') == 'Citation and references':
-            url += link.get('href')     
-    print(url)
-    return url
-
-def download(url): # Download citation and references file
-    if url.find('='):
-        filename = url.rsplit('=', 1)[1]
-    path = Path(('./files/' + filename))
-    if path.is_file():
-        print("File already exists")
-    else:
-        print("File does not exist")
-
-def get_citation_info(pub, num_citations, soup):
-    pub._citations = []
-    details = soup.find('ol', class_ = 'cited-content_cbyCitation')
-    titles = [] 
-    for title in details.find_all('span', 
-            class_ = 'cited-content_cbyCitation_article-title'):
-        titles.append(title.text.replace('.', ''))
-    journal_names = []
-    for name in details.find_all('span',
-            class_ = 'cited-content_cbyCitation_journal-name'):
-        journal_names.append(name.text)
-    doi_urls = []
-    for url in details.find_all('a'):
-        doi_urls.append(url.get('href'))
-    contributors = []
-    for contrib in details.find_all('span', 
-            class_ = 'cited-content_cbyCitation_article-contributors'):
-        contributors.append(contrib.text)
-    for i in range(0, int(num_citations)):
-        pub._citations.append(Citation(titles[i], journal_names[i], 
-                              contributors[i], doi_urls[i]))
-def print_pub_info(pub):
-    print(f'''Article title:    {pub.title}
-Publication date: {pub.publication_date}
-DOI-URL:          {pub.doi_url}
-
-Subjects:''')
-    print(*(pub.subjects), sep = ", ")
-    print('\nContributors:')
-    print(*(pub.contributors), sep = ", ")
-
-    if int(pub.num_citations) > 0:
-        if int(pub.num_citations) == 1:
-            print(f'\nThis publication is cited by the following publication:\n')
-        else:
-            print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n')
-        for citation in pub._citations:
-            print(f'''
-    Title:        {citation.title}
-    Journal:      {citation.journal}
-    Contributors: {citation.contributors}
-    DOI-URL:      {citation.doi_url}
-            ''')
-    else:
-        print('\nThis publication is not cited by any other publication.')
-
-def input(url):
-    html_text = req.get(url).text
-    soup = bs(html_text, 'html.parser')
-    
-    pub = get_article_info(soup)
-    if int(pub.num_citations) > 0:
-        get_citation_info(pub, int(pub.num_citations), soup)
-    return pub
-
-#if len(sys.argv) != 2:
-#    sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0]))
-#    exit(1)
-#url = sys.argv[1]
-#pub = input(url)
-#print_pub_info(pub)
diff --git a/verarbeitung/Processing.py b/verarbeitung/Processing.py
new file mode 100644
index 0000000..bfa533a
--- /dev/null
+++ b/verarbeitung/Processing.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  3 16:54:43 2021
+
+@author: Malte Schokolowski
+"""
+
+from bs4 import BeautifulSoup as bs
+import requests as req
+import sys  
+from pathlib import Path
+from input_fj import input
+from json_demo import output_to_json
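+# Processing.py builds a citation graph: starting from a list of DOI URLs it
+# fetches each publication via input(), recursively collects the publications
+# citing it up to a given depth, and writes the resulting nodes and edges to
+# JSON via output_to_json().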
+
+
+
+def process_main(doi_input_array, depth):
+    # Error handling: abort if the input array is empty
+    if (len(doi_input_array) == 0):
+        print("Error, no input data")
+        return
+
+    # Error handling: abort if a negative number is given for the search depth
+    if (depth < 0):
+        print("Error, depth of search must not be negative")
+        return
+    
+
+    # Empty lists are created for the nodes and the edges of the graph.
+    global nodes, edges
+    nodes = []
+    edges = []
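+    # nodes will hold the Publication/Citation objects of the graph; edges will
+    # hold pairs of DOI URLs [pub_doi, citation_doi], linking a publication to a
+    # publication that cites it.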
+    
+    # Every publication from the input array is added to the node list (nodes).
+    # Iterate over a copy so that duplicates can be removed from the original list.
+    for pub_doi in list(doi_input_array):
+        pub = input(pub_doi)
+        not_in_nodes = True
+        for node in nodes:
+            if (pub.doi_url == node.doi_url):
+                not_in_nodes = False
+                break
+        if (not_in_nodes):
+            nodes.append(pub)
+        else:
+            doi_input_array.remove(pub_doi)
+
+    process_rec_depth(doi_input_array, 0, depth)
+
+    output_to_json(nodes,edges)
+    return(nodes,edges)
+    
+    
+def process_rec_depth(array, depth, depth_max):  
+    # The depth is increased by 1 on every recursive call.
+    depth += 1
+
+    # A class instance is created for every publication in the input array.
+    for pub_doi in array:
+        pub = input(pub_doi)
+
+        # For every citation stored in the corresponding class instance of the publication,
+        # check whether it already exists as a node.
+        for citation in pub._citations:
+
+            # If the citation does not yet exist in the node list (nodes) AND the maximum
+            # depth has not been reached yet, it is stored as a node in the node list. In
+            # addition, the connection to the publication is stored as a pair in the edge list (edges).
+            not_in_nodes = True
+            for node in nodes:
+                if (citation.doi_url == node.doi_url):
+                    not_in_nodes = False
+                    break
+            if (not_in_nodes):
+                if (depth <= depth_max):
+                    nodes.append(citation)
+                    edges.append([pub.doi_url,citation.doi_url])
+
+            # If the citation already exists in the node list, only the connection to the
+            # publication is stored as a pair in the edge list (edges).
+            else:
+                edges.append([pub.doi_url,citation.doi_url])
+            
+        # If the maximum depth has not been reached yet, all citations of the publication
+        # are collected in a list and the function is called again with that list.
+        if (depth < depth_max):
+            cit_arr = []
+            for citation in pub._citations:
+
+                # Currently only citations with "acs" in the URL are stored, because we
+                # cannot extract the information from other sources yet.
+                if ("acs" in citation.doi_url):
+                    cit_arr.append(citation.doi_url)
+
+            # Recursive call of the function.
+            process_rec_depth(cit_arr, depth, depth_max)
+            
+ 
+    
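+# Illustration of the depth parameter: with depth = 1, process_main collects the
+# input publications and their direct citations; with depth = 2 the citations of
+# those citations are added as well, and so on.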
+# Test run, because there is no connection to the input module yet.
+arr = []
+arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
+arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
+arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
+#arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
+
+#arr.append('https://doi.org/10.1021/ci700007b')
+#arr.append('https://doi.org/10.1021/acs.jcim.5b00292')
+#url = sys.argv[1]
+#arr.append[url]
+
+nodes,edges = process_main(arr,1)
+
+print("Knoten:\n")
+for node in nodes:
+    print(node.title, "\n")
+print("\nKanten:\n")
+for edge in edges:
+    print(edge,"\n")
\ No newline at end of file
diff --git a/verarbeitung/__pycache__/input_fj.cpython-38.pyc b/verarbeitung/__pycache__/input_fj.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb7f56fc9742c5d19e2fff3d15c51dc4d59e1b1a
GIT binary patch
literal 4341
zcmWIL<>g{vU|?AJttZh-n1SIjh=Yuo85kHG7#J9eI~W)kQW&BbQW&EcQ<$QdQkbKd
zQ&>`1b69d&qu3Z3Qdm;ha@eCdz%*wRX9`OSdk$AFcN8}xgF8bCM+#>PLkedzQxuOo
zLkd?4cMC%bcQaEIZwgNeZwo^dUkYCee+xqte~LhgU<*T(K#EX`a0^3}V2VhJXbVG>
zP>NWJcnd?6a0**6gQi5450_hMUUErheqOOcT7Hp2W?ou;QEnngK%pqLq$o4BEHOtR
zKTRRAs3bEvC$(51ttdZNp(G<!!O=NbAuGSMC@(Qb!OPjxmy4^4-z_J<C^Ion!7D#G
zBQ>wMiYG10Au%r{F*#o^B{hmGpeR436l~i|P~d1X-r{viO)M?ROe@U^&Mz(S(`3BG
z5|CJukqnYX#!RsAlVD(ANM(p(Oks#(N@s{-PGL-8YGH_CNns9V&}6yA9Z;H-lbH<m
zKr%a$2_QDe5NA*%XfQA^)G*dC#52?|)iA^})-cyF#52{f)G)*|*Ra+w#Iw|})iA`f
zrZ5CEtYq}lWV^*$l39|IdP|@HVSIc_VoB;Pp5*+zlA_F{(vtk5;#=$~`I+&hMLD-P
zic6ESQj<%HZ}H}p=Ef&ymVlj9tjT_hr8qSwt%!|*fuRWGs3Hy!ixWhEl28#hC>b#n
z@iH(l{0eloiU}=FEh>)jP0T4tjd9N`iE+*h)(t4i&q~cMiAm1XE!Hi7aCFO3ixP{H
zQZq|R^U`BbIGK3`r6uucS$YMPw>aYCGxIV_;^QSj0l@<bA!as47_1UO3O_xVQa?>r
zkiNXc+|>B^TU_z+x%nxjIUqJqe0*VPVh&se?$08AkcU8?0Q&?&@PWiw7#J8tL3V=}
z984@sMIZ+52p2($aA;&RLnE8R85*p~tRR0tF^CP1X&F2*4T{|&kZMg9aCF>ahbCu4
zlxVURfr1DuT?BG9LJHy~un)oh0(n>k<S$TsGC)JLN)YBmq`+h;5&#8+AcznG5g@Hd
z#v#lE`CS=gCdfY=j71=RG9%QVAPS@tl!(Bo`US|Z3|S1IP)cF$WvXSYVax&*D6Ak}
z7Go_Fl+Om`GlThzDeNG=3qvelElUl{0;Uv>g^abVg*H%GPLOOZTRKB6dkxzH<`k}l
zj4lkZBDEYf><d_GI2JIaa4%$B$jHc0m;u(olfnzu!I{ob%T>df1uEY7Kq4*-u>!T+
zHCzi=YPho?I_80OFsAT><ZF3qc(PcFPo)Ud^5$`X*jbFVyd`WkOf}3kTs5pU>@~d2
zY_)tfe8CKwf_}G{Q!>k{WT6F9d`4<wN@|gAPHI|-LP2Rx4w$u)@fKThPGWI!{4JK^
zg2cQk?u;C_oD|&<P;Rc`D=1CU1?6Jhvc#Oy)GARlvr_UiL3x=aqbN14N*qnHB(Wkt
zFF&{P7GokfdE8<u&52^p$xo~jhS=c@DHt3}OEU6{qC}wDeL;n4a&dfeeraAw5hyM}
zMM7F;Udk<&lGKWlTg>UHCAT<00`ZACIk(sn3kp*6Qi`NO2?dl>H2H6_6z7)~++u_H
zIEoi)ay+<*0+%VbSaLG+vWqww7#MD`6qhCy7lDeFTg=6!Nw-*%^Ye<q$@ms?a(-SE
zOI~SiGPu;a#avLDgp>|J$@-Q+dTL2L%mz^Hpan{gI-tbI$j2zaD8MMh$iv9T$im3S
zC<LZKBp)dKGjT9+F)}eQ{byk+0_j4@<Dd!=l=r|X*NcIHp@boe5uS1xQdnvkYZw+V
zrLclin?fy94buYV6t;zoDeT!yMKU#vU_Qq}MstQ*#zL7AmKw%pMn;Aj#$X0bPQM~C
z1_p*I+2qU;-IV<Dyqx^R6y21f`~nas9#mqd=9LtQgVcy*l#~<{Tj}c;lqMDHB_<c^
z<rk$xLK{>>SBb+iutH*9ib7FpT53^hUUF)2krV?1gC=W{5-46|L4*Q`kOu`hYe7+F
zUWq32Ew<E(g8ZVATg;_JIYpqt0c+@j90hVAsN!$}g)S)I8Ngx52M%Kn#v(li1_qRz
z4+=j}asY>)3pnR5U`SzH$XFy*!>|Axk_(wqm?Rl$8B3T_m}?lt8Ja;Q0Mi2Ig$zX&
zB`hf{pzzA$C}Bxq1+hS-p(dN(FGky!pj27Kp{JjgnUh+qUnT68nUkuJm{XLRm{O^b
zT9H{?QVbPL$xkg-$jdK*h-$JHfkNUITTyXAPG(6F$c?wyGmGOv1{G<80*tvx7ZgGq
zAQ4claf_uOu_OaMSU6xo;tdK8PzlVy$i*nY$iv9N$i-A-3JMI6;h<Cq!k`ipl-R+6
z@rr?gA)NtIxg#Yh=5&TymKx?PCQxe&RNS~Q#7fk%*03yKu3^n$S-_gYmcqV}30#n=
zr8CsBLDh0V)e1n0zZ&)|7O<M2bcR}v8s-H|DV*RWDpJc?!?A$5hI0WMsE7osDFPRP
z5Iecy=75So<{ECOIkQUGQ+P@^K*_h6$%P?StCpvRA&X%Fa|ve+OAU`CLk%04<N}i%
zAhH?af?eP!<Mq46l%G>2nVeaYnxYF$+wsXsm9T^hPUl6Epv10@QxPnS>w*iBTa0?Q
z7^>88tA<vTx}Ye8WEN0%sM5o&16qWD9SCbKL5e@FB4JR@0i~QGa2`?wapgeCguN)W
zASW?7wMZVsV=YR|OHaMUoS9bwt_d}Hi!?z>K`kb5KD@=Al3J3OnNxg=4eX6#aKRkK
z3w3Qg$hF0{SU^-12ehSD3{H@c@{k=AdPSK@w-__gQ=}lM7=*QM;z8wL2q<Y9fl?nM
z8#u4BF*5z<ViIFyVT7<?Wg>zv#3;hZ#=!Q!$Ptu)LAea19fU#kJIEX01bhr!8m2JT
zFk~^#W=LVGVaQ^d&5*)e!;r;1n<0f|E;}f{mN3*XWU-{Mrm$r)_AxRtlrW?q#95KV
z*=iWF*qa$sII>xac9p<*oY_o81tknATp+XZgi07vxIyY>Go<j$Wd@1WFl2GKFvRNB
zGL|r;@YXP9f!xAZ!<fZ6n<0h2hB1qEHbaU)4I|i1f^#`RdI~%8I7%2&gg|U?;uQ9)
z5^{v}92CG&Vx^#<pim{uh1ASc0M)=&AVC*@Pu<WUAFvvbIt7I)ZZ581XiM7a7ISfG
zK^2pZLKPnuv~mTvx~;(JsK@}6!Y#Q%GBS%5kecZVnZ*j=WT}u;3GTO~<>%z&muKds
zBdfCFs?s4vYZX7T?qVyhDq${=lfexOxCfwuUXWfAR2V9V;scQHLE2G6po$l4CP)H=
zt1Pg(5LEoaJ(!qRsgPfiky?c8JUvY=aOn%q4&Vj?xM=`x6o6BHku^LYf%77$rK`yV
z&Rw@SV5uHGOYnhfx%h(8ByfJn1my=%HO0Wl!^rZVg^A@K7pnjx2crNJs21a55@O_H
zWP!@CFthw+VPW~h$6f@gyHIL2P^|{S;Jg5;4@(#pFxD_EWXxi!Wh`N?VN79cW~yZ>
zVX0wiW~^l{VXa}#VrynhVajGHvMXV)fs52IH8V3Z)POpM%zjne86~+ndIgC^#i>Q$
z1f$7ui@7MZ5L~ttfr}faq+)PG1t~Z{IaQMdTt-B3f=q}9HPXPP4LEqv1BDgb*J=WV
z1*F>IVdP@s0Jn{b+(8wf$SwBx_>}x)NJlCW+&GMnzr`6JpPHMPnFA3nE=epYEsl@B
z#hg@ZQUod-irhdYAV*PZVQFe{NpX=Yh|64DS$vBfRDtGXCV^{Pa9;{sI6^!EiVtvg
z2`+NLVGVW$lCwc|XA!8yaf`zy7u>J216f!M3K14o7EqM2gX&!lMjj?UK0bC12@XyU
JP7YBHNdSGl8^Hho

literal 0
HcmV?d00001

diff --git a/verarbeitung/__pycache__/json_demo.cpython-38.pyc b/verarbeitung/__pycache__/json_demo.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..519d14366f17ebf12bfdc3f6ae2e985ad9d6d229
GIT binary patch
literal 750
zcmWIL<>g{vU|_gqJTWnfnStRkh=Yuo7#J8F7#J9e6&M&8QW#Pga~PsPG*b>^E>jc}
zBSQ*v3QG$^6mtq=FoP!ROOReaO~zZSnRx}JCCMP|AR!P2S;Wb}z~BrrZUqAaLkUAO
zLoE}SWG-QBW~gPUVQ^uH)u?3!^Vn)wvzThwQW$L*YFM+FYuHklY#3^o7O>Q?Eo5Y5
zC={z<bYY0qsbvSN=BQy$VU}d5;YeYzVW?qGVFk0<Y#3^o7qHZDfOWXkFx9Z6u!F>E
zSW-A_7)n@EI8(TqnfirlIZN0Uu-CBEa4uwGWC&+4We8*lVu)Z!VF+f><o3J8l9!m9
zdW$(FKeLDt6kcq_`K3k4sV^a%lEk9))RJ4QdHE@+#kW{<GV`*Ft9Y}D^Yh|MQY%XI
zN-9cjF_vp`++xYeEH1gll9HKRa*MSjvm__=7JEv5W_)Q;&MmgYf`Zh%lv^zM1*v(r
zSU|dOv80sd7HD$aVhp>*=z2>Cq&q%8Egob~Jjjq@gji}ydMZ?m8zco%mza}tiv^_k
z7DrxcIau}<3rOKDE*L-F=oWi^X-QgUPU=dAB3T9ohF?L>RxzQ)sYS&xzKJ;{sWI-E
zB{9yK!MXuO`B|yiB{9jFy2ZK$5RPtHYEfcQQfg*NX<m8^3J2tY_>|P#e7%CoTYUMY
zB?YA=@g@23AaPbu8c+hI3Pv_C<X~iD<YMGv;$Z}_co;#DgOP)Yhp|YWfq_AjrAQDI
r>>S`^6`z(>1WH+8DX^Q72syAio80`A(wtN~kg{SH1_lNWMjj>r?i$Hd

literal 0
HcmV?d00001

diff --git a/verarbeitung/json_demo.py b/verarbeitung/json_demo.py
new file mode 100644
index 0000000..77ce148
--- /dev/null
+++ b/verarbeitung/json_demo.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+import json
+from input_fj import input
+
+def output_to_json(V,E):
+    list_of_node_dicts = list()
+    list_of_edge_dicts = list()
+    dict_of_all = dict()
+    for node in V:
+        new_dict = dict()
+        new_dict["name"] = node.title
+        new_dict["author"] = node.contributors
+        new_dict["year"] = node.publication_date
+        new_dict["doi"] = node.doi_url
+        
+        
+        list_of_node_dicts.append(new_dict)
+    for edge in E:
+        new_dict_2 = dict()
+        new_dict_2["source"] = edge[0]
+        new_dict_2["target"] = edge[1]
+        list_of_edge_dicts.append(new_dict_2)
+    dict_of_all["nodes"] = list_of_node_dicts
+    dict_of_all["links"] = list_of_edge_dicts
+    #return(dict_of_all)
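+    # Shape of the resulting JSON file (illustrative, with placeholder values):
+    # {"nodes": [{"name": "...", "author": [...], "year": "...", "doi": "..."}],
+    #  "links": [{"source": "...", "target": "..."}]}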
+    with open('json_text.txt','w') as outfile:
+        json.dump(dict_of_all, outfile)
+  
+
+#knoten = ["doi1", "doi2", "doi3"]
+#kanten = [[1,2],[3,4],[5,6]]
+#output_to_json(knoten,kanten)
+
-- 
GitLab