Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
CiS Projekt
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Ockenden, Samuel
CiS Projekt
Merge requests
!7
Main
Code
Review changes
Check out branch
Download
Patches
Plain diff
Merged
Main
bav1758/ci-s-projekt-verarbeitung:main
into
main
Overview
0
Commits
22
Pipelines
0
Changes
2
Merged
Große, Judith
requested to merge
bav1758/ci-s-projekt-verarbeitung:main
into
main
3 years ago
Overview
0
Commits
22
Pipelines
0
Changes
2
Expand
0
0
Merge request reports
Viewing commit
03baca25
Prev
Next
Show latest version
2 files
+
37
−
33
Inline
Compare changes
Side-by-side
Inline
Show whitespace changes
Show one file at a time
Files
2
Search (e.g. *.vue) (Ctrl+P)
03baca25
added groups to Processing and json
· 03baca25
Malte Schokolowski
authored
3 years ago
verarbeitung/Processing.py
+
219
−
89
Options
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 3 16:54:43 2021
Functions to generate a graph representing citations between multiple ACS/Nature journals
@author: Malte Schokolowski
"""
__authors__
=
"
Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski
"
__email__
=
"
cis-project2021@zbh.uni-hamburg.de
"
__status__
=
"
Production
"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from
bs4
import
BeautifulSoup
as
bs
import
requests
as
req
import
sys
from
pathlib
import
Path
from
input_fj
import
input
from
input_test
import
input_test_func
from
json_demo
import
output_to_json
# adds every publication from input list to graph structure
# doi_input_list: list of publication dois from user
def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
    """
    Add every publication from the user's doi input list to the graph and
    collect the first level of references and citations.

    Parameters
    ----------
    doi_input_list : list of str
        publication dois from the user; duplicate dois are removed from
        this list in place
    search_depth_max : int
        maximum search depth for references
    search_height_max : int
        maximum search height for citations
    test_var : bool
        True when called from a unit test (uses the offline test input)

    Returns
    -------
    tuple(list, list)
        (first-level reference publication objects,
         first-level citation publication objects)
    """
    references_pub_obj_list = []
    citations_pub_obj_list = []

    # Iterate over a copy: the original removed entries from
    # doi_input_list while iterating it, which silently skips the
    # element that follows every removed duplicate.
    for pub_doi in list(doi_input_list):

        # checks if it is a test and chooses the input function accordingly
        if (test_var):
            pub = input_test_func(pub_doi)
        else:
            pub = input(pub_doi)

        # checks if publication already exists in nodes
        not_in_nodes = True
        for node in nodes:
            if (pub.doi_url == node.doi_url):
                not_in_nodes = False
                break

        if (not_in_nodes):
            nodes.append(pub)
            pub.group = "input"
        else:
            # duplicate input doi: drop it from the user's list
            doi_input_list.remove(pub_doi)

        # inserts references as publication objects into list and
        # inserts first depth references into nodes/edges if maximum search depth > 0
        for reference in create_graph_structure_references(pub, 0, search_depth_max, test_var):
            references_pub_obj_list.append(reference)

        # inserts citations as publication objects into list and
        # inserts first height citations into nodes if maximum search height > 0
        for citation in create_graph_structure_citations(pub, 0, search_height_max, test_var):
            citations_pub_obj_list.append(citation)

    return (references_pub_obj_list, citations_pub_obj_list)
# adds edges between citation and reference group
def complete_inner_edges(test_var):
    """
    Connect the citation group with the reference group.

    For every node in the "depth" group, add an edge from each of its
    citations that is itself a known node; for every node in the
    "height" group, add an edge to each of its references that is a
    known node. Duplicate edges are skipped.

    Parameters
    ----------
    test_var : bool
        kept for interface symmetry with the other processing functions;
        not used inside this function
    """
    for node in nodes:
        if (node.group == "depth"):
            for citation in node.citations:
                candidate = [citation.doi_url, node.doi_url]
                for cit in nodes:
                    # only link citations that already exist as nodes
                    if (citation.doi_url == cit.doi_url and candidate not in edges):
                        edges.append(candidate)
        if (node.group == "height"):
            for reference in node.references:
                candidate = [node.doi_url, reference.doi_url]
                for ref in nodes:
                    # only link references that already exist as nodes
                    if (reference.doi_url == ref.doi_url and candidate not in edges):
                        edges.append(candidate)
# adds a node for every publication unknown
# adds edges for references between publications
def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
    """
    Insert unknown referenced publications as nodes and add the
    corresponding reference edges.

    Parameters
    ----------
    pub : publication object
        publication whose references are processed
    search_depth : int
        current search depth
    search_depth_max : int
        maximum search depth
    test_var : bool
        True when called from a unit test (uses the offline test input)

    Returns
    -------
    list
        newly created reference publication objects
    """
    references_pub_obj_list = []
    for reference in pub.references:

        # checks every reference for duplication
        already_known = False
        for node in nodes:
            if (reference.doi_url == node.doi_url):
                already_known = True
                break

        if not already_known:
            if (search_depth < search_depth_max):
                # checks if it is a test and chooses the input function accordingly
                fetch = input_test_func if test_var else input
                reference_pub_obj = fetch(reference.doi_url)
                reference_pub_obj.group = "depth"
                nodes.append(reference_pub_obj)
                edges.append([pub.doi_url, reference_pub_obj.doi_url])
                references_pub_obj_list.append(reference_pub_obj)

        # adds edge only if citation already exists
        elif [pub.doi_url, reference.doi_url] not in edges:
            edges.append([pub.doi_url, reference.doi_url])

    return references_pub_obj_list
# recursive function to implement depth-first-search on references
# references_pub_obj_list: input list of references as publication objects
# search_depth: current search_depth of depth-first-search
# search_depth_max: maximal search_depth for dfs
def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
    """
    Recursively expand the reference side of the graph one level at a time.

    Parameters
    ----------
    references_pub_obj_list : list
        reference publications of the current level
    search_depth : int
        current search depth
    search_depth_max : int
        maximum search depth
    test_var : bool
        True when called from a unit test (uses the offline test input)
    """
    # adds the next level to nodes/edges
    for pub in references_pub_obj_list:
        next_level = create_graph_structure_references(pub, search_depth, search_depth_max, test_var)

        # recurse with increased depth while the maximum has not been reached
        if (search_depth < search_depth_max):
            process_references_rec(next_level, search_depth + 1, search_depth_max, test_var)
# Leeres Array für die Knoten(nodes) wird erstellt.
# Leeres Array für die Kanten(edges) wird erstellt.
global
nodes
,
edges
nodes
=
[]
edges
=
[]
# Jede Publikation aus dem Input-Array wird in den Knoten-Array(nodes) eingefügt.
for
pub_doi
in
doi_input_array
:
pub
=
input
(
pub_doi
)
# adds a node for every publication unknown
# adds edges for citations between publications
def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
    """
    Insert unknown citing publications as nodes and add the corresponding
    citation edges. Mirror image of create_graph_structure_references.

    Parameters
    ----------
    pub : publication object
        publication whose citations are processed
    search_height : int
        current search height
    search_height_max : int
        maximum search height
    test_var : bool
        True when called from a unit test (uses the offline test input)

    Returns
    -------
    list
        newly created citation publication objects
    """
    citations_pub_obj_list = []
    for citation in pub.citations:

        # checks every citation for duplication.
        # NOTE: the reviewed version compared pub.doi_url against the node
        # before checking the citation (stale diff residue), so duplicate
        # citations were only detected while looking at pub's own node;
        # the check now matches the reference counterpart.
        not_in_nodes = True
        for node in nodes:
            if (citation.doi_url == node.doi_url):
                not_in_nodes = False
                break

        if (not_in_nodes):
            if (search_height < search_height_max):
                # checks if it is a test and chooses the input function accordingly
                if (test_var):
                    citation_pub_obj = input_test_func(citation.doi_url)
                else:
                    citation_pub_obj = input(citation.doi_url)
                citation_pub_obj.group = "height"
                nodes.append(citation_pub_obj)
                edges.append([citation_pub_obj.doi_url, pub.doi_url])
                citations_pub_obj_list.append(citation_pub_obj)

        # adds only edge if citation already exists
        elif [citation.doi_url, pub.doi_url] not in edges:
            edges.append([citation.doi_url, pub.doi_url])

    return citations_pub_obj_list
# recursive function to implement height-first-search on citations
# citations_pub_obj_list: input list of citations as publication objects
# search_height: current search_height of height-first-search
# search_height_max: maximal search_height for dfs
def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
    """
    Recursively expand the citation side of the graph one level at a time.

    Parameters
    ----------
    citations_pub_obj_list : list
        citation publications of the current level
    search_height : int
        current search height
    search_height_max : int
        maximum search height
    test_var : bool
        True when called from a unit test (uses the offline test input)
    """
    # adds the next level to nodes/edges
    for pub in citations_pub_obj_list:
        next_level = create_graph_structure_citations(pub, search_height, search_height_max, test_var)

        # recurse with increased height while the maximum has not been reached
        if (search_height < search_height_max):
            process_citations_rec(next_level, search_height + 1, search_height_max, test_var)
# main function to call. Needs as input:
# doi_input_list: input list of dois
# search_height: max search height to process to
# search_depth: max search depth to process to
# test_var: only needed for unit test as True, default is False
def process_main(doi_input_list, search_height, search_depth, test_var=False):
    """
    Build the citation graph for the given dois and save it as .json.

    Parameters
    ----------
    doi_input_list : list of str
        input list of dois
    search_height : int
        maximum search height for citations
    search_depth : int
        maximum search depth for references
    test_var : bool, optional
        True only for unit tests (uses the offline test input), default False

    Returns
    -------
    tuple(list, list)
        (nodes, edges); in unit-test mode nodes are returned as doi strings
        instead of publication objects
    """
    # ERROR-Handling doi_array = NULL
    if (len(doi_input_list) == 0):
        print("Error, no input data")

    # ERROR - if a negative number is entered for height
    if (search_height < 0):
        print("Error, search_height of search must be positive")

    # ERROR - if a negative number is entered for depth
    if (search_depth < 0):
        print("Error, search_depth of search must be positive")

    # create empty arrays for the nodes and edges of the graph
    global nodes, edges
    nodes = []
    edges = []

    # initializes nodes/edges from input and gets a list with publication
    # objects for citations and references returned
    references_obj_list, citations_obj_list = initialize_nodes_list(doi_input_list, search_depth, search_height, test_var)

    # function calls to begin recursive processing up to max depth/height
    process_citations_rec(citations_obj_list, 1, search_height, test_var)
    process_references_rec(references_obj_list, 1, search_depth, test_var)

    # adds edges between reference group and citation group of known publications
    complete_inner_edges(test_var)

    # calls a script to save nodes and edges of graph in .json file
    output_to_json(nodes, edges)

    # only for unit tests: return plain doi strings instead of objects.
    # NOTE: in the reviewed version this branch was placed after the
    # unconditional return below and therefore unreachable; it must
    # run before the final return.
    if (test_var == True):
        doi_nodes_list = []
        for node in nodes:
            doi_nodes_list.append(node.doi_url)
        return (doi_nodes_list, edges)

    return (nodes, edges)
def process_rec_depth(array, depth, depth_max):
    """
    Recursively walk the citations of every publication in *array* and
    record them as nodes and edges up to the maximum depth.

    Parameters
    ----------
    array : list of str
        publication dois for the current recursion level
    depth : int
        depth of the previous level (incremented on entry)
    depth_max : int
        maximum recursion depth
    """
    # the depth is increased by 1 on every recursive call
    depth += 1

    # a class object is created for every publication in the input array
    for pub_doi in array:
        pub = input(pub_doi)

        # for every citation stored in the publication's class instance,
        # check whether it already exists as a node
        for citation in pub._citations:
            # if the citation does not yet exist in the node array AND the
            # maximum depth has not been reached, store it as a node and
            # record the connection to the publication in the edge array
            not_in_nodes = True
            for node in nodes:
                if (citation.doi_url == node.doi_url):
                    not_in_nodes = False
                    break
            if (not_in_nodes):
                if (depth <= depth_max):
                    nodes.append(citation)
                    edges.append([pub.doi_url, citation.doi_url])
            # if the citation already exists as a node, only the connection
            # to the publication is stored in the edge array
            else:
                edges.append([pub.doi_url, citation.doi_url])

        # if the maximum depth has not been reached yet, collect all
        # citations of this publication and recurse with them
        if (depth < depth_max):
            cit_arr = []
            for citation in pub._citations:
                # currently only citations with "acs" in the URL are kept,
                # because information cannot be extracted from other sources
                if ("acs" in citation.doi_url):
                    cit_arr.append(citation.doi_url)
            # recursive call of the function
            process_rec_depth(cit_arr, depth, depth_max)
# a function to print nodes and edges from a graph
def print_graph(nodes, edges):
    """
    Print the title of every node, then every edge, to stdout.

    Parameters
    ----------
    nodes : list
        publication objects with a ``title`` attribute
    edges : list
        edges as [from_doi, to_doi] pairs
    """
    print("Knoten: \n")
    for n in nodes:
        print(n.title, "\n")
    print("\nKanten: \n")
    for e in edges:
        print(e, "\n")
# program test, because there is no connection to the input side yet
# NOTE(review): this driver calls process_main(arr, 1) with the old
# two-argument signature — confirm against the current process_main
# (doi_input_list, search_height, search_depth, test_var) before running.
arr = []
arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
#arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
#arr.append('https://doi.org/10.1021/ci700007b')
#arr.append('https://doi.org/10.1021/acs.jcim.5b00292')
#url = sys.argv[1]
#arr.append[url]

nodes, edges = process_main(arr, 1)

print("Knoten: \n")
for node in nodes:
    print(node.title, "\n")
print("\nKanten: \n")
for edge in edges:
    print(edge, "\n")
\ No newline at end of file
# program test, because there is no connection to UI yet.
def try_known_publications():
    """
    Build and print the graph for a small set of known publications.

    Fetches two hard-coded ACS dois via process_main (height 2, depth 2)
    and prints the resulting nodes and edges with print_graph.
    """
    doi_list = [
        'https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249',
        #'https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249',
        'https://doi.org/10.1021/acs.jmedchem.0c01332',
        #'https://doi.org/10.1021/acs.jcim.0c00741',
        #'https://doi.org/10.1021/ci700007b',
        #'https://doi.org/10.1021/acs.jcim.5b00292',
    ]
    #url = sys.argv[1]
    #doi_list.append(url)

    nodes, edges = process_main(doi_list, 2, 2)
    print_graph(nodes, edges)
\ No newline at end of file
Loading