From 45b197458fac038869f7a18ae0d1c03e4710bcdc Mon Sep 17 00:00:00 2001
From: Malte Schokolowski <baw8441@uni-hamburg.de>
Date: Thu, 2 Dec 2021 22:27:41 +0100
Subject: [PATCH] added Processing_pub_objs_only.py, which adds only objects of
 type publication to the nodes list. It provides the same functions as
 Processing.py.
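
A minimal usage sketch (assuming the verarbeitung directory is on the import
path; the DOI below is only a placeholder taken from the commented test data):

    from Processing_pub_objs_only import process_main

    # two levels of citations (height) and two levels of references (depth);
    # writes the graph to JSON and returns the node DOIs and the edge list
    nodes, edges = process_main(["https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249"], 2, 2)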

---
 verarbeitung/Processing_pub_objs_only.py | 255 +++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 verarbeitung/Processing_pub_objs_only.py

diff --git a/verarbeitung/Processing_pub_objs_only.py b/verarbeitung/Processing_pub_objs_only.py
new file mode 100644
index 0000000..a6c1ed3
--- /dev/null
+++ b/verarbeitung/Processing_pub_objs_only.py
@@ -0,0 +1,255 @@
+# -*- coding: utf-8 -*-
+"""
+Functions to generate a graph representing citations between multiple ACS/Nature journals
+
+"""
+
+__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
+__email__ = "cis-project2021@zbh.uni-hamburg.de"
+__status__ = "Production"
+#__copyright__ = ""
+#__credits__ = ["", "", "", ""]
+#__license__ = ""
+#__version__ = ""
+#__maintainer__ = ""
+
+from bs4 import BeautifulSoup as bs
+import requests as req
+import sys  
+from pathlib import Path
+from input_fj import input
+from input_test import input_test_func
+from json_demo import output_to_json
+
+# adds every publication from the input list to the graph structure
+# doi_input_list: list of publication DOIs from the user
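+# search_depth_max: maximum depth to which references are followed
+# search_height_max: maximum height to which citations are followed
+# test_var: if True, input_test_func is used instead of input to fetch publications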
+def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
+    references_pub_obj_list = []
+    citations_pub_obj_list = []
+
+    # iterates over a copy so that removing duplicate DOIs below does not skip entries
+    for pub_doi in list(doi_input_list):
+
+        # checks if this is a test run and chooses the input function accordingly
+        if(test_var):
+            pub = input_test_func(pub_doi)
+        else:
+            pub = input(pub_doi)
+
+        # checks if the publication already exists in nodes
+        not_in_nodes = True
+        for node in nodes:
+            if (pub.doi_url == node.doi_url):
+                not_in_nodes = False
+                break
+        if (not_in_nodes):
+            nodes.append(pub)
+            pub.group = "input"
+        else:
+            # duplicate input DOI, remove it from the input list
+            doi_input_list.remove(pub_doi)
+
+        # inserts references as publication objects into list and 
+        # inserts first depth references into nodes/edges if maximum search depth > 0
+        for reference in create_graph_structure_references(pub, 0, search_depth_max, test_var):
+            references_pub_obj_list.append(reference)
+
+        # inserts citations as publication objects into list and 
+        # inserts first height citations into nodes if maximum search height > 0
+        for citation in create_graph_structure_citations(pub, 0, search_height_max, test_var):
+            citations_pub_obj_list.append(citation)
+
+    return(references_pub_obj_list, citations_pub_obj_list)
+        
+    
+# adds edges between citation and reference group
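+# for every "depth" node, adds an edge from each of its citations that is already a node;
+# for every "height" node, adds an edge to each of its references that is already a node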
+def complete_inner_edges(test_var):
+    for node in nodes:
+        if (node.group == "depth"):
+            for citation in node.citations:
+                if (citation in nodes and [citation.doi_url, node.doi_url] not in edges):
+                    edges.append([citation.doi_url, node.doi_url])
+        if (node.group == "height"):
+            for reference in node.references:
+                if (reference in nodes and [node.doi_url, reference.doi_url] not in edges):
+                    edges.append([node.doi_url,reference.doi_url])
+
+
+
+
+# adds a node for every referenced publication that is not yet known
+# adds edges for references between publications
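+# pub: publication object whose references are processed
+# search_depth: current depth of the search
+# search_depth_max: maximum search depth; references beyond it are not added as new nodes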
+def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
+    references_pub_obj_list = []
+    for reference in pub.references:
+        not_in_nodes = True
+        for node in nodes:
+            # checks every reference for duplication 
+            if (reference.doi_url == node.doi_url):
+                not_in_nodes = False
+                break
+        if (not_in_nodes):
+            if (search_depth < search_depth_max):
+
+                # checks if this is a test run and chooses the input function accordingly
+                if (test_var):
+                    reference_pub_obj = input_test_func(reference.doi_url)
+                else:
+                    reference_pub_obj = input(reference.doi_url)
+
+                reference_pub_obj.group = "depth"
+                nodes.append(reference_pub_obj)
+                edges.append([pub.doi_url,reference_pub_obj.doi_url])
+                references_pub_obj_list.append(reference_pub_obj)
+
+        # adds only the edge if the reference already exists as a node
+        elif [pub.doi_url,reference.doi_url] not in edges:
+            edges.append([pub.doi_url,reference.doi_url])  
+    return references_pub_obj_list 
+
+
+# recursive function to implement depth-first search on references
+# references_pub_obj_list: input list of references as publication objects
+# search_depth: current depth of the depth-first search
+# search_depth_max: maximum depth of the depth-first search
+def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):  
+    # adds next level to nodes/edges
+    for pub in references_pub_obj_list:
+        new_reference_pub_obj_list = create_graph_structure_references(pub, search_depth, search_depth_max, test_var)    
+
+        # if the maximum depth has not yet been reached, calls the function recursively with increased depth
+        if (search_depth < search_depth_max):
+            process_references_rec(new_reference_pub_obj_list, search_depth+1, search_depth_max, test_var)
+
+
+
+    
+# adds a node for every citing publication that is not yet known
+# adds edges for citations between publications
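+# pub: publication object whose citations are processed
+# search_height: current height of the search
+# search_height_max: maximum search height; citations beyond it are not added as new nodes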
+def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
+    citations_pub_obj_list = []
+    for citation in pub.citations:
+        not_in_nodes = True
+        for node in nodes:
+            # checks every citation for duplication 
+            if (citation.doi_url == node.doi_url):
+                not_in_nodes = False
+                break
+        if (not_in_nodes):
+            if (search_height < search_height_max):
+
+                # checks if this is a test run and chooses the input function accordingly
+                if (test_var):
+                    citation_pub_obj = input_test_func(citation.doi_url)
+                else:
+                    citation_pub_obj = input(citation.doi_url)
+
+                citation_pub_obj.group = "height"
+                nodes.append(citation_pub_obj)
+                edges.append([citation_pub_obj.doi_url,pub.doi_url])
+                citations_pub_obj_list.append(citation_pub_obj)
+
+        # adds only the edge if the citation already exists as a node
+        elif [citation.doi_url,pub.doi_url] not in edges:
+            edges.append([citation.doi_url,pub.doi_url])   
+    return citations_pub_obj_list
+
+
+
+# recursive function to implement the height-first search on citations
+# citations_pub_obj_list: input list of citations as publication objects
+# search_height: current height of the height-first search
+# search_height_max: maximum height of the height-first search
+def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):  
+    # adds next level to nodes/edges
+    for pub in citations_pub_obj_list:
+        new_citation_pub_obj_list = create_graph_structure_citations(pub, search_height, search_height_max, test_var)   
+
+        # if the maximum height has not yet been reached, calls the function recursively with increased height
+        if (search_height < search_height_max):
+            process_citations_rec(new_citation_pub_obj_list, search_height+1, search_height_max, test_var)
+
+
+
+
+           
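+# main function that builds the citation graph for the given input DOIs
+# doi_input_list: list of publication DOIs from the user
+# search_height: maximum height (citation levels) of the search
+# search_depth: maximum depth (reference levels) of the search
+# test_var: if True, input_test_func is used instead of input to fetch publications
+# writes the graph to JSON and returns the list of node DOIs and the list of edges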
+def process_main(doi_input_list, search_height, search_depth, test_var = False):
+    # error handling if the input list is empty
+    if (len(doi_input_list) == 0):
+        print("Error, no input data")
+
+    # error handling if a negative number is entered for the height
+    if (search_height < 0):
+        print("Error, search_height must not be negative")
+
+    # error handling if a negative number is entered for the depth
+    if (search_depth < 0):
+        print("Error, search_depth must not be negative")
+
+    # create empty lists for the nodes and the edges
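+    # note: nodes and edges are kept as module-level globals so that the helper
+    # functions above can append to them without passing the lists around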
+    global nodes, edges
+    nodes = []
+    edges = []
+
+    # initializes nodes/edges from the input and returns lists of publication objects for the references and citations
+    references_obj_list, citations_obj_list = initialize_nodes_list(doi_input_list,search_depth, search_height, test_var)
+    process_citations_rec(citations_obj_list, 1, search_height, test_var)
+    process_references_rec(references_obj_list, 1, search_depth, test_var)
+    complete_inner_edges(test_var)
+
+    output_to_json(nodes,edges)
+
+    # only for internal testing
+    doi_nodes = []
+    for node in nodes:
+        doi_nodes.append(node.doi_url)
+    return(doi_nodes,edges)
+
+
+
+    
+# a function to print nodes and edges from a graph
+def print_graph(nodes, edges):
+    print("Knoten:\n")
+    for node in nodes:
+        print(node.title, "\n")
+    print("\nKanten:\n")
+    for edge in edges:
+        print(edge,"\n")
+   
+
+# function to test cycles
+def test_cycle():
+    arr = []
+    arr.append('doiz1')
+    #arr.append('doiz2')
+
+    nodes,edges = process_main(arr,1,1,True)
+
+    print(nodes, edges)
+
+    print_graph(nodes, edges)
+    
+# manual test for the program, since there is no connection to the input module yet
+def test_print():
+    arr = []
+    #arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
+    #arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
+    #arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
+    #arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
+
+    #arr.append('https://doi.org/10.1021/ci700007b')
+    #arr.append('https://doi.org/10.1021/acs.jcim.5b00292')
+    #url = sys.argv[1]
+    #arr.append[url]
+
+
+    nodes,edges = process_main(arr,2,2,True)
+
+    print_graph(nodes, edges)
+
+#test_print()
+#test_cycle()
+#print(process_main(['doiz1'],1,1,True))
+#print(process_main(['doi1'],0,0,True))
+
+        
\ No newline at end of file
-- 
GitLab