Skip to content
Snippets Groups Projects

Main

Merged Große, Judith requested to merge bav1758/ci-s-projekt-verarbeitung:main into main
2 files
+ 37
33
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 219
89
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 3 16:54:43 2021
Functions to generate a graph representing citations between multiple ACS/Nature journals
@author: Malte Schokolowski
"""
__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
__email__ = "cis-project2021@zbh.uni-hamburg.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
from input_fj import input
from input_test import input_test_func
from json_demo import output_to_json
def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
    """Add every input publication to the graph and expand its first level.

    Args:
        doi_input_list: list of publication DOIs from the user. A DOI whose
            publication is already present in the global ``nodes`` list is
            removed from this list in place.
        search_depth_max: maximum search depth, forwarded to
            ``create_graph_structure_references``.
        search_height_max: maximum search height, forwarded to
            ``create_graph_structure_citations``.
        test_var: if truthy, publications are loaded with ``input_test_func``
            instead of ``input``.

    Returns:
        Tuple ``(references_pub_obj_list, citations_pub_obj_list)`` with the
        publication objects returned by ``create_graph_structure_references``
        and ``create_graph_structure_citations`` for all input publications.
    """
    references_pub_obj_list = []
    citations_pub_obj_list = []

    # Iterate over a snapshot: duplicates are removed from doi_input_list
    # below, and removing from a list while iterating over it would skip the
    # element that follows the removed one.
    for pub_doi in list(doi_input_list):
        # `input` is the function imported from input_fj, not the builtin.
        if test_var:
            pub = input_test_func(pub_doi)
        else:
            pub = input(pub_doi)

        # a publication is only added once; repeated DOIs are dropped from the input list
        if any(pub.doi_url == node.doi_url for node in nodes):
            doi_input_list.remove(pub_doi)
        else:
            nodes.append(pub)
            pub.group = "input"

        # level-0 expansion; new nodes/edges are only inserted if the maximum depth/height is > 0
        references_pub_obj_list.extend(
            create_graph_structure_references(pub, 0, search_depth_max, test_var))
        citations_pub_obj_list.extend(
            create_graph_structure_citations(pub, 0, search_height_max, test_var))

    return (references_pub_obj_list, citations_pub_obj_list)
def complete_inner_edges(test_var):
    """Add the missing edges between the reference group and the citation group.

    For every node of group "depth", each of its citations that is itself a
    known node gets an edge ``[citation, node]``. For every node of group
    "height", each of its references that is a known node gets an edge
    ``[node, reference]``. Edges already present in ``edges`` are not
    duplicated.

    Args:
        test_var: not used by this function.
    """
    def is_known(doi_url):
        # a publication is "known" if some node carries the same doi_url
        return any(doi_url == known.doi_url for known in nodes)

    for node in nodes:
        if node.group == "depth":
            for citation in node.citations:
                candidate = [citation.doi_url, node.doi_url]
                if is_known(citation.doi_url) and candidate not in edges:
                    edges.append(candidate)
        if node.group == "height":
            for reference in node.references:
                candidate = [node.doi_url, reference.doi_url]
                if is_known(reference.doi_url) and candidate not in edges:
                    edges.append(candidate)
def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
    """Insert the references of ``pub`` into the graph.

    A reference that is not yet a node is loaded, tagged with group "depth"
    and added to ``nodes`` together with an edge ``[pub, reference]`` - but
    only while ``search_depth < search_depth_max``. A reference that already
    is a node only gets the edge, unless that edge exists already.

    Args:
        pub: publication object whose ``references`` are processed.
        search_depth: current search depth.
        search_depth_max: maximum search depth.
        test_var: if truthy, references are loaded with ``input_test_func``
            instead of ``input``.

    Returns:
        List of the publication objects that were newly added as nodes.
    """
    added_references = []
    may_expand = search_depth < search_depth_max

    for reference in pub.references:
        # duplicate check against the current state of nodes
        if any(reference.doi_url == node.doi_url for node in nodes):
            link = [pub.doi_url, reference.doi_url]
            if link not in edges:
                edges.append(link)
            continue

        if not may_expand:
            continue

        # `input` is the function imported from input_fj, not the builtin
        load_publication = input_test_func if test_var else input
        reference_pub_obj = load_publication(reference.doi_url)
        reference_pub_obj.group = "depth"
        nodes.append(reference_pub_obj)
        edges.append([pub.doi_url, reference_pub_obj.doi_url])
        added_references.append(reference_pub_obj)

    return added_references
def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
    """Recursively follow references until the maximum search depth is reached.

    Args:
        references_pub_obj_list: references of the current level as
            publication objects.
        search_depth: current search depth.
        search_depth_max: maximum search depth.
        test_var: forwarded to ``create_graph_structure_references``.
    """
    go_deeper = search_depth < search_depth_max

    for pub in references_pub_obj_list:
        # adds the next level of this publication to nodes/edges
        next_level = create_graph_structure_references(
            pub, search_depth, search_depth_max, test_var)

        # descend into the newly added references while the maximum depth is not reached
        if go_deeper:
            process_references_rec(next_level, search_depth + 1, search_depth_max, test_var)
# An empty array for the nodes is created.
# An empty array for the edges is created.
# NOTE(review): the statements below use doi_input_array, which is not defined
# at this point in the visible source - they look like leftovers of an earlier
# process_main body; confirm before relying on them.
global nodes, edges
nodes = []
edges = []
# Every publication from the input array is inserted into the node array (nodes).
for pub_doi in doi_input_array:
pub = input(pub_doi)
def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
    """Insert the citations of ``pub`` into the graph.

    A citation that is not yet a node is loaded, tagged with group "height"
    and added to ``nodes`` together with an edge ``[citation, pub]`` - but
    only while ``search_height < search_height_max``. A citation that already
    is a node only gets the edge, unless that edge exists already.

    Args:
        pub: publication object whose ``citations`` are processed.
        search_height: current search height.
        search_height_max: maximum search height.
        test_var: if truthy, citations are loaded with ``input_test_func``
            instead of ``input``.

    Returns:
        List of the publication objects that were newly added as nodes.
    """
    citations_pub_obj_list = []

    for citation in pub.citations:
        # checks every citation for duplication (the citation, not pub itself,
        # is compared against the known nodes)
        not_in_nodes = all(citation.doi_url != node.doi_url for node in nodes)

        if not_in_nodes:
            if search_height < search_height_max:
                # `input` is the function imported from input_fj, not the builtin
                if test_var:
                    citation_pub_obj = input_test_func(citation.doi_url)
                else:
                    citation_pub_obj = input(citation.doi_url)

                citation_pub_obj.group = "height"
                nodes.append(citation_pub_obj)
                edges.append([citation_pub_obj.doi_url, pub.doi_url])
                citations_pub_obj_list.append(citation_pub_obj)

        # adds only the edge if the citation already exists as a node
        elif [citation.doi_url, pub.doi_url] not in edges:
            edges.append([citation.doi_url, pub.doi_url])

    return citations_pub_obj_list
def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
    """Recursively follow citations until the maximum search height is reached.

    Args:
        citations_pub_obj_list: citations of the current level as
            publication objects.
        search_height: current search height.
        search_height_max: maximum search height.
        test_var: forwarded to ``create_graph_structure_citations``.
    """
    go_higher = search_height < search_height_max

    for pub in citations_pub_obj_list:
        # adds the next level of this publication to nodes/edges
        next_level = create_graph_structure_citations(
            pub, search_height, search_height_max, test_var)

        # climb into the newly added citations while the maximum height is not reached
        if go_higher:
            process_citations_rec(next_level, search_height + 1, search_height_max, test_var)
def process_main(doi_input_list, search_height, search_depth, test_var = False):
    """Build the citation graph for the given publications.

    Resets the global ``nodes`` and ``edges`` lists, fills them from the
    input publications, expands citations and references recursively,
    completes the edges between both groups and writes the result via
    ``output_to_json``.

    Args:
        doi_input_list: input list of DOIs.
        search_height: maximum search height to process to (citations).
        search_depth: maximum search depth to process to (references).
        test_var: only needed as True for unit tests, default is False.

    Returns:
        ``(nodes, edges)``; for unit tests (``test_var`` truthy)
        ``(doi_nodes_list, edges)``, where ``doi_nodes_list`` holds the
        ``doi_url`` of every node.
    """
    # Invalid input is only reported; processing continues as before.
    # ERROR handling: empty input list
    if len(doi_input_list) == 0:
        print("Error, no input data")

    # ERROR: a negative number was entered for height
    if search_height < 0:
        print("Error, search_height of search must be positive")

    # ERROR: a negative number was entered for depth
    if search_depth < 0:
        print("Error, search_depth of search must be positive")

    # create empty arrays for the nodes and the edges
    global nodes, edges
    nodes = []
    edges = []

    # initializes nodes/edges from input and gets the publication objects of
    # the first level of references and citations
    references_obj_list, citations_obj_list = initialize_nodes_list(
        doi_input_list, search_depth, search_height, test_var)

    # recursive processing up to max height/depth
    process_citations_rec(citations_obj_list, 1, search_height, test_var)
    process_references_rec(references_obj_list, 1, search_depth, test_var)

    # adds edges between reference group and citation group of known publications
    complete_inner_edges(test_var)

    # calls a script to save nodes and edges of the graph in a .json file
    output_to_json(nodes, edges)

    # Unit tests compare plain DOI strings, so this branch has to run before
    # the regular return.
    if test_var:
        doi_nodes_list = [node.doi_url for node in nodes]
        return (doi_nodes_list, edges)

    return (nodes, edges)
def process_rec_depth(array, depth, depth_max):
    """Recursively add the citations of the given publications to the graph.

    Args:
        array: list of DOIs of the publications to process.
        depth: depth of the calling level; it is increased by 1 on entry.
        depth_max: maximum depth.
    """
    # The depth is increased by 1 on every recursive call.
    depth += 1

    # A publication object is created for every DOI of the input array.
    for pub_doi in array:
        pub = input(pub_doi)

        for citation in pub._citations:
            link = [pub.doi_url, citation.doi_url]
            already_node = any(citation.doi_url == node.doi_url for node in nodes)

            if already_node:
                # The citation is already a node: only the connection to the
                # publication is stored in edges.
                edges.append(link)
            elif depth <= depth_max:
                # Unknown citation and maximum depth not exceeded: store it as
                # node and store the connection to the publication as edge.
                nodes.append(citation)
                edges.append(link)

        # While the maximum depth is not reached, the function is called again
        # with the citations of this publication.
        if depth < depth_max:
            # Only citations with "acs" in the URL are followed, because the
            # information cannot be extracted from other sources at the moment.
            cit_arr = [citation.doi_url for citation in pub._citations
                       if "acs" in citation.doi_url]
            process_rec_depth(cit_arr, depth, depth_max)
def print_graph(nodes, edges):
    """Print the title of every node and then every edge of a graph.

    Args:
        nodes: iterable of objects with a ``title`` attribute.
        edges: iterable of edges; each edge is printed as it is.
    """
    print("Knoten:\n")
    for title in (node.title for node in nodes):
        print(title, "\n")

    print("\nKanten:\n")
    for connection in edges:
        print(connection, "\n")
# Program test, because there is no connection to the input yet.
# NOTE(review): process_main is called here with two arguments, while the
# process_main defined in the visible source takes doi_input_list,
# search_height and search_depth - this looks like a leftover of an older
# version of the script; confirm before running it.
arr = []
arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
#arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
#arr.append('https://doi.org/10.1021/ci700007b')
#arr.append('https://doi.org/10.1021/acs.jcim.5b00292')
#url = sys.argv[1]
#arr.append[url]
nodes,edges = process_main(arr,1)
print("Knoten:\n")
for node in nodes:
print(node.title, "\n")
print("\nKanten:\n")
for edge in edges:
print(edge,"\n")
\ No newline at end of file
def try_known_publications():
    """Program test, because there is no connection to the UI yet.

    Builds the graph for two hard-coded publications with search height 2
    and search depth 2 and prints it with ``print_graph``.
    """
    doi_list = [
        'https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249',
        'https://doi.org/10.1021/acs.jmedchem.0c01332',
    ]

    graph = process_main(doi_list, 2, 2)
    nodes, edges = graph

    print_graph(nodes, edges)
\ No newline at end of file
Loading