# -*- coding: utf-8 -*-
"""
Functions to generate a graph representing citations between multiple ACS/Nature journals

"""

__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
__email__ = "cis-project2021@zbh.uni-hamburg.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""

from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
#sys.path.insert(1, 'C:\Users\Malte\Git\CiS-Projekt\ci-s-projekt-verarbeitung\input')
sys.path.append("../")
from input.interface import InputInterface as Input
#import input
from input_test import input_test_func
from json_demo import output_to_json


def _is_known_node(doi_url):
    '''
        :param doi_url: doi url to look up in the global list of nodes

        :return: True if a node with this doi url is already in the global list "nodes"
        :rtype: boolean
    '''
    return any(node.doi_url == doi_url for node in nodes)


def _fetch_publication(doi, test_var):
    '''
        :param doi: doi of the publication to load

        :param test_var: variable to differentiate between test and url call
        :type test_var: boolean

        :return: the publication object, or None if it could not be loaded

        In test mode the publication comes from input_test_func, otherwise from
        the input interface. A ValueError or IndexError raised by the input
        interface is treated as "publication not available" and reported as None,
        so that the caller can skip this doi.
    '''
    if test_var:
        return input_test_func(doi)

    inter = Input()
    try:
        return inter.get_publication(doi)
    except (ValueError, IndexError):
        # only for testing purposes, since something is still wrong in Input
        return None


def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
    '''
        :param doi_input_list: input list of doi from UI
        :type doi_input_list: list of strings

        :param search_depth_max: maximum depth to search for references
        :type search_depth_max: int

        :param search_height_max: maximum height to search for citations
        :type search_height_max: int

        :param test_var: variable to differentiate between test and url call
        :type test_var: boolean

        :return: tuple of (list of reference publication objects,
                 list of citation publication objects) of the input publications

        Adds every input publication to the global list of nodes (group "input").
        A doi whose publication is already a node is removed from doi_input_list.
    '''
    references_pub_obj_list = []
    citations_pub_obj_list = []

    # iterates over a snapshot, because duplicates are removed from
    # doi_input_list inside the loop; iterating the list itself would
    # skip the entry following every removed duplicate
    for pub_doi in list(doi_input_list):
        pub = _fetch_publication(pub_doi, test_var)
        if pub is None:
            continue

        if _is_known_node(pub.doi_url):
            # deletes the doi duplicate from the input list
            doi_input_list.remove(pub_doi)
        else:
            nodes.append(pub)
            pub.group = "input"

        # inserts references as publication objects into list and
        # inserts first depth references into nodes/edges if maximum search depth > 0
        references_pub_obj_list.extend(
            create_graph_structure_references(pub, 0, search_depth_max, test_var))

        # inserts citations as publication objects into list and
        # inserts first height citations into nodes if maximum search height > 0
        citations_pub_obj_list.extend(
            create_graph_structure_citations(pub, 0, search_height_max, test_var))

    return (references_pub_obj_list, citations_pub_obj_list)


def complete_inner_edges(test_var):
    '''
        :param test_var: variable to differentiate between test and url call
        :type test_var: boolean

        Adds the missing edges between publications that are already nodes:
        for nodes of group "depth" the edges from their citations, for nodes
        of group "height" the edges to their references. No new nodes are added.
    '''
    for node in nodes:
        if node.group == "depth":
            for citation in node.citations:
                edge = [citation.doi_url, node.doi_url]
                if _is_known_node(citation.doi_url) and edge not in edges:
                    edges.append(edge)
        if node.group == "height":
            for reference in node.references:
                edge = [node.doi_url, reference.doi_url]
                if _is_known_node(reference.doi_url) and edge not in edges:
                    edges.append(edge)


def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
    '''
        :param pub: publication which references will be added
        :type pub: Class Publication

        :param search_depth: current depth to search for references
        :type search_depth: int

        :param search_depth_max: maximum depth to search for references
        :type search_depth_max: int

        :param test_var: variable to differentiate between test and url call
        :type test_var: boolean

        :return: list of the publication objects that were newly added as nodes

        Adds a node (group "depth") for every unknown referenced publication as
        long as search_depth is below search_depth_max, and adds the edges for
        the references between publications.
    '''
    references_pub_obj_list = []

    for reference in pub.references:
        if _is_known_node(reference.doi_url):
            # the referenced publication already is a node: only the edge may be missing
            edge = [pub.doi_url, reference.doi_url]
            if edge not in edges:
                edges.append(edge)
            continue

        # unknown publications are only added below the depth limit
        if search_depth >= search_depth_max:
            continue

        reference_pub_obj = _fetch_publication(reference.doi_url, test_var)
        if reference_pub_obj is None:
            continue

        reference_pub_obj.group = "depth"
        nodes.append(reference_pub_obj)
        edges.append([pub.doi_url, reference_pub_obj.doi_url])
        references_pub_obj_list.append(reference_pub_obj)

    return references_pub_obj_list


def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
    '''
        :param references_pub_obj_list: list of publications which references will be added
        :type references_pub_obj_list: list of objects of type Class Publications

        :param search_depth: current depth to search for references
        :type search_depth: int

        :param search_depth_max: maximum depth to search for references
        :type search_depth_max: int

        :param test_var: variable to differentiate between test and url call
        :type test_var: boolean

        Recursive function that adds the references of the given publications
        level by level until search_depth_max is reached.
    '''
    # adds next level to nodes/edges
    for pub in references_pub_obj_list:
        new_reference_pub_obj_list = create_graph_structure_references(
            pub, search_depth, search_depth_max, test_var)

        # if the maximum depth has not yet been reached, calls function recursively with increased depth
        if search_depth < search_depth_max:
            process_references_rec(
                new_reference_pub_obj_list, search_depth + 1, search_depth_max, test_var)


def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
    '''
        :param pub: publication which citations will be added
        :type pub: Class Publication

        :param search_height: current height to search for citations
        :type search_height: int

        :param search_height_max: maximum height to search for citations
        :type search_height_max: int

        :param test_var: variable to differentiate between test and url call
        :type test_var: boolean

        :return: list of the publication objects that were newly added as nodes

        Adds a node (group "height") for every unknown citing publication as
        long as search_height is below search_height_max, and adds the edges for
        the citations between publications.
    '''
    citations_pub_obj_list = []

    for citation in pub.citations:
        if _is_known_node(citation.doi_url):
            # the citing publication already is a node: only the edge may be missing
            edge = [citation.doi_url, pub.doi_url]
            if edge not in edges:
                edges.append(edge)
            continue

        # unknown publications are only added below the height limit
        if search_height >= search_height_max:
            continue

        citation_pub_obj = _fetch_publication(citation.doi_url, test_var)
        if citation_pub_obj is None:
            continue

        citation_pub_obj.group = "height"
        nodes.append(citation_pub_obj)
        edges.append([citation_pub_obj.doi_url, pub.doi_url])
        citations_pub_obj_list.append(citation_pub_obj)

    return citations_pub_obj_list


def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
    '''
        :param citations_pub_obj_list: list of publications which citations will be added
        :type citations_pub_obj_list: list of objects of type Class Publications

        :param search_height: current height to search for citations
        :type search_height: int

        :param search_height_max: maximum height to search for citations
        :type search_height_max: int

        :param test_var: variable to differentiate between test and url call
        :type test_var: boolean

        Recursive function that adds the citations of the given publications
        level by level until search_height_max is reached.
    '''
    # adds next level to nodes/edges
    for pub in citations_pub_obj_list:
        new_citation_pub_obj_list = create_graph_structure_citations(
            pub, search_height, search_height_max, test_var)

        # if the maximum height has not yet been reached, calls function recursively with increased height
        if search_height < search_height_max:
            process_citations_rec(
                new_citation_pub_obj_list, search_height + 1, search_height_max, test_var)


def process_main(doi_input_list, search_height, search_depth, test_var = False):
    '''
        :param doi_input_list: input list of doi from UI
        :type doi_input_list: list of strings

        :param search_height: maximum height to search for citations
        :type search_height: int

        :param search_depth: maximum depth to search for references
        :type search_depth: int

        :param test_var: variable to differentiate between test and url call
        :type test_var: boolean

        :return: (nodes, edges); in test mode (list of the doi urls of all nodes, edges)

        Builds the citation graph for the input publications in the global lists
        "nodes" and "edges" and passes both to output_to_json.
    '''
    # ERROR-Handling: invalid input is only reported, processing continues
    if len(doi_input_list) == 0:
        print("Error, no input data")

    # ERROR- if a negative number is entered for height
    if search_height < 0:
        print("Error, search_height of search must be positive")

    # ERROR- if a negative number is entered for depth
    if search_depth < 0:
        print("Error, search_depth of search must be positive")

    # the graph is kept in module-level lists that all other functions of this file read and extend
    global nodes, edges
    nodes = []  # create empty array for the nodes
    edges = []  # create empty array for the edges

    # initializes nodes/edges from input and gets a list with publication objects for citations and references returned
    references_obj_list, citations_obj_list = initialize_nodes_list(
        doi_input_list, search_depth, search_height, test_var)

    # function calls to begin recursive processing up to max depth/height
    process_citations_rec(citations_obj_list, 1, search_height, test_var)
    process_references_rec(references_obj_list, 1, search_depth, test_var)

    # adds edges between reference group and citation group of known publications
    complete_inner_edges(test_var)

    # calls a script to save nodes and edges of graph in .json file
    output_to_json(nodes, edges)

    # only for unit tests
    if test_var:
        doi_nodes_list = [node.doi_url for node in nodes]
        return (doi_nodes_list, edges)

    return (nodes, edges)