# Processing.py
"""
Functions to generate a graph representing citations between multiple ACS/Nature journals
"""
__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
__email__ = "cis-project2021@zbh.uni-hamburg.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""

from bs4 import BeautifulSoup as bs
import requests as req
import sys  
from pathlib import Path
from input_fj import input
from input_test import input_test_func
# (repository web-view residue: commit attribution for Malte Schokolowski)
from json_demo import output_to_json
# adds every publication from input list to graph structure
# doi_input_list: list of publication dois from user
def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
    """
    Registers every input publication as a node and expands its first level.

    Works on the module-level lists ``nodes`` and ``edges``.

    doi_input_list:     list of publication DOIs from the user; a DOI whose
                        publication is already contained in ``nodes`` is
                        removed from this list in place
    search_depth_max:   maximum search depth, passed on for the references
    search_height_max:  maximum search height, passed on for the citations
    test_var:           if true, publications are loaded with input_test_func
                        instead of input

    Returns a tuple (references_pub_obj_list, citations_pub_obj_list) with the
    publication objects returned by create_graph_structure_references and
    create_graph_structure_citations for all input publications.
    """
    references_pub_obj_list = []
    citations_pub_obj_list = []

    # checks if its a test and chooses input function accordingly
    load_publication = input_test_func if test_var else input

    # iterates over a snapshot, because duplicates are removed from
    # doi_input_list inside the loop (removing from the list that is being
    # iterated would silently skip the following element)
    for pub_doi in list(doi_input_list):
        pub = load_publication(pub_doi)

        # checks if publication already exists in nodes
        if any(pub.doi_url == node.doi_url for node in nodes):
            # duplicate input: keep the existing node, drop the repeated doi
            doi_input_list.remove(pub_doi)
        else:
            pub.group = "input"
            nodes.append(pub)

        # inserts references as publication objects into list and
        # inserts first depth references into nodes/edges if maximum search depth > 0
        references_pub_obj_list.extend(
            create_graph_structure_references(pub, 0, search_depth_max, test_var))

        # inserts citations as publication objects into list and
        # inserts first height citations into nodes if maximum search height > 0
        citations_pub_obj_list.extend(
            create_graph_structure_citations(pub, 0, search_height_max, test_var))

    return (references_pub_obj_list, citations_pub_obj_list)
        
    
# adds edges between citation and reference group
def complete_inner_edges(test_var):
    """
    Adds the edges that are still missing between already known publications.

    For every node of group "depth" each of its citations that is itself a
    node gets an edge [citation, node]; for every node of group "height" each
    of its references that is itself a node gets an edge [node, reference].
    Edges that already exist are not added twice.

    Works on the module-level lists ``nodes`` and ``edges``.
    test_var is accepted but not used by this function.
    """
    def is_known(doi_url):
        # true if some publication with this doi is already a node
        return any(doi_url == known.doi_url for known in nodes)

    for node in nodes:
        if node.group == "depth":
            for citation in node.citations:
                new_edge = [citation.doi_url, node.doi_url]
                if new_edge not in edges and is_known(citation.doi_url):
                    edges.append(new_edge)
        if node.group == "height":
            for reference in node.references:
                new_edge = [node.doi_url, reference.doi_url]
                if new_edge not in edges and is_known(reference.doi_url):
                    edges.append(new_edge)



# adds a node for every publication unknown
# adds edges for references between publications
def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
    """
    Processes the references of one publication.

    pub:                publication object whose references are processed
    search_depth:       current search depth
    search_depth_max:   maximum search depth
    test_var:           if true, publications are loaded with input_test_func
                        instead of input

    A reference that is already a node only gets an edge [pub, reference]
    (unless that edge exists). An unknown reference is loaded, marked with
    group "depth" and added to nodes/edges, but only while
    search_depth < search_depth_max; otherwise it is ignored.

    Works on the module-level lists ``nodes`` and ``edges``.
    Returns the list of publication objects that were newly added.
    """
    added_publications = []
    for reference in pub.references:
        # checks every reference for duplication
        already_known = any(reference.doi_url == node.doi_url for node in nodes)

        if already_known:
            # adds only the edge, the node already exists
            known_edge = [pub.doi_url, reference.doi_url]
            if known_edge not in edges:
                edges.append(known_edge)
            continue

        if not search_depth < search_depth_max:
            # maximum depth reached: unknown references are not followed
            continue

        # checks if its a test and chooses input function accordingly
        load_publication = input_test_func if test_var else input
        loaded = load_publication(reference.doi_url)
        loaded.group = "depth"
        nodes.append(loaded)
        edges.append([pub.doi_url, loaded.doi_url])
        added_publications.append(loaded)
    return added_publications
# recursive function to implement depth-first-search on references
# references_pub_obj_list: input list of references as publication objects
# search_depth: current search_depth of depth-first-search
# search_depth_max: maximal search_depth for dfs
def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
    """
    Expands the references of every publication in the list, level by level.

    Each publication is passed to create_graph_structure_references; the
    publications returned by that call are processed recursively with
    search_depth + 1 before the next list entry is handled, as long as
    search_depth < search_depth_max. Returns nothing.
    """
    for pub in references_pub_obj_list:
        # adds next level to nodes/edges
        next_level = create_graph_structure_references(pub, search_depth, search_depth_max, test_var)

        if not search_depth < search_depth_max:
            # maximum depth reached, no further recursion for this publication
            continue
        process_references_rec(next_level, search_depth + 1, search_depth_max, test_var)
    
# adds a node for every publication unknown
# adds edges for citations between publications
def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
    """
    Processes the citations of one publication.

    pub:                publication object whose citations are processed
    search_height:      current search height
    search_height_max:  maximum search height
    test_var:           if true, publications are loaded with input_test_func
                        instead of input

    A citation that is already a node only gets an edge [citation, pub]
    (unless that edge exists). An unknown citation is loaded, marked with
    group "height" and added to nodes/edges, but only while
    search_height < search_height_max; otherwise it is ignored.

    Works on the module-level lists ``nodes`` and ``edges``.
    Returns the list of publication objects that were newly added.
    """
    added_publications = []
    for citation in pub.citations:
        # checks every citation for duplication
        already_known = any(citation.doi_url == node.doi_url for node in nodes)

        if already_known:
            # adds only the edge, the node already exists
            known_edge = [citation.doi_url, pub.doi_url]
            if known_edge not in edges:
                edges.append(known_edge)
            continue

        if not search_height < search_height_max:
            # maximum height reached: unknown citations are not followed
            continue

        # checks if its a test and chooses input function accordingly
        load_publication = input_test_func if test_var else input
        loaded = load_publication(citation.doi_url)
        loaded.group = "height"
        nodes.append(loaded)
        edges.append([loaded.doi_url, pub.doi_url])
        added_publications.append(loaded)
    return added_publications
# recursive function to implement height-first-search on citations
# citations_pub_obj_list: input list of citations as publication objects
# search_height: current search_height of height-first-search
# search_height_max: maximal search_height for dfs
def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
    """
    Expands the citations of every publication in the list, level by level.

    Each publication is passed to create_graph_structure_citations; the
    publications returned by that call are processed recursively with
    search_height + 1 before the next list entry is handled, as long as
    search_height < search_height_max. Returns nothing.
    """
    for pub in citations_pub_obj_list:
        # adds next level to nodes/edges
        next_level = create_graph_structure_citations(pub, search_height, search_height_max, test_var)

        if not search_height < search_height_max:
            # maximum height reached, no further recursion for this publication
            continue
        process_citations_rec(next_level, search_height + 1, search_height_max, test_var)
# main function to call. Needs as input:
# doi_input_list:   input list of dois
# search_height:    max search height to process to
# search_depth:     max search depth to process to
# test_var:         only needed for unit test as True, default is False
def process_main(doi_input_list, search_height, search_depth, test_var = False):
    """
    Builds the citation graph for the given DOIs and writes it out as json.

    doi_input_list:   input list of dois
    search_height:    max search height (citations) to process to
    search_depth:     max search depth (references) to process to
    test_var:         only needed for unit tests as True, default is False

    The graph is stored in the module-level lists ``nodes`` and ``edges``,
    which are reset at the start of every call, and passed to output_to_json.

    Returns (doi_nodes_list, edges) in test mode, where doi_nodes_list holds
    the doi_url of every node, and (nodes, edges) with the publication
    objects otherwise. The non-test return value is what
    try_known_publications unpacks and hands to print_graph.

    Invalid arguments are only reported with a printed message; processing
    continues afterwards.
    """
    # ERROR-Handling doi_array = NULL
    if (len(doi_input_list) == 0):
        print("Error, no input data")

    # ERROR- if a negative number is entered for height
    if (search_height < 0):
        print("Error, search_height of search must be positive")

    # ERROR- if a negative number is entered for depth
    if (search_depth < 0):
        print("Error, search_depth of search must be positive")

    # create empty array for the nodes
    # create empty array for the edges
    global nodes, edges
    nodes = []
    edges = []

    # initializes nodes/edges from input and gets a list with publication objects for citations and references returned
    references_obj_list, citations_obj_list = initialize_nodes_list(doi_input_list, search_depth, search_height, test_var)

    # function calls to begin recursive processing up to max depth/height
    # (level 0 was already handled by initialize_nodes_list, so start at 1)
    process_citations_rec(citations_obj_list, 1, search_height, test_var)
    process_references_rec(references_obj_list, 1, search_depth, test_var)

    # adds edges between reference group and citation group of known publications
    complete_inner_edges(test_var)

    # calls a script to save nodes and edges of graph in .json file
    output_to_json(nodes, edges)

    # unit tests compare plain dois instead of publication objects;
    # truthiness is used like in all helper functions
    if test_var:
        return ([node.doi_url for node in nodes], edges)

    # previously nothing was returned here, which made unpacking the
    # result in the caller fail
    return (nodes, edges)
# a function to print nodes and edges from a graph
def print_graph(nodes, edges):
    """
    Prints the title of every node and then every edge to standard output.

    nodes:  iterable of objects with a ``title`` attribute
    edges:  iterable of edges; each edge is printed as it is
    """
    print("Knoten:\n")
    for publication in nodes:
        # same output as print(title, "\n"): title, a blank, two line breaks
        print(str(publication.title) + " \n")

    print("\nKanten:\n")
    for connection in edges:
        print(str(connection) + " \n")
   
# program test, because there is no connection to UI yet.
def try_known_publications():
    """
    Builds the graph for two hard-coded publications and prints it.

    Calls process_main with search height 2 and search depth 2 (not in test
    mode, so the publications are loaded with the real input function) and
    prints the resulting graph with print_graph.
    """
    doi_list = [
        'https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249',
        'https://doi.org/10.1021/acs.jmedchem.0c01332',
    ]
    # further dois that were used for manual experiments:
    #   'https://doi.org/10.1021/acs.jcim.0c00741'
    #   'https://doi.org/10.1021/ci700007b'
    #   'https://doi.org/10.1021/acs.jcim.5b00292'
    # a doi could also be taken from the command line (sys.argv[1])

    process_main(doi_list, 2, 2)

    # process_main stores the graph in the module-level lists nodes and
    # edges; reading them here does not depend on its return value, which
    # is only guaranteed in test mode
    print_graph(nodes, edges)