# Processing.py
# -*- coding: utf-8 -*-
"""
Functions to generate a graph representing citations between multiple ACS/Nature journals

"""

__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
__email__ = "cis-project2021@zbh.uni-hamburg.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""

from bs4 import BeautifulSoup as bs
import requests as req
import sys  
from pathlib import Path
#sys.path.insert(1, 'C:\Users\Malte\Git\CiS-Projekt\ci-s-projekt-verarbeitung\input')
sys.path.append("../")
from input.interface import InputInterface as Input
#import input
from input_test import input_test_func
from json_demo import output_to_json


def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
    '''
        Creates the input nodes and collects their first-level references and citations.

        Each DOI is resolved to a publication object. A publication whose doi_url is not
        yet present in the global ``nodes`` list is appended with group "input"; a DOI
        whose publication is already present is removed from ``doi_input_list`` (the
        caller's list is modified in place). DOIs whose lookup raises ValueError or
        IndexError are skipped.

        :param doi_input_list:      input list of doi from UI
        :type doi_input_list:       list of strings

        :param search_depth_max:    maximum depth to search for references
        :type search_depth_max:     int

        :param search_height_max:   maximum height to search for citations
        :type search_height_max:    int

        :param test_var:            variable to differentiate between test and url call
        :type test_var:             boolean

        :return:                    the publication objects returned by
                                    create_graph_structure_references and
                                    create_graph_structure_citations for all inputs
        :rtype:                     tuple (list, list)
    '''

    references_pub_obj_list = []
    citations_pub_obj_list = []

    # Iterate over a snapshot: duplicates are removed from doi_input_list inside the
    # loop, and removing from the list being iterated would skip the following DOI.
    for pub_doi in list(doi_input_list):
        if test_var: # test run --> use the test input function
            pub = input_test_func(pub_doi)
        else: # regular run --> fetch the publication through the input interface
            inter = Input()
            try:
                pub = inter.get_publication(pub_doi)
            except (ValueError, IndexError):
                continue

        # a publication counts as known if any node carries the same doi_url
        if any(pub.doi_url == node.doi_url for node in nodes):
            doi_input_list.remove(pub_doi) # drops the duplicate doi from the input list
        else:
            nodes.append(pub)
            pub.group = "input"

        # collects the references of this publication; they are inserted into
        # nodes/edges by the called function if maximum search depth > 0
        references_pub_obj_list.extend(
            create_graph_structure_references(pub, 0, search_depth_max, test_var))

        # collects the citations of this publication; they are inserted into
        # nodes/edges by the called function if maximum search height > 0
        citations_pub_obj_list.extend(
            create_graph_structure_citations(pub, 0, search_height_max, test_var))

    return (references_pub_obj_list, citations_pub_obj_list)
        
    

def complete_inner_edges(test_var):
    '''
        Adds the missing edges between publications that are already nodes.

        For nodes of group "depth" every citation that is itself a node yields the edge
        [citation, node]; for nodes of group "height" every reference that is itself a
        node yields the edge [node, reference]. Edges already present are not added again.
        Works on the global ``nodes`` and ``edges`` lists.

        :param test_var:    variable to differentiate between test and url call
                            (not used inside this function)
        :type test_var:     boolean
    '''

    for current in nodes:
        if current.group == "depth":
            for citing in current.citations:
                # only publications that are part of the graph can get an edge
                if not any(citing.doi_url == other.doi_url for other in nodes):
                    continue
                candidate = [citing.doi_url, current.doi_url]
                if candidate not in edges:
                    edges.append(candidate)

        if current.group == "height":
            for cited in current.references:
                if not any(cited.doi_url == other.doi_url for other in nodes):
                    continue
                candidate = [current.doi_url, cited.doi_url]
                if candidate not in edges:
                    edges.append(candidate)



# adds a node for every referenced publication that is not yet known
# adds edges for references between publications
def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
    '''
        Inserts the references of one publication into the global ``nodes``/``edges`` lists.

        A reference that is already a node only gets the edge [pub, reference] (if that
        edge is missing). An unknown reference is fetched, marked with group "depth" and
        added as node and edge, but only while search_depth < search_depth_max.
        References whose lookup raises ValueError or IndexError are skipped.

        :param pub:                 publication which references will be added
        :type pub:                  Class Publication

        :param search_depth:        current depth to search for references
        :type search_depth:         int

        :param search_depth_max:    maximum depth to search for references
        :type search_depth_max:     int

        :param test_var:            variable to differentiate between test and url call
        :type test_var:             boolean

        :return:                    the publication objects newly added as nodes
        :rtype:                     list
    '''

    added_references = []
    for reference in pub.references:
        is_known = any(reference.doi_url == node.doi_url for node in nodes)

        if is_known:
            # the referenced publication already is a node --> only the edge may be missing
            link = [pub.doi_url, reference.doi_url]
            if link not in edges:
                edges.append(link)
            continue

        # unknown publications are only expanded below the depth limit
        if not search_depth < search_depth_max:
            continue

        if test_var: # test run --> use the test input function
            fetched = input_test_func(reference.doi_url)
        else: # regular run --> fetch the publication through the input interface
            inter = Input()
            try:
                fetched = inter.get_publication(reference.doi_url)
            except (ValueError, IndexError):
                # the original code notes that IndexError is caught only for testing
                # purposes, because something in Input is still wrong
                continue

        fetched.group = "depth"
        nodes.append(fetched)
        edges.append([pub.doi_url, fetched.doi_url])
        added_references.append(fetched)

    return added_references


# recursive function implementing a depth-first search over references
# references_pub_obj_list: input list of references as publication objects
# search_depth: current search depth of the depth-first search
# search_depth_max: maximum search depth of the depth-first search
def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
    '''
        Recursively adds the references of the given publications to the graph.

        create_graph_structure_references is called for every publication; its result
        is processed one level deeper as long as search_depth < search_depth_max.

        :param references_pub_obj_list: list of publications which references will be added
        :type references_pub_obj_list:  list of objects of type Class Publications

        :param search_depth:            current depth to search for references
        :type search_depth:             int

        :param search_depth_max:        maximum depth to search for references
        :type search_depth_max:         int

        :param test_var:                variable to differentiate between test and url call
        :type test_var:                 boolean
    '''

    # the depth limit is the same for every publication of this level
    may_descend = search_depth < search_depth_max

    for current_pub in references_pub_obj_list:
        # adds the next level to nodes/edges
        next_level = create_graph_structure_references(
            current_pub, search_depth, search_depth_max, test_var)

        # continue with increased depth while the maximum depth has not been reached
        if may_descend:
            process_references_rec(next_level, search_depth + 1, search_depth_max, test_var)



    
  
def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
    '''
        Inserts the citations of one publication into the global ``nodes``/``edges`` lists.

        A citation that is already a node only gets the edge [citation, pub] (if that
        edge is missing). An unknown citation is fetched, marked with group "height" and
        added as node and edge, but only while search_height < search_height_max.
        Citations whose lookup raises ValueError or IndexError are skipped.

        :param pub:                 publication which citations will be added
        :type pub:                  Class Publication

        :param search_height:       current height to search for citations
        :type search_height:        int

        :param search_height_max:   maximum height to search for citations
        :type search_height_max:    int

        :param test_var:            variable to differentiate between test and url call
        :type test_var:             boolean

        :return:                    the publication objects newly added as nodes
        :rtype:                     list
    '''

    added_citations = []
    for citation in pub.citations:
        # a citation counts as known if any node carries the same doi_url
        is_known = any(citation.doi_url == node.doi_url for node in nodes)

        if is_known:
            # the citing publication already is a node --> only the edge may be missing
            link = [citation.doi_url, pub.doi_url]
            if link not in edges:
                edges.append(link)
            continue

        # unknown publications are only expanded below the height limit
        if not search_height < search_height_max:
            continue

        if test_var: # test run --> use the test input function
            fetched = input_test_func(citation.doi_url)
        else: # regular run --> fetch the publication through the input interface
            inter = Input()
            try:
                fetched = inter.get_publication(citation.doi_url)
            except (ValueError, IndexError):
                continue

        fetched.group = "height"
        nodes.append(fetched)
        edges.append([fetched.doi_url, pub.doi_url])
        added_citations.append(fetched)

    return added_citations



# recursive function implementing a depth-first search over citations
# citations_pub_obj_list: input list of citations as publication objects
# search_height: current search height (citation level) of the search
# search_height_max: maximum search height of the search
def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
    '''
        Recursively adds the citations of the given publications to the graph.

        create_graph_structure_citations is called for every publication; its result
        is processed one level higher as long as search_height < search_height_max.

        :param citations_pub_obj_list:  list of publications which citations will be added
        :type citations_pub_obj_list:   list of objects of type Class Publications

        :param search_height:       current height to search for citations
        :type search_height:        int

        :param search_height_max:   maximum height to search for citations
        :type search_height_max:    int

        :param test_var:            variable to differentiate between test and url call
        :type test_var:             boolean
    '''

    # the height limit is the same for every publication of this level
    may_ascend = search_height < search_height_max

    for current_pub in citations_pub_obj_list:
        # adds the next level to nodes/edges
        next_level = create_graph_structure_citations(
            current_pub, search_height, search_height_max, test_var)

        # continue with increased height while the maximum height has not been reached
        if may_ascend:
            process_citations_rec(next_level, search_height + 1, search_height_max, test_var)





def process_main(doi_input_list, search_height, search_depth, test_var = False):
    '''
        Builds the citation graph for the given DOIs and hands it to output_to_json.

        The global ``nodes`` and ``edges`` lists are reset on every call. Invalid input
        (empty list, negative height/depth) is only reported with a printed message;
        processing continues afterwards.

        :param doi_input_list:  input list of doi from UI
        :type doi_input_list:   list of strings

        :param search_height:   maximum height to search for citations
        :type search_height:    int

        :param search_depth:    maximum depth to search for references
        :type search_depth:     int

        :param test_var:        variable to differentiate between test and url call
        :type test_var:         boolean

        :return:                (nodes, edges); for a test run the nodes are returned
                                as a list of their doi_url values instead of objects
        :rtype:                 tuple (list, list)
    '''

    global nodes, edges

    # ERROR-Handling doi_array = NULL
    if (len(doi_input_list) == 0):
        print("Error, no input data")

    # ERROR- if a negative number is entered for height
    if (search_height < 0):
        print("Error, search_height of search must be positive")

    # ERROR- if a negative number is entered for depth
    if (search_depth < 0):
        print("Error, search_depth of search must be positive")

    nodes = [] # create empty array for the nodes
    edges = [] # create empty array for the edges

    # initializes nodes/edges from input and gets a list with publication objects for citations and references returned
    references_obj_list, citations_obj_list = initialize_nodes_list(doi_input_list, search_depth, search_height, test_var)

    # function calls to begin recursive processing up to max depth/height
    # (level 0 was handled by initialize_nodes_list, so recursion starts at level 1)
    process_citations_rec(citations_obj_list, 1, search_height, test_var)
    process_references_rec(references_obj_list, 1, search_depth, test_var)

    # adds edges between reference group and citation group of known publications
    complete_inner_edges(test_var)

    # calls a script to save nodes and edges of graph in .json file
    output_to_json(nodes, edges)

    # only for unit tests; uses truthiness like every other test_var check in this
    # module, so the returned shape always matches the input path that was taken
    if test_var:
        return ([node.doi_url for node in nodes], edges)

    return (nodes, edges)