Skip to content
Snippets Groups Projects
Select Git revision
  • 048eb4e58521dfc950f8ca33d81ab8d1f220431a
  • main default protected
  • userHandling
  • snuggle
4 results

README.md

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    Processing.py 13.24 KiB
    # -*- coding: utf-8 -*-
    """
    Functions to generate a graph representing citations between multiple ACS/Nature journals
    
    """
    
    __authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
    __email__ = "cis-project2021@zbh.uni-hamburg.de"
    __status__ = "Production"
    #__copyright__ = ""
    #__credits__ = ["", "", "", ""]
    #__license__ = ""
    #__version__ = ""
    #__maintainer__ = ""
    
    from bs4 import BeautifulSoup as bs
    import requests as req
    import sys  
    from pathlib import Path
    #sys.path.insert(1, 'C:\Users\Malte\Git\CiS-Projekt\ci-s-projekt-verarbeitung\input')
    sys.path.append("../")
    from input.interface import InputInterface as Input
    #import input
    from input_test import input_test_func
    from json_demo import output_to_json
    
    
    def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
        '''
        Creates the input nodes of the graph and expands their first level.

        Every doi of the input list is turned into a publication object. If no
        node with the same doi_url exists yet, the publication is appended to
        the module-level list nodes and marked with group "input"; otherwise
        the doi is removed from doi_input_list (in place). In both cases the
        references and citations of the publication are expanded with
        recursion level 0.

            :param doi_input_list:      input list of doi from UI
            :type doi_input_list:       list of strings

            :param search_depth_max:    maximum depth to search for references
            :type search_depth_max:     int

            :param search_height_max:   maximum height to search for citations
            :type search_height_max:    int

            :param test_var:            variable to differentiate between test and url call
            :type test_var:             boolean

            :returns:                   publication objects added as new reference nodes and
                                        publication objects added as new citation nodes
            :rtype:                     tuple of two lists
        '''

        references_pub_obj_list = []
        citations_pub_obj_list = []

        # Iterate over a snapshot of the input: duplicates are removed from
        # doi_input_list inside the loop, and removing from the list that is
        # being iterated would make the loop skip the doi after each duplicate.
        for pub_doi in list(doi_input_list):
            if test_var: # test run --> test-input function
                pub = input_test_func(pub_doi)
            else: # regular run --> standard input interface
                inter = Input()
                try:
                    pub = inter.get_publication(pub_doi)
                except (ValueError, IndexError):
                    # the input interface could not deliver this doi, skip it
                    continue

            # checks if a node with this doi already exists
            already_in_nodes = any(pub.doi_url == node.doi_url for node in nodes)
            if already_in_nodes:
                doi_input_list.remove(pub_doi) # deletes the doi duplicate from the input list
            else:
                nodes.append(pub)
                pub.group = "input"

            # collects the references that were added as new nodes
            # (only happens if maximum search depth > 0)
            references_pub_obj_list.extend(
                create_graph_structure_references(pub, 0, search_depth_max, test_var))

            # collects the citations that were added as new nodes
            # (only happens if maximum search height > 0)
            citations_pub_obj_list.extend(
                create_graph_structure_citations(pub, 0, search_height_max, test_var))

        return(references_pub_obj_list, citations_pub_obj_list)
            
        
    
    def complete_inner_edges(test_var):
        '''
        Adds the missing edges between publications that already are nodes.

        For every node of group "depth" an edge [citation, node] is added for
        each of its citations that is a node itself. For every node of group
        "height" an edge [node, reference] is added for each of its references
        that is a node itself. Edges that already exist are not added again.
        Works on the module-level lists nodes and edges.

            :param test_var:    variable to differentiate between test and url call
                                (not used inside this function)
            :type test_var:     boolean
        '''

        # nodes is not modified in this function, so the known doi_urls are
        # collected once instead of scanning all nodes for every citation/reference
        known_doi_urls = {node.doi_url for node in nodes}

        for node in nodes:
            if (node.group == "depth"):
                for citation in node.citations:
                    edge = [citation.doi_url, node.doi_url]
                    if (citation.doi_url in known_doi_urls and edge not in edges):
                        edges.append(edge)
            if (node.group == "height"):
                for reference in node.references:
                    edge = [node.doi_url, reference.doi_url]
                    if (reference.doi_url in known_doi_urls and edge not in edges):
                        edges.append(edge)
    
    
    
    # adds a node for every referenced publication that is not yet known
    # adds edges for references between publications
    def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
        '''
        Adds the references of one publication to the module-level lists nodes and edges.

        A reference that already is a node only gets the edge [pub, reference]
        (if that edge is missing). A reference that is not a node yet is
        fetched and added as node of group "depth" together with its edge, but
        only while search_depth is smaller than search_depth_max.

            :param pub:                 publication which references will be added
            :type pub:                  Class Publication

            :param search_depth:        current depth to search for references
            :type search_depth:         int

            :param search_depth_max:    maximum depth to search for references
            :type search_depth_max:     int

            :param test_var:            variable to differentiate between test and url call
            :type test_var:             boolean

            :returns:                   publication objects that were added as new nodes
            :rtype:                     list
        '''

        added_references = []

        for reference in pub.references:
            is_known = any(reference.doi_url == node.doi_url for node in nodes)

            # reference already is a node: only the edge may be missing
            if is_known:
                edge = [pub.doi_url, reference.doi_url]
                if edge not in edges:
                    edges.append(edge)
                continue

            # unknown reference, but the depth limit is reached: nothing to add
            if not (search_depth < search_depth_max):
                continue

            if test_var: # test run --> test-input function
                new_pub = input_test_func(reference.doi_url)
            else: # regular run --> standard input interface
                interface = Input()
                try:
                    new_pub = interface.get_publication(reference.doi_url)
                except (ValueError, IndexError):
                    # IndexError is only caught for test purposes,
                    # because something is still wrong in Input
                    continue

            new_pub.group = "depth"
            nodes.append(new_pub)
            edges.append([pub.doi_url, new_pub.doi_url])
            added_references.append(new_pub)

        return added_references
    
    
    # recursive function to implement depth-first-search on references
    # references_pub_obj_list: input list of references as publication objects
    # search_depth: current search_depth of depth-first-search
    # search_depth_max: maximal search_depth for dfs
    def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):  
        '''
        Recursively expands the references of the given publications.

        For every publication the references are added to the graph for the
        current depth; the publications that were newly added are then
        processed with the next depth as long as the maximum is not reached.

            :param references_pub_obj_list: list of publications which references will be added
            :type references_pub_obj_list:  list of objects of type Class Publications

            :param search_depth:            current depth to search for references
            :type search_depth:             int

            :param search_depth_max:        maximum depth to search for references
            :type search_depth_max:         int

            :param test_var:                variable to differentiate between test and url call
            :type test_var:                 boolean
        '''

        for current_pub in references_pub_obj_list:
            # adds the next level of references to nodes/edges
            next_level = create_graph_structure_references(
                current_pub, search_depth, search_depth_max, test_var)

            # maximum depth reached: no further recursion for this publication
            if not (search_depth < search_depth_max):
                continue

            process_references_rec(next_level, search_depth + 1, search_depth_max, test_var)
    
    
    
        
      
    def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
        '''
        Adds the citations of one publication to the module-level lists nodes and edges.

        A citation that already is a node only gets the edge [citation, pub]
        (if that edge is missing). A citation that is not a node yet is
        fetched and added as node of group "height" together with its edge,
        but only while search_height is smaller than search_height_max.

            :param pub:                 publication which citations will be added
            :type pub:                  Class Publication

            :param search_height:       current height to search for citations
            :type search_height:        int

            :param search_height_max:   maximum height to search for citations
            :type search_height_max:    int

            :param test_var:            variable to differentiate between test and url call
            :type test_var:             boolean

            :returns:                   publication objects that were added as new nodes
            :rtype:                     list
        '''

        added_citations = []

        for citation in pub.citations:
            # checks the citation for duplication in the nodes
            is_known = any(citation.doi_url == node.doi_url for node in nodes)

            # citation already is a node: only the edge may be missing
            if is_known:
                edge = [citation.doi_url, pub.doi_url]
                if edge not in edges:
                    edges.append(edge)
                continue

            # unknown citation, but the height limit is reached: nothing to add
            if not (search_height < search_height_max):
                continue

            # chooses the input function depending on test or regular run
            if test_var:
                new_pub = input_test_func(citation.doi_url)
            else:
                interface = Input()
                try:
                    new_pub = interface.get_publication(citation.doi_url)
                except (ValueError, IndexError):
                    # publication could not be fetched, skip this citation
                    continue

            new_pub.group = "height"
            nodes.append(new_pub)
            edges.append([new_pub.doi_url, pub.doi_url])
            added_citations.append(new_pub)

        return added_citations
    
    
    
    # recursive function to implement depth-first-search on citations (search in height direction)
    # citations_pub_obj_list: input list of citations as publication objects
    # search_height: current search_height of the search
    # search_height_max: maximal search_height for dfs
    def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):  
        '''
        Recursively expands the citations of the given publications.

        For every publication the citations are added to the graph for the
        current height; the publications that were newly added are then
        processed with the next height as long as the maximum is not reached.

            :param citations_pub_obj_list:  list of publications which citations will be added
            :type citations_pub_obj_list:   list of objects of type Class Publications

            :param search_height:       current height to search for citations
            :type search_height:        int

            :param search_height_max:   maximum height to search for citations
            :type search_height_max:    int

            :param test_var:            variable to differentiate between test and url call
            :type test_var:             boolean
        '''

        for current_pub in citations_pub_obj_list:
            # adds the next level of citations to nodes/edges
            next_level = create_graph_structure_citations(
                current_pub, search_height, search_height_max, test_var)

            # maximum height reached: no further recursion for this publication
            if not (search_height < search_height_max):
                continue

            process_citations_rec(next_level, search_height + 1, search_height_max, test_var)
    
    
    
    
    
    def process_main(doi_input_list, search_height, search_depth, test_var = False):
        '''
        Builds the citation graph for the given input and hands it to the json output.

        Resets the module-level lists nodes and edges, creates the input
        nodes, expands references and citations up to the given limits, adds
        the remaining edges between known publications and calls
        output_to_json with the result. Invalid input is only reported with a
        printed message, the processing is not aborted.

            :param doi_input_list:  input list of doi from UI
            :type doi_input_list:   list of strings

            :param search_height:   maximum height to search for citations
            :type search_height:    int

            :param search_depth:    maximum depth to search for references
            :type search_depth:     int

            :param test_var:        variable to differentiate between test and url call
            :type test_var:         boolean

            :returns:               (list of doi_url of all nodes, edges) if test_var is True,
                                    otherwise (nodes, edges)
            :rtype:                 tuple
        '''

        global nodes, edges

        # error reporting: empty input list
        if (len(doi_input_list) == 0):
            print("Error, no input data")

        # error reporting: negative number entered for height
        if (search_height < 0):
            print("Error, search_height of search must be positive")

        # error reporting: negative number entered for depth
        if (search_depth < 0):
            print("Error, search_depth of search must be positive")       

        # every run starts with an empty graph
        nodes = []
        edges = []

        # input nodes plus the first level of references and citations
        first_level = initialize_nodes_list(doi_input_list, search_depth, search_height, test_var)
        references_obj_list, citations_obj_list = first_level

        # recursive processing up to max height/depth, level 0 is already done
        process_citations_rec(citations_obj_list, 1, search_height, test_var)
        process_references_rec(references_obj_list, 1, search_depth, test_var)

        # edges between reference group and citation group of known publications
        complete_inner_edges(test_var)

        # saves nodes and edges of the graph via the json script
        output_to_json(nodes,edges)

        # unit tests get the doi_url of the nodes instead of the objects
        if (test_var == True):
            return([node.doi_url for node in nodes], edges)

        return(nodes,edges)