Skip to content
Snippets Groups Projects
Commit e570bd72 authored by Malte Schokolowski's avatar Malte Schokolowski
Browse files

fixed bug and removed old Processing.py

parent 45b19745
No related branches found
No related tags found
No related merge requests found
...@@ -23,9 +23,10 @@ from json_demo import output_to_json ...@@ -23,9 +23,10 @@ from json_demo import output_to_json
# adds every publication from input list to graph structure # adds every publication from input list to graph structure
# doi_input_list: list of publication dois from user # doi_input_list: list of publication dois from user
def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
references_pub_obj_list = []
citations_pub_obj_list = []
# TO-DO: Listenelemente auf Korrektheit überprüfen
def initialize_nodes_list(doi_input_list, test_var):
for pub_doi in doi_input_list: for pub_doi in doi_input_list:
#checks if its a test and chooses input function accordingly #checks if its a test and chooses input function accordingly
...@@ -46,60 +47,40 @@ def initialize_nodes_list(doi_input_list, test_var): ...@@ -46,60 +47,40 @@ def initialize_nodes_list(doi_input_list, test_var):
else: else:
doi_input_list.remove(pub_doi) doi_input_list.remove(pub_doi)
# adds inner edges between citations and references to edges # inserts references as publication objects into list and
def complete_inner_edges(test_var): # inserts first depth references into nodes/edges if maximum search depth > 0
for node in nodes: for reference in create_graph_structure_references(pub, 0, search_depth_max, test_var):
references_pub_obj_list.append(reference)
# checks if its a test and chooses input function accordingly
if (test_var):
pub = input_test_func(node.doi_url)
else:
pub = input(node.doi_url)
if (node.group == "depth"):
for citation in pub.citations:
if (citation in nodes and [citation.doi_url, pub.doi_url] not in edges):
edges.append([citation.doi_url, pub.doi_url])
if (node.group == "height"):
for reference in pub.references:
for node in nodes:
if (reference.doi_url in node.doi_url and [pub.doi_url, reference.doi_url] not in edges):
edges.append([pub.doi_url,reference.doi_url])
# inserts citations as publication objects into list and
# inserts first height citations into nodes if maximum search height > 0
for citation in create_graph_structure_citations(pub, 0, search_height_max, test_var):
citations_pub_obj_list.append(citation)
return(references_pub_obj_list, citations_pub_obj_list)
# adds a node for every publication unknown
# adds edges for citations between publications
def create_graph_structure_citations(pub, search_height, search_height_max):
for citation in pub.citations:
# checks if publication already exists in nodes # adds edges between citation and reference group
not_in_nodes = True def complete_inner_edges(test_var):
for node in nodes: for node in nodes:
# checks every citation for duplication if (node.group == "depth"):
if (citation.doi_url == node.doi_url): for citation in node.citations:
not_in_nodes = False for cit in nodes:
break if (citation.doi_url == cit.doi_url and [citation.doi_url, node.doi_url] not in edges):
if (not_in_nodes): edges.append([citation.doi_url, node.doi_url])
if (search_height <= search_height_max): if (node.group == "height"):
citation.group = "height" for reference in node.references:
nodes.append(citation) for ref in nodes:
edges.append([citation.doi_url,pub.doi_url]) if (reference.doi_url == ref.doi_url and [node.doi_url, reference.doi_url] not in edges):
edges.append([node.doi_url,reference.doi_url])
# adds only an edge (citation already exists)
elif [citation.doi_url,pub.doi_url] not in edges:
edges.append([citation.doi_url,pub.doi_url])
# adds a node for every publication unknown # adds a node for every publication unknown
# adds edges for references between publications # adds edges for references between publications
def create_graph_structure_references(pub, search_depth, search_depth_max): def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
references_pub_obj_list = []
for reference in pub.references: for reference in pub.references:
# checks if publication already exists in nodes
not_in_nodes = True not_in_nodes = True
for node in nodes: for node in nodes:
# checks every reference for duplication # checks every reference for duplication
...@@ -107,86 +88,94 @@ def create_graph_structure_references(pub, search_depth, search_depth_max): ...@@ -107,86 +88,94 @@ def create_graph_structure_references(pub, search_depth, search_depth_max):
not_in_nodes = False not_in_nodes = False
break break
if (not_in_nodes): if (not_in_nodes):
if (search_depth <= search_depth_max): if (search_depth < search_depth_max):
reference.group = "depth"
nodes.append(reference)
edges.append([pub.doi_url,reference.doi_url])
# adds only an edge (citation already exists)
elif [pub.doi_url,reference.doi_url] not in edges:
edges.append([pub.doi_url,reference.doi_url])
# recursive function to implement height-first-search on citations
# doi_citations: input list of citet dois
# search_height: current search_height of height-first-search
# search_height_max: maximal search_height for dfs
def process_citations_rec(doi_citations, search_height, search_height_max, test_var):
# height of search is increased by 1 with each recursive call
search_height += 1
# create class object for every citation from list
for pub_doi in doi_citations:
#checks if its a test and chooses input function accordingly #checks if its a test and chooses input function accordingly
if (test_var): if (test_var):
pub = input_test_func(pub_doi) reference_pub_obj = input_test_func(reference.doi_url)
else: else:
pub = input(pub_doi) reference_pub_obj = input(reference.doi_url)
create_graph_structure_citations(pub, search_height, search_height_max)
# If the maximum height has not yet been reached, all references from the publication
# are written to an array and the function is called again with this array.
if (search_height < search_height_max):
citations_list = []
for citation in pub.citations:
# currently only the references with acs are stored in the URL, because we can't
# extract the info from other sources.
if ("acs" in citation.doi_url or test_var == True):
citations_list.append(citation.doi_url)
# recursive call of function. reference_pub_obj.group = "depth"
process_citations_rec(citations_list, search_height, search_height_max, test_var) nodes.append(reference_pub_obj)
edges.append([pub.doi_url,reference_pub_obj.doi_url])
references_pub_obj_list.append(reference_pub_obj)
# adds edge only if citation already exists
elif [pub.doi_url,reference.doi_url] not in edges:
edges.append([pub.doi_url,reference.doi_url])
return references_pub_obj_list
# recursive function to implement height-first-search on references # recursive function to implement height-first-search on references
# doi_references: input list of referenced dois # references_pub_obj_list: input list of references as publication objects
# search_depth: current search_depth of height-first-search # search_depth: current search_depth of height-first-search
# search_depth_max: maximal search_depth for dfs # search_depth_max: maximal search_depth for dfs
def process_references_rec(doi_references, search_depth, search_depth_max, test_var): def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
# The depth is increased by 1 with each recursive call # adds next level to nodes/edges
search_depth += 1 for pub in references_pub_obj_list:
new_reference_pub_obj_list = create_graph_structure_references(pub, search_depth, search_depth_max, test_var)
# If the maximum height has not yet been reached, calls function recursivly with increased height
if (search_depth < search_depth_max):
process_references_rec(new_reference_pub_obj_list, search_depth+1, search_depth_max, test_var)
# create class object for every citation from list # adds a node for every publication unknown
for pub_doi in doi_references: # adds edges for citations between publications
def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
citations_pub_obj_list = []
for citation in pub.citations:
not_in_nodes = True
for node in nodes:
# checks every citation for duplication
if (citation.doi_url == node.doi_url):
not_in_nodes = False
break
if (not_in_nodes):
if (search_height < search_height_max):
#checks if its a test and chooses input function accordingly #checks if its a test and chooses input function accordingly
if (test_var): if (test_var):
pub = input_test_func(pub_doi) citation_pub_obj = input_test_func(citation.doi_url)
else: else:
pub = input(pub_doi) citation_pub_obj = input(citation.doi_url)
create_graph_structure_references(pub, search_depth, search_depth_max) citation_pub_obj.group = "height"
# If the maximum depth has not yet been reached, all references from the publication nodes.append(citation_pub_obj)
# are written to an array and the function is called again with this array. edges.append([citation_pub_obj.doi_url,pub.doi_url])
if (search_depth < search_depth_max): citations_pub_obj_list.append(citation_pub_obj)
references_list = []
for reference in pub.references: # adds only edge if citation already exists
elif [citation.doi_url,pub.doi_url] not in edges:
edges.append([citation.doi_url,pub.doi_url])
return citations_pub_obj_list
# currently only the references with acs are stored in the URL, because we can't
# extract the info from other sources.
if ("acs" in reference.doi_url or test_var == True):
references_list.append(reference.doi_url)
# recursive call of function.
process_references_rec(references_list, search_depth, search_depth_max, test_var) # recursive function to implement height-first-search on citations
# citations_pub_obj_list: input list of citations as publication objects
# search_height: current search_height of height-first-search
# search_height_max: maximal search_height for dfs
def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
# adds next level to nodes/edges
for pub in citations_pub_obj_list:
new_citation_pub_obj_list = create_graph_structure_citations(pub, search_height, search_height_max, test_var)
# If the maximum height has not yet been reached, calls function recursivly with increased height
if (search_height < search_height_max):
process_citations_rec(new_citation_pub_obj_list, search_height+1, search_height_max, test_var)
# main function to call. Needs as input:
# doi_input_list: input list of dois
# search_height: max search height to process to
# search_depth: max search depth to process to
# test_var: only needed for unit test as True, default is False
def process_main(doi_input_list, search_height, search_depth, test_var = False): def process_main(doi_input_list, search_height, search_depth, test_var = False):
# ERROR-Handling doi_array = NULL # ERROR-Handling doi_array = NULL
if (len(doi_input_list) == 0): if (len(doi_input_list) == 0):
...@@ -206,24 +195,30 @@ def process_main(doi_input_list, search_height, search_depth, test_var = False): ...@@ -206,24 +195,30 @@ def process_main(doi_input_list, search_height, search_depth, test_var = False):
nodes = [] nodes = []
edges = [] edges = []
# initializes nodes/edges from input and gets a list with publication objects for citations and references returned
references_obj_list, citations_obj_list = initialize_nodes_list(doi_input_list,search_depth, search_height, test_var)
initialize_nodes_list(doi_input_list,test_var) # function calls to begin recursive processing up to max depth/height
process_citations_rec(doi_input_list, 0, search_height, test_var) process_citations_rec(citations_obj_list, 1, search_height, test_var)
process_references_rec(doi_input_list, 0, search_depth, test_var) process_references_rec(references_obj_list, 1, search_depth, test_var)
# adds edges between reference group and citation group of known publications
complete_inner_edges(test_var) complete_inner_edges(test_var)
# calls a skript to save nodes and edges of graph in .json file
output_to_json(nodes,edges) output_to_json(nodes,edges)
# only for internal testing # only for unit tests
doi_nodes = [] if (test_var == True):
doi_nodes_list = []
for node in nodes: for node in nodes:
doi_nodes.append(node.doi_url) doi_nodes_list.append(node.doi_url)
return(doi_nodes,edges) return(doi_nodes_list, edges)
# a function to print nodes and edges from a graph
def print_graph(nodes, edges): def print_graph(nodes, edges):
print("Knoten:\n") print("Knoten:\n")
for node in nodes: for node in nodes:
...@@ -233,24 +228,12 @@ def print_graph(nodes, edges): ...@@ -233,24 +228,12 @@ def print_graph(nodes, edges):
print(edge,"\n") print(edge,"\n")
# function to test cycles # program test, because there is no connection to UI yet.
def test_cycle(): def try_known_publications():
arr = [] doi_list = []
arr.append('doiz1') doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
#arr.append('doiz2')
nodes,edges = process_main(arr,1,1,True)
print(nodes, edges)
print_graph(nodes, edges)
# program test, because there is no connection to the input yet.
def test_print():
arr = []
#arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
#arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249') #arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
#arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332') doi_list.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
#arr.append('https://doi.org/10.1021/acs.jcim.0c00741') #arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
#arr.append('https://doi.org/10.1021/ci700007b') #arr.append('https://doi.org/10.1021/ci700007b')
...@@ -259,13 +242,6 @@ def test_print(): ...@@ -259,13 +242,6 @@ def test_print():
#arr.append[url] #arr.append[url]
nodes,edges = process_main(arr,2,2,True) nodes,edges = process_main(doi_list,2,2)
print_graph(nodes, edges) print_graph(nodes, edges)
\ No newline at end of file
#test_print()
#test_cycle()
#print(process_main(['doiz1'],1,1,True))
#print(process_main(['doi1'],0,0,True))
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Functions to generate a graph representing citations between multiple ACS/Nature journals
"""
__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
__email__ = "cis-project2021@zbh.uni-hamburg.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
from input_fj import input
from input_test import input_test_func
from json_demo import output_to_json
# adds every publication from input list to graph structure
# doi_input_list: list of publication dois from user
# search_depth_max: maximum search depth for references
# search_height_max: maximum search height for citations
# test_var: True when running unit tests (uses offline test input function)
# returns (references_pub_obj_list, citations_pub_obj_list) with the
# first-level reference/citation publication objects
def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
    references_pub_obj_list = []
    citations_pub_obj_list = []

    # iterate over a snapshot of the list: doi_input_list is mutated below,
    # and removing from a list while iterating it skips the following element
    for pub_doi in list(doi_input_list):

        #checks if its a test and chooses input function accordingly
        if(test_var):
            pub = input_test_func(pub_doi)
        else:
            pub = input(pub_doi)

        # checks if publication already exists in nodes
        not_in_nodes = True
        for node in nodes: # checks if a pub is already in nodes
            if (pub.doi_url == node.doi_url):
                not_in_nodes = False
                break
        if (not_in_nodes):
            nodes.append(pub)
            pub.group = "input"
        else:
            # duplicate input doi: drop it from the user's list
            doi_input_list.remove(pub_doi)

        # inserts references as publication objects into list and
        # inserts first depth references into nodes/edges if maximum search depth > 0
        for reference in create_graph_structure_references(pub, 0, search_depth_max, test_var):
            references_pub_obj_list.append(reference)

        # inserts citations as publication objects into list and
        # inserts first height citations into nodes if maximum search height > 0
        for citation in create_graph_structure_citations(pub, 0, search_height_max, test_var):
            citations_pub_obj_list.append(citation)

    return(references_pub_obj_list, citations_pub_obj_list)
# adds edges between citation and reference group
# connects publications that are already known in the graph; compares by
# doi_url because the citation/reference objects attached to a node are
# distinct Python objects from the node objects themselves, so a plain
# `citation in nodes` membership test would not match them
# test_var: kept for interface compatibility (currently unused here)
def complete_inner_edges(test_var):
    # doi_urls of all publications currently in the graph (nodes is not
    # modified in this function, so the set can be built once)
    known_dois = {node.doi_url for node in nodes}
    for node in nodes:
        if (node.group == "depth"):
            for citation in node.citations:
                if (citation.doi_url in known_dois and [citation.doi_url, node.doi_url] not in edges):
                    edges.append([citation.doi_url, node.doi_url])
        if (node.group == "height"):
            for reference in node.references:
                if (reference.doi_url in known_dois and [node.doi_url, reference.doi_url] not in edges):
                    edges.append([node.doi_url,reference.doi_url])
# adds a node for every publication unknown
# adds edges for references between publications
# pub: publication object whose references are processed
# search_depth / search_depth_max: current and maximum reference depth
# test_var: True when running unit tests (uses offline test input function)
# returns the list of newly created reference publication objects
def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
    new_reference_objs = []
    for reference in pub.references:
        # membership must be re-checked on every pass: nodes grows inside this loop
        already_known = any(reference.doi_url == node.doi_url for node in nodes)
        if not already_known:
            if search_depth < search_depth_max:
                #checks if its a test and chooses input function accordingly
                if test_var:
                    reference_pub_obj = input_test_func(reference.doi_url)
                else:
                    reference_pub_obj = input(reference.doi_url)
                reference_pub_obj.group = "depth"
                nodes.append(reference_pub_obj)
                edges.append([pub.doi_url, reference_pub_obj.doi_url])
                new_reference_objs.append(reference_pub_obj)
        # reference already in the graph: only connect it (if not yet connected)
        elif [pub.doi_url, reference.doi_url] not in edges:
            edges.append([pub.doi_url, reference.doi_url])
    return new_reference_objs
# recursive function to expand the reference side of the graph level by level
# references_pub_obj_list: input list of references as publication objects
# search_depth: current search_depth of the traversal
# search_depth_max: maximal search_depth for the traversal
def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
    # adds the next level to nodes/edges for every publication on this level
    for pub in references_pub_obj_list:
        next_level = create_graph_structure_references(pub, search_depth, search_depth_max, test_var)
        # recurse with increased depth while the maximum has not been reached
        if search_depth < search_depth_max:
            process_references_rec(next_level, search_depth + 1, search_depth_max, test_var)
# adds a node for every publication unknown
# adds edges for citations between publications
# pub: publication object whose citations are processed
# search_height / search_height_max: current and maximum citation height
# test_var: True when running unit tests (uses offline test input function)
# returns the list of newly created citation publication objects
def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
    new_citation_objs = []
    for citation in pub.citations:
        # membership must be re-checked on every pass: nodes grows inside this loop
        already_known = any(citation.doi_url == node.doi_url for node in nodes)
        if not already_known:
            if search_height < search_height_max:
                #checks if its a test and chooses input function accordingly
                if test_var:
                    citation_pub_obj = input_test_func(citation.doi_url)
                else:
                    citation_pub_obj = input(citation.doi_url)
                citation_pub_obj.group = "height"
                nodes.append(citation_pub_obj)
                edges.append([citation_pub_obj.doi_url, pub.doi_url])
                new_citation_objs.append(citation_pub_obj)
        # citation already in the graph: only connect it (if not yet connected)
        elif [citation.doi_url, pub.doi_url] not in edges:
            edges.append([citation.doi_url, pub.doi_url])
    return new_citation_objs
# recursive function to expand the citation side of the graph level by level
# citations_pub_obj_list: input list of citations as publication objects
# search_height: current search_height of the traversal
# search_height_max: maximal search_height for the traversal
def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
    # adds the next level to nodes/edges for every publication on this level
    for pub in citations_pub_obj_list:
        next_level = create_graph_structure_citations(pub, search_height, search_height_max, test_var)
        # recurse with increased height while the maximum has not been reached
        if search_height < search_height_max:
            process_citations_rec(next_level, search_height + 1, search_height_max, test_var)
# main function to build the citation/reference graph. Needs as input:
# doi_input_list: input list of dois
# search_height: max search height to process to (citations)
# search_depth: max search depth to process to (references)
# test_var: only needed for unit test as True, default is False
# writes the resulting graph to a .json file via output_to_json and
# returns (list of node doi_urls, edge list); returns None on invalid input
def process_main(doi_input_list, search_height, search_depth, test_var = False):
    # ERROR-Handling doi_array = NULL
    # previously these checks only printed and processing continued anyway;
    # now invalid input aborts early
    if (len(doi_input_list) == 0):
        print("Error, no input data")
        return

    # ERROR- if a negative number is entered for height
    if (search_height < 0):
        print("Error, search_height of search must be positive")
        return

    # ERROR- if a negative number is entered for depth
    if (search_depth < 0):
        print("Error, search_depth of search must be positive")
        return

    # create empty arrays for the nodes and edges, shared with the helpers
    global nodes, edges
    nodes = []
    edges = []

    # initializes nodes/edges from input and gets a list with publication
    # objects for citations and references returned
    references_obj_list, citations_obj_list = initialize_nodes_list(doi_input_list, search_depth, search_height, test_var)

    # recursive expansion up to the maximum height/depth
    process_citations_rec(citations_obj_list, 1, search_height, test_var)
    process_references_rec(references_obj_list, 1, search_depth, test_var)

    # adds edges between reference group and citation group of known publications
    complete_inner_edges(test_var)

    # calls a script to save nodes and edges of the graph in a .json file
    output_to_json(nodes, edges)

    # only for internal testing
    doi_nodes = [node.doi_url for node in nodes]
    return(doi_nodes, edges)
# a function to print nodes and edges from a graph
# nodes: list of publication objects (their titles are printed)
# edges: list of [source_doi, target_doi] pairs
def print_graph(nodes, edges):
    print("Knoten:\n")
    for publication in nodes:
        print(publication.title, "\n")
    print("\nKanten:\n")
    for connection in edges:
        print(connection, "\n")
# function to test cycles (uses the offline test input via test_var=True)
def test_cycle():
    doi_list = ['doiz1']
    #doi_list.append('doiz2')
    nodes, edges = process_main(doi_list, 1, 1, True)
    print(nodes, edges)
    print_graph(nodes, edges)
# program test, because there is no connection to the input yet.
# all sample dois are kept commented out for manual experimentation
def test_print():
    doi_list = []
    #doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
    #doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
    #doi_list.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
    #doi_list.append('https://doi.org/10.1021/acs.jcim.0c00741')
    #doi_list.append('https://doi.org/10.1021/ci700007b')
    #doi_list.append('https://doi.org/10.1021/acs.jcim.5b00292')
    #url = sys.argv[1]
    #doi_list.append[url]
    nodes, edges = process_main(doi_list, 2, 2, True)
    print_graph(nodes, edges)
#test_print()
#test_cycle()
#print(process_main(['doiz1'],1,1,True))
#print(process_main(['doi1'],0,0,True))
\ No newline at end of file
import unittest import unittest
from Processing import process_main from Processing_pub_objs_only import process_main
class ProcessingTest(unittest.TestCase): class ProcessingTest(unittest.TestCase):
def testCycle(self): def testCycle(self):
......
No preview for this file type
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment