Skip to content
Snippets Groups Projects
Commit e570bd72 authored by Malte Schokolowski's avatar Malte Schokolowski
Browse files

fixed bug and removed old Processing.py

parent 45b19745
No related branches found
No related tags found
1 merge request!7Main
......@@ -23,9 +23,10 @@ from json_demo import output_to_json
# adds every publication from input list to graph structure
# doi_input_list: list of publication dois from user
def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
references_pub_obj_list = []
citations_pub_obj_list = []
# TO-DO: Listenelemente auf Korrektheit überprüfen
def initialize_nodes_list(doi_input_list, test_var):
for pub_doi in doi_input_list:
#checks if its a test and chooses input function accordingly
......@@ -46,60 +47,40 @@ def initialize_nodes_list(doi_input_list, test_var):
else:
doi_input_list.remove(pub_doi)
# adds inner edges between citations and references to edges
def complete_inner_edges(test_var):
for node in nodes:
# checks if its a test and chooses input function accordingly
if (test_var):
pub = input_test_func(node.doi_url)
else:
pub = input(node.doi_url)
if (node.group == "depth"):
for citation in pub.citations:
if (citation in nodes and [citation.doi_url, pub.doi_url] not in edges):
edges.append([citation.doi_url, pub.doi_url])
if (node.group == "height"):
for reference in pub.references:
for node in nodes:
if (reference.doi_url in node.doi_url and [pub.doi_url, reference.doi_url] not in edges):
edges.append([pub.doi_url,reference.doi_url])
# inserts references as publication objects into list and
# inserts first depth references into nodes/edges if maximum search depth > 0
for reference in create_graph_structure_references(pub, 0, search_depth_max, test_var):
references_pub_obj_list.append(reference)
# inserts citations as publication objects into list and
# inserts first height citations into nodes if maximum search height > 0
for citation in create_graph_structure_citations(pub, 0, search_height_max, test_var):
citations_pub_obj_list.append(citation)
return(references_pub_obj_list, citations_pub_obj_list)
# adds a node for every publication unknown
# adds edges for citations between publications
def create_graph_structure_citations(pub, search_height, search_height_max):
for citation in pub.citations:
# checks if publication already exists in nodes
not_in_nodes = True
# adds edges between citation and reference group
def complete_inner_edges(test_var):
for node in nodes:
# checks every citation for duplication
if (citation.doi_url == node.doi_url):
not_in_nodes = False
break
if (not_in_nodes):
if (search_height <= search_height_max):
citation.group = "height"
nodes.append(citation)
edges.append([citation.doi_url,pub.doi_url])
# adds only an edge (citation already exists)
elif [citation.doi_url,pub.doi_url] not in edges:
edges.append([citation.doi_url,pub.doi_url])
if (node.group == "depth"):
for citation in node.citations:
for cit in nodes:
if (citation.doi_url == cit.doi_url and [citation.doi_url, node.doi_url] not in edges):
edges.append([citation.doi_url, node.doi_url])
if (node.group == "height"):
for reference in node.references:
for ref in nodes:
if (reference.doi_url == ref.doi_url and [node.doi_url, reference.doi_url] not in edges):
edges.append([node.doi_url,reference.doi_url])
# adds a node for every publication unknown
# adds edges for references between publications
def create_graph_structure_references(pub, search_depth, search_depth_max):
def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
references_pub_obj_list = []
for reference in pub.references:
# checks if publication already exists in nodes
not_in_nodes = True
for node in nodes:
# checks every reference for duplication
......@@ -107,86 +88,94 @@ def create_graph_structure_references(pub, search_depth, search_depth_max):
not_in_nodes = False
break
if (not_in_nodes):
if (search_depth <= search_depth_max):
reference.group = "depth"
nodes.append(reference)
edges.append([pub.doi_url,reference.doi_url])
# adds only an edge (citation already exists)
elif [pub.doi_url,reference.doi_url] not in edges:
edges.append([pub.doi_url,reference.doi_url])
# recursive function to implement height-first-search on citations
# doi_citations: input list of citet dois
# search_height: current search_height of height-first-search
# search_height_max: maximal search_height for dfs
def process_citations_rec(doi_citations, search_height, search_height_max, test_var):
# height of search is increased by 1 with each recursive call
search_height += 1
# create class object for every citation from list
for pub_doi in doi_citations:
if (search_depth < search_depth_max):
#checks if its a test and chooses input function accordingly
if (test_var):
pub = input_test_func(pub_doi)
reference_pub_obj = input_test_func(reference.doi_url)
else:
pub = input(pub_doi)
create_graph_structure_citations(pub, search_height, search_height_max)
# If the maximum height has not yet been reached, all references from the publication
# are written to an array and the function is called again with this array.
if (search_height < search_height_max):
citations_list = []
for citation in pub.citations:
# currently only the references with acs are stored in the URL, because we can't
# extract the info from other sources.
if ("acs" in citation.doi_url or test_var == True):
citations_list.append(citation.doi_url)
reference_pub_obj = input(reference.doi_url)
# recursive call of function.
process_citations_rec(citations_list, search_height, search_height_max, test_var)
reference_pub_obj.group = "depth"
nodes.append(reference_pub_obj)
edges.append([pub.doi_url,reference_pub_obj.doi_url])
references_pub_obj_list.append(reference_pub_obj)
# adds edge only if citation already exists
elif [pub.doi_url,reference.doi_url] not in edges:
edges.append([pub.doi_url,reference.doi_url])
return references_pub_obj_list
# recursive function to implement height-first-search on references
# doi_references: input list of referenced dois
# references_pub_obj_list: input list of references as publication objects
# search_depth: current search_depth of height-first-search
# search_depth_max: maximal search_depth for dfs
def process_references_rec(doi_references, search_depth, search_depth_max, test_var):
# The depth is increased by 1 with each recursive call
search_depth += 1
def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
# adds next level to nodes/edges
for pub in references_pub_obj_list:
new_reference_pub_obj_list = create_graph_structure_references(pub, search_depth, search_depth_max, test_var)
# If the maximum height has not yet been reached, calls function recursivly with increased height
if (search_depth < search_depth_max):
process_references_rec(new_reference_pub_obj_list, search_depth+1, search_depth_max, test_var)
# create class object for every citation from list
for pub_doi in doi_references:
# adds a node for every publication unknown
# adds edges for citations between publications
def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
citations_pub_obj_list = []
for citation in pub.citations:
not_in_nodes = True
for node in nodes:
# checks every citation for duplication
if (citation.doi_url == node.doi_url):
not_in_nodes = False
break
if (not_in_nodes):
if (search_height < search_height_max):
#checks if its a test and chooses input function accordingly
if (test_var):
pub = input_test_func(pub_doi)
citation_pub_obj = input_test_func(citation.doi_url)
else:
pub = input(pub_doi)
citation_pub_obj = input(citation.doi_url)
create_graph_structure_references(pub, search_depth, search_depth_max)
# If the maximum depth has not yet been reached, all references from the publication
# are written to an array and the function is called again with this array.
if (search_depth < search_depth_max):
references_list = []
for reference in pub.references:
citation_pub_obj.group = "height"
nodes.append(citation_pub_obj)
edges.append([citation_pub_obj.doi_url,pub.doi_url])
citations_pub_obj_list.append(citation_pub_obj)
# adds only edge if citation already exists
elif [citation.doi_url,pub.doi_url] not in edges:
edges.append([citation.doi_url,pub.doi_url])
return citations_pub_obj_list
# currently only the references with acs are stored in the URL, because we can't
# extract the info from other sources.
if ("acs" in reference.doi_url or test_var == True):
references_list.append(reference.doi_url)
# recursive call of function.
process_references_rec(references_list, search_depth, search_depth_max, test_var)
# recursive function to implement height-first-search on citations
# citations_pub_obj_list: input list of citations as publication objects
# search_height: current search_height of height-first-search
# search_height_max: maximal search_height for dfs
def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
# adds next level to nodes/edges
for pub in citations_pub_obj_list:
new_citation_pub_obj_list = create_graph_structure_citations(pub, search_height, search_height_max, test_var)
# If the maximum height has not yet been reached, calls function recursivly with increased height
if (search_height < search_height_max):
process_citations_rec(new_citation_pub_obj_list, search_height+1, search_height_max, test_var)
# main function to call. Needs as input:
# doi_input_list: input list of dois
# search_height: max search height to process to
# search_depth: max search depth to process to
# test_var: only needed for unit test as True, default is False
def process_main(doi_input_list, search_height, search_depth, test_var = False):
# ERROR-Handling doi_array = NULL
if (len(doi_input_list) == 0):
......@@ -206,24 +195,30 @@ def process_main(doi_input_list, search_height, search_depth, test_var = False):
nodes = []
edges = []
# initializes nodes/edges from input and gets a list with publication objects for citations and references returned
references_obj_list, citations_obj_list = initialize_nodes_list(doi_input_list,search_depth, search_height, test_var)
initialize_nodes_list(doi_input_list,test_var)
process_citations_rec(doi_input_list, 0, search_height, test_var)
process_references_rec(doi_input_list, 0, search_depth, test_var)
# function calls to begin recursive processing up to max depth/height
process_citations_rec(citations_obj_list, 1, search_height, test_var)
process_references_rec(references_obj_list, 1, search_depth, test_var)
# adds edges between reference group and citation group of known publications
complete_inner_edges(test_var)
# calls a skript to save nodes and edges of graph in .json file
output_to_json(nodes,edges)
# only for internal testing
doi_nodes = []
# only for unit tests
if (test_var == True):
doi_nodes_list = []
for node in nodes:
doi_nodes.append(node.doi_url)
return(doi_nodes,edges)
doi_nodes_list.append(node.doi_url)
return(doi_nodes_list, edges)
# a function to print nodes and edges from a graph
def print_graph(nodes, edges):
print("Knoten:\n")
for node in nodes:
......@@ -233,24 +228,12 @@ def print_graph(nodes, edges):
print(edge,"\n")
# function to test cycles
def test_cycle():
arr = []
arr.append('doiz1')
#arr.append('doiz2')
nodes,edges = process_main(arr,1,1,True)
print(nodes, edges)
print_graph(nodes, edges)
# program test, because there is no connection to the input yet.
def test_print():
arr = []
#arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
# program test, because there is no connection to UI yet.
def try_known_publications():
doi_list = []
doi_list.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
#arr.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
#arr.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
doi_list.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
#arr.append('https://doi.org/10.1021/acs.jcim.0c00741')
#arr.append('https://doi.org/10.1021/ci700007b')
......@@ -259,13 +242,6 @@ def test_print():
#arr.append[url]
nodes,edges = process_main(arr,2,2,True)
nodes,edges = process_main(doi_list,2,2)
print_graph(nodes, edges)
\ No newline at end of file
#test_print()
#test_cycle()
#print(process_main(['doiz1'],1,1,True))
#print(process_main(['doi1'],0,0,True))
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Functions to generate a graph representing citations between multiple ACS/Nature journals
"""
__authors__ = "Donna Löding, Alina Molkentin, Xinyi Tang, Judith Große, Malte Schokolowski"
__email__ = "cis-project2021@zbh.uni-hamburg.de"
__status__ = "Production"
#__copyright__ = ""
#__credits__ = ["", "", "", ""]
#__license__ = ""
#__version__ = ""
#__maintainer__ = ""
from bs4 import BeautifulSoup as bs
import requests as req
import sys
from pathlib import Path
from input_fj import input
from input_test import input_test_func
from json_demo import output_to_json
# adds every publication from input list to graph structure
# doi_input_list: list of publication dois from user
def initialize_nodes_list(doi_input_list, search_depth_max, search_height_max, test_var):
    """Add every input publication to the global graph and collect its direct neighbours.

    For each doi in doi_input_list the publication is fetched, appended to the
    global ``nodes`` list (group "input") unless already present, and its first
    level of references/citations is inserted via the create_graph_structure_*
    helpers.

    Parameters:
        doi_input_list: list of publication dois from the user; dois that are
            already known are removed from it in place.
        search_depth_max: maximum reference (depth) search level.
        search_height_max: maximum citation (height) search level.
        test_var: True selects the unit-test input function instead of the
            real web-scraping input.

    Returns:
        (references_pub_obj_list, citations_pub_obj_list): the first-level
        reference and citation publication objects, used as the starting
        point for the recursive expansion.
    """
    references_pub_obj_list = []
    citations_pub_obj_list = []

    # BUGFIX: iterate over a snapshot of the list. The original iterated the
    # list it removed from, which makes the for-loop silently skip the element
    # that follows every removed doi.
    for pub_doi in list(doi_input_list):
        # checks if its a test and chooses input function accordingly
        if (test_var):
            pub = input_test_func(pub_doi)
        else:
            pub = input(pub_doi)

        # checks if publication already exists in nodes
        not_in_nodes = True
        for node in nodes:
            if (pub.doi_url == node.doi_url):
                not_in_nodes = False
                break
        if (not_in_nodes):
            nodes.append(pub)
            pub.group = "input"
        else:
            # duplicate input doi: drop it from the caller's list
            doi_input_list.remove(pub_doi)

        # inserts references as publication objects into list and
        # inserts first depth references into nodes/edges if maximum search depth > 0
        for reference in create_graph_structure_references(pub, 0, search_depth_max, test_var):
            references_pub_obj_list.append(reference)

        # inserts citations as publication objects into list and
        # inserts first height citations into nodes if maximum search height > 0
        for citation in create_graph_structure_citations(pub, 0, search_height_max, test_var):
            citations_pub_obj_list.append(citation)

    return(references_pub_obj_list, citations_pub_obj_list)
# adds edges between citation and reference group
def complete_inner_edges(test_var):
    """Add the missing edges that connect the citation and reference groups.

    For every "depth" node, an edge is added from each of its citations that is
    itself a known node; for every "height" node, an edge is added to each of
    its references that is a known node.

    Parameters:
        test_var: unused here; kept so the call signature matches the other
            processing functions.
    """
    # BUGFIX: the original tested `citation in nodes` / `reference in nodes`,
    # which relies on object equality. Publication objects produced by separate
    # input() calls are distinct objects, so that membership test fails and the
    # inner edges are never created. Compare doi_urls instead (and use a set
    # for O(1) lookups).
    known_dois = {node.doi_url for node in nodes}
    for node in nodes:
        if (node.group == "depth"):
            for citation in node.citations:
                edge = [citation.doi_url, node.doi_url]
                if (citation.doi_url in known_dois and edge not in edges):
                    edges.append(edge)
        if (node.group == "height"):
            for reference in node.references:
                edge = [node.doi_url, reference.doi_url]
                if (reference.doi_url in known_dois and edge not in edges):
                    edges.append(edge)
# adds a node for every publication unknown
# adds edges for references between publications
def create_graph_structure_references(pub, search_depth, search_depth_max, test_var):
    """Insert the references of *pub* into the global graph.

    Unknown references are fetched and appended to ``nodes`` (group "depth")
    together with an edge pub -> reference, but only while the current depth is
    below the maximum. Already-known references only get the connecting edge.

    Returns the list of newly created reference publication objects.
    """
    new_reference_objs = []
    for reference in pub.references:
        # is this reference already represented by some node?
        already_known = any(reference.doi_url == node.doi_url for node in nodes)

        if not already_known:
            # unknown publication: fetch and attach it only below max depth
            if search_depth < search_depth_max:
                # checks if its a test and chooses input function accordingly
                if test_var:
                    reference_pub_obj = input_test_func(reference.doi_url)
                else:
                    reference_pub_obj = input(reference.doi_url)
                reference_pub_obj.group = "depth"
                nodes.append(reference_pub_obj)
                edges.append([pub.doi_url, reference_pub_obj.doi_url])
                new_reference_objs.append(reference_pub_obj)
        # known publication: make sure the edge exists, nothing else to do
        elif [pub.doi_url, reference.doi_url] not in edges:
            edges.append([pub.doi_url, reference.doi_url])

    return new_reference_objs
# recursive function to implement height-first-search on references
# references_pub_obj_list: input list of references as publication objects
# search_depth: current search_depth of height-first-search
# search_depth_max: maximal search_depth for dfs
def process_references_rec(references_pub_obj_list, search_depth, search_depth_max, test_var):
    """Recursively expand the reference ("depth") side of the graph.

    Each publication in the given level has its references attached via
    create_graph_structure_references; the newly created objects form the next
    level, processed with search_depth + 1 until search_depth_max is reached.
    """
    # the recursion condition does not depend on the publication, hoist it
    descend = search_depth < search_depth_max
    for publication in references_pub_obj_list:
        # attach the next level of references below this publication
        next_level = create_graph_structure_references(
            publication, search_depth, search_depth_max, test_var)
        if descend:
            process_references_rec(next_level, search_depth + 1, search_depth_max, test_var)
# adds a node for every publication unknown
# adds edges for citations between publications
def create_graph_structure_citations(pub, search_height, search_height_max, test_var):
    """Insert the citations of *pub* into the global graph.

    Unknown citations are fetched and appended to ``nodes`` (group "height")
    together with an edge citation -> pub, but only while the current height is
    below the maximum. Already-known citations only get the connecting edge.

    Returns the list of newly created citation publication objects.
    """
    new_citation_objs = []
    for citation in pub.citations:
        # is this citation already represented by some node?
        already_known = any(citation.doi_url == node.doi_url for node in nodes)

        if not already_known:
            # unknown publication: fetch and attach it only below max height
            if search_height < search_height_max:
                # checks if its a test and chooses input function accordingly
                if test_var:
                    citation_pub_obj = input_test_func(citation.doi_url)
                else:
                    citation_pub_obj = input(citation.doi_url)
                citation_pub_obj.group = "height"
                nodes.append(citation_pub_obj)
                edges.append([citation_pub_obj.doi_url, pub.doi_url])
                new_citation_objs.append(citation_pub_obj)
        # known publication: make sure the edge exists, nothing else to do
        elif [citation.doi_url, pub.doi_url] not in edges:
            edges.append([citation.doi_url, pub.doi_url])

    return new_citation_objs
# recursive function to implement height-first-search on citations
# citations_pub_obj_list: input list of citations as publication objects
# search_height: current search_height of height-first-search
# search_height_max: maximal search_height for dfs
def process_citations_rec(citations_pub_obj_list, search_height, search_height_max, test_var):
    """Recursively expand the citation ("height") side of the graph.

    Each publication in the given level has its citations attached via
    create_graph_structure_citations; the newly created objects form the next
    level, processed with search_height + 1 until search_height_max is reached.
    """
    # the recursion condition does not depend on the publication, hoist it
    ascend = search_height < search_height_max
    for publication in citations_pub_obj_list:
        # attach the next level of citations above this publication
        next_level = create_graph_structure_citations(
            publication, search_height, search_height_max, test_var)
        if ascend:
            process_citations_rec(next_level, search_height + 1, search_height_max, test_var)
def process_main(doi_input_list, search_height, search_depth, test_var = False):
    """Build the citation graph for the given dois and write it to a .json file.

    Parameters:
        doi_input_list: input list of dois.
        search_height: maximum citation search height.
        search_depth: maximum reference search depth.
        test_var: only needed for unit tests as True, default is False.

    Returns:
        (doi_nodes, edges): the doi_urls of all graph nodes and the edge list
        (only used for internal testing).
    """
    # input validation: problems are reported but deliberately non-fatal
    if not doi_input_list:
        print("Error, no input data")
    if search_height < 0:
        print("Error, search_height of search must be positive")
    if search_depth < 0:
        print("Error, search_depth of search must be positive")

    # fresh module-level node and edge arrays for this run
    global nodes, edges
    nodes = []
    edges = []

    # initializes nodes/edges from input and gets a list with publication
    # objects for citations and references returned
    references_obj_list, citations_obj_list = initialize_nodes_list(
        doi_input_list, search_depth, search_height, test_var)

    # recursive processing up to the maximum height/depth
    process_citations_rec(citations_obj_list, 1, search_height, test_var)
    process_references_rec(references_obj_list, 1, search_depth, test_var)

    # connect the citation and reference groups where they overlap
    complete_inner_edges(test_var)

    # persist the graph as .json
    output_to_json(nodes, edges)

    # only for internal testing
    doi_nodes = [node.doi_url for node in nodes]
    return (doi_nodes, edges)
# a function to print nodes and edges from a graph
def print_graph(nodes, edges):
    """Print the node titles and the edge list of a graph to stdout."""
    print("Knoten:\n")
    for current_node in nodes:
        # f-string reproduces print(x, "\n"): value, space, newline
        print(f"{current_node.title} \n")
    print("\nKanten:\n")
    for current_edge in edges:
        print(f"{current_edge} \n")
# function to test cycles
def test_cycle():
    """Run the pipeline on the cyclic test doi and print the resulting graph."""
    cyclic_dois = ['doiz1']
    # cyclic_dois.append('doiz2')
    nodes, edges = process_main(cyclic_dois, 1, 1, True)
    print(nodes, edges)
    print_graph(nodes, edges)
# program test, because there is no connection to the input yet.
def test_print():
    """Run the pipeline on a sample doi list (currently empty) and print the graph."""
    sample_dois = []
    # sample_dois.append('https://pubs.acs.org/doi/10.1021/acs.jcim.9b00249')
    # sample_dois.append('https://doi.org/10.1021/acs.jmedchem.0c01332')
    # sample_dois.append('https://doi.org/10.1021/acs.jcim.0c00741')
    # sample_dois.append('https://doi.org/10.1021/ci700007b')
    # sample_dois.append('https://doi.org/10.1021/acs.jcim.5b00292')
    # url = sys.argv[1]
    # sample_dois.append(url)
    nodes, edges = process_main(sample_dois, 2, 2, True)
    print_graph(nodes, edges)
#test_print()
#test_cycle()
#print(process_main(['doiz1'],1,1,True))
#print(process_main(['doi1'],0,0,True))
\ No newline at end of file
import unittest
from Processing import process_main
from Processing_pub_objs_only import process_main
class ProcessingTest(unittest.TestCase):
def testCycle(self):
......
No preview for this file type
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment