From be58b1809786051f92f4e22d84772d47ed8c9c9a Mon Sep 17 00:00:00 2001
From: Florian Jochens <fj@andaco.de>
Date: Tue, 23 Nov 2021 14:48:03 +0100
Subject: [PATCH] Add newest version of the input files

---
 example_input.py                    |  10 +-
 input/.publication.py.swp           | Bin 16384 -> 0 bytes
 input/README.md                     |  12 ++-
 input/get/acs.py                    |  37 +++----
 input/get/acs_fj.py                 | 100 -------------------
 input/get/journal_fetcher.py        |  14 +--
 input/get/nature.py                 |   4 -
 input/get/template_.py              |  10 +-
 input/publication.py                |  18 ++--
 input/tempdir/input_fj.py           | 144 ----------------------------
 input/tempdir/pub.py                |  32 -------
 input/tempdir/test.py               |  15 ---
 input/test_input_get_publication.py |  28 ------
 13 files changed, 51 insertions(+), 373 deletions(-)
 delete mode 100644 input/.publication.py.swp
 delete mode 100755 input/get/acs_fj.py
 mode change 100755 => 100644 input/get/nature.py
 delete mode 100755 input/tempdir/input_fj.py
 delete mode 100644 input/tempdir/pub.py
 delete mode 100755 input/tempdir/test.py
 delete mode 100755 input/test_input_get_publication.py

diff --git a/example_input.py b/example_input.py
index a8331cb..febbc7b 100755
--- a/example_input.py
+++ b/example_input.py
@@ -1,14 +1,10 @@
 #!/usr/bin/env python3
 
-from input.interface import InputInterface
-import input.publication
+from input.interface import InputInterface as Input
 
 def main(url: str):
-    #print(get_publication(url))
-    print(InputInterface.get_publication(url))
-    #pub.print_pub()
+    print(Input.get_publication(url))
 
 if __name__ == "__main__":
     #main("https://doi.org/10.1021/acs.jcim.1c00203")
-    #main("https://doi.org/10.1021/acs.jcim.1c00917")
-    main("https://doi.org/10.1021/acs.jcim.5b00332")
+    main("https://pubs.acs.org/doi/10.1021/acs.jcim.5b00332")
diff --git a/input/.publication.py.swp b/input/.publication.py.swp
deleted file mode 100644
index d2a0a137478b10579a081fbe5111873dda6877d0..0000000000000000000000000000000000000000
Binary files a/input/.publication.py.swp and /dev/null differ
diff --git a/input/README.md b/input/README.md
index 0ebd7e1..7776ee0 100644
--- a/input/README.md
+++ b/input/README.md
@@ -5,12 +5,12 @@ Input-Package to fetch publication information with a given url.
 ## Usage/Examples
 
 ```python
-from input.interface import get_publication
+from input.interface import InputInterface as Input
 from input.publication import Publication
 
 def main(url):
     try:
-        pub = get_publication(url)
+        pub = Input.get_publication(url)
     except Exception as error:
         raise error
 
@@ -21,6 +21,14 @@ if __name__=="__main__":
     main("https://doi.org/10.1021/acs.chemrev.8b00728")
 ```
 
+
+## Testing
+
+```sh
+python -m unittest input/test/<file.py> -v
+# for all tests in directory
+python -m unittest discover input/test -v
+```
 ## Authors
 - Florian Jochens
 - Sam Ockenden
diff --git a/input/get/acs.py b/input/get/acs.py
index 3a54a15..cfc1d6d 100755
--- a/input/get/acs.py
+++ b/input/get/acs.py
@@ -21,13 +21,16 @@ class Fetcher(JournalFetcher):
     SUPPORTED_JOURNALS = ['1021']
 
     @staticmethod
     def can_use_url(url: str) -> bool:
         """
         Uses Regex to extract journal specific substrings in Doi.
TODO: Support non Doi-urls """ - matched_url = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url) - return matched_url[4] in Fetcher.SUPPORTED_JOURNALS + matched_url = re.match(r'^(https?://)?(doi.org/|pubs.acs.org/doi/)?(10.(\d{4})/\w+.\S+)', url.strip(". \t\r\n")) + if matched_url is not None: + return matched_url[4] in Fetcher.SUPPORTED_JOURNALS + else: + return False @staticmethod def get_publication(url: str) -> Publication: @@ -65,16 +68,19 @@ class Fetcher(JournalFetcher): references_soup = ref_cit_soup.select('ol#references') if references_soup != []: for reference in references_soup[0].select('li'): - ref_doi = "https://doi.org/{}".format(reference.select('.refDoi')[0].text.strip()[5:])\ - if reference.select('.refDoi') != [] else "None" + if reference.select('.refDoi') != []: + ref_doi = "https://doi.org/{}".format(reference.select('.refDoi')[0].text.strip()[5:]) + else: + # No Doi -> No Paper + continue ref_title = reference.select('.NLM_article-title')[0].text\ - if reference.select('.NLM_article-title') != [] else "None" + if reference.select('.NLM_article-title') != [] else None ref_journal = reference.select('i')[0].text\ - if reference.select('i') != [] else "None" + if reference.select('i') != [] else None ref_contributors=[] for author in reference.select('.NLM_contrib-group'): - ref_contributors.append(author.text) + ref_contributors.append(author.text.replace("\n", " ").replace("\r", "")) references.append(Citation(ref_doi, ref_title, ref_journal, ref_contributors, cit_type="Reference")) @@ -82,14 +88,18 @@ class Fetcher(JournalFetcher): citation_soup = ref_cit_soup.select('.cited-content_cbyCitation') if citation_soup != []: for citation in citation_soup[0].select('li'): - cit_doi = citation.select('a[title="DOI URL"]')[0].text\ - if citation.select('a[title="DOI URL"]') != [] else "None" + if citation.select('a[title="DOI URL"]') != []: + cit_doi = citation.select('a[title="DOI URL"]')[0].text + else: + # No Doi -> No Paper + continue cit_title = citation.select('.cited-content_cbyCitation_article-title')[0].text\ if citation.select('.cited-content_cbyCitation_article-title')!= [] else "None" cit_journal = citation.select('.cited-content_cbyCitation_journal-name')[0].text\ if citation.select('.cited-content_cbyCitation_journal-name') != [] else "None" cit_contributors =[] - cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0].text.split(', ') + cit_contributors = citation.select('.cited-content_cbyCitation_article-contributors')[0]\ + .text.replace("\n", " ").replace("\r", "").split(', ') # clean up of the last Entry cit_contributors_last = cit_contributors.pop().strip(". 
") if cit_contributors_last != '': @@ -98,8 +108,3 @@ class Fetcher(JournalFetcher): return Publication(doi_url, title, contributors, journal, published , subjects, num_citations, references, citations) - - - @staticmethod - def test_fetcher(): - pass diff --git a/input/get/acs_fj.py b/input/get/acs_fj.py deleted file mode 100755 index 2880838..0000000 --- a/input/get/acs_fj.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 - -""" -Child class of JournalFetcher -JCIM -""" - -# import re -from input.get.journal_fetcher import JournalFetcher -from input.publication import Publication, Citation, Reference -import requests as req -from bs4 import BeautifulSoup as bs - -class Fetcher(JournalFetcher): - - """ - """ - - # TODO: Naming-Convention: - # Class: 'Fetcher' - # file: input_get_[journal-/organisation-name] - # format = "input_get_[a-z]*.py" allowed - # TODO: List of Compatable Journals - _SUPPORTED_JOURNALS = [] - - @staticmethod - def can_use_url(url: str) -> bool: - """ - Checks if given url links to a supported journal. - """ - - # TODO: Check the URL for compatability - # re.match in _SUPPORTED_JOURNALS - return True - - @staticmethod - def get_publication(url: str) -> Publication: - return input(url) - - - @staticmethod - def test_fetcher(): - pass - -def get_article_info(soup): - header = soup.find('div', class_ = 'article_header-left pull-left') - article_title = header.find('span', class_ = 'hlFld-Title').text - publication_date = header.find('span', class_ = 'pub-date-value').text - for link in header.find('div', class_ = 'article_header-doiurl'): - doi_url = link.get('href') - subs = header.find('div', class_ = 'article_header-taxonomy') - subjects = [] - for sub in subs.find_all('a'): - subjects.append(sub.get('title')) - cons = header.find('ul', class_ = 'loa') - contributors = [] - for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'): - contributors.append(con.text) - numc = header.find('div', class_ = 'articleMetrics_count') - if not numc.a: - num_citations = 0 - else: - num_citations = numc.a.text - pub = Publication(doi_url, article_title, contributors, "JCIM", - publication_date, subjects, num_citations) - #pub = Publication(article_title, publication_date, contributors, doi_url, - # subjects, num_citations) - return pub - - -def get_citation_info(pub, num_citations, soup): - details = soup.find('ol', class_ = 'cited-content_cbyCitation') - titles = [] - for title in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-title'): - titles.append(title.text.replace('.', '')) - journal_names = [] - for name in details.find_all('span', - class_ = 'cited-content_cbyCitation_journal-name'): - journal_names.append(name.text) - doi_urls = [] - for url in details.find_all('a'): - doi_urls.append(url.get('href')) - # TODO: There are a few diffrent types how Contributors are listed - contributors = [] - for contrib in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-contributors'): - contributors.append(contrib.text) - for i in range(0, int(num_citations)): - pub.citations.append(Citation(doi_urls[i], titles[i], journal_names[i], \ - contributors[i])) - -def input(url): - html_text = req.get(url).text - soup = bs(html_text, 'html.parser') - - pub = get_article_info(soup) - if int(pub.num_citations) > 0: - get_citation_info(pub, int(pub.num_citations), soup) - return pub diff --git a/input/get/journal_fetcher.py b/input/get/journal_fetcher.py index 097eb24..aad5857 100755 --- a/input/get/journal_fetcher.py +++ 
b/input/get/journal_fetcher.py @@ -14,6 +14,7 @@ class JournalFetcher(metaclass=ABCMeta): """ This is a abstract-class for fetcher modules """ + @staticmethod def get_soup(url: str) -> BeautifulSoup: """ @@ -31,7 +32,8 @@ class JournalFetcher(metaclass=ABCMeta): raise SystemExit(err) return BeautifulSoup(req.content, 'html.parser') - + + @staticmethod @abstractmethod def can_use_url(url: str) -> bool: @@ -41,6 +43,7 @@ class JournalFetcher(metaclass=ABCMeta): """ raise AttributeError("JournalFetcher for '{}' hasnt implemented 'can_use_url()'".format(url)) + @staticmethod @abstractmethod def get_publication(url: str) -> Publication: @@ -49,12 +52,3 @@ class JournalFetcher(metaclass=ABCMeta): Creates a Publication-instance. """ raise AttributeError("JournalFetcher for '{}' hasnt implemented 'get_publication()'".format(url)) - - @staticmethod - @abstractmethod - def test_fetcher(): - """ - Abstract-function to be implemented in subclass. - Unit-test for the class. - """ - raise AttributeError("JournalFetcher: Subclass hasnt implemented 'test_fetcher()'") diff --git a/input/get/nature.py b/input/get/nature.py old mode 100755 new mode 100644 index d08d74f..4d206f4 --- a/input/get/nature.py +++ b/input/get/nature.py @@ -57,7 +57,3 @@ class Fetcher(JournalFetcher): # TODO: Exceptions-handling # raise ValueException("Cant Fetch: '{}'".format(error)) # return None - - @staticmethod - def test_fetcher(): - pass diff --git a/input/get/template_.py b/input/get/template_.py index 72f3cf9..da200bc 100755 --- a/input/get/template_.py +++ b/input/get/template_.py @@ -31,7 +31,9 @@ class Fetcher(JournalFetcher): # TODO: Check the URL for compatability # url_re = re.match(r'(https?://)?(doi.org/)?(10.(\d{4})/\w+.\S+)', url) - # return url_re[4] in SUPPORTED_JOURNALS + # if url_re is not None: + # return url_re[4] in SUPPORTED_JOURNALS + # else: return False @staticmethod @@ -45,8 +47,4 @@ class Fetcher(JournalFetcher): # doi,title,contributors[],journal,publication_date,subjects[],references[],citations[] # TODO: Create new Publication-instance # return Publication(doi,title,contributors[],journal,publication_date,subjects[],num_citation=None ,references[],citations[]) - return None - - @staticmethod - def test_fetcher(): - pass + return None \ No newline at end of file diff --git a/input/publication.py b/input/publication.py index f819bed..6de2373 100755 --- a/input/publication.py +++ b/input/publication.py @@ -4,10 +4,10 @@ class Publication: """ Represents a Publications """ - def __init__(self, doi_url: str, title: str - , contributors: str, journal: str - , publication_date: str, subjects = None, num_citations = None - , references = None, citations = None ): + def __init__(self, doi_url: str, title: str \ + , contributors: list[str], journal: str \ + , publication_date: str, subjects: list[str], num_citations: int = None \ + , references: list[any] = None, citations: list[any] = None ): """ Parameters ---------- @@ -72,7 +72,7 @@ class Publication: citation_string.append(citation.__str__()) return citation_string - def citations(self, citation) -> None: + def add_citations(self, citation) -> None: """ Appends a list of Citations or Citation to self.citations. 
@@ -132,9 +132,9 @@ Subjects:''') class Citation: - def __init__(self, doi_url: str, title: str - , journal: str, contributors = None - , cit_type = "Citation"): + def __init__(self, doi_url: str, title: str \ + , journal: str, contributors: list[str] \ + , cit_type: str = "Citation"): """ Parameters ---------- @@ -168,7 +168,7 @@ class Citation: # This is just a replica of Citations class Reference: - def __init__(self, doi_url, title, journal, contributors): + def __init__(self, doi_url: str, title: str, journal: str, contributors: list[str]): self.title = title self.doi_url = doi_url self.journal = journal diff --git a/input/tempdir/input_fj.py b/input/tempdir/input_fj.py deleted file mode 100755 index 00bb012..0000000 --- a/input/tempdir/input_fj.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 -""" -Functions for information retrieval of articles from the ACS journal JCIM - -""" - -__author__ = "Florian Jochens" -__email__ = "fj@andaco.de" -__status__ = "Production" -#__copyright__ = "" -#__credits__ = ["", "", "", ""] -#__license__ = "" -#__version__ = "" -#__maintainer__ = "" - -from bs4 import BeautifulSoup as bs -import requests as req -import sys -from pathlib import Path - -class Publication: - #_registry = [] - _citations = [] - - def __init__(self, title, publication_date, contributors, doi_url, - subjects, num_citations): - #self._registry.append(self) - self.title = title - self.publication_date = publication_date - self.contributors = contributors - self.doi_url = doi_url - self.subjects = subjects - self.num_citations = num_citations - -class Citation: - def __init__(self, title, journal, contributors, doi_url): - self.title = title - self.journal = journal - self.contributors = contributors - self.doi_url = doi_url - -def get_article_info(soup): - header = soup.find('div', class_ = 'article_header-left pull-left') - article_title = header.find('span', class_ = 'hlFld-Title').text - publication_date = header.find('span', class_ = 'pub-date-value').text - for link in header.find('div', class_ = 'article_header-doiurl'): - doi_url = link.get('href') - subs = header.find('div', class_ = 'article_header-taxonomy') - subjects = [] - for sub in subs.find_all('a'): - subjects.append(sub.get('title')) - cons = header.find('ul', class_ = 'loa') - contributors = [] - for con in cons.find_all('span', class_ = 'hlFld-ContribAuthor'): - contributors.append(con.text) - numc = header.find('div', class_ = 'articleMetrics_count') - if not numc.a: - num_citations = 0 - else: - num_citations = numc.a.text - - pub = Publication(article_title, publication_date, contributors, doi_url, - subjects, num_citations) - return pub - -def get_download_url(): - export = soup.find('div', class_ = 'cit-download-dropdown_content') - url = 'https://pubs.acs.org' - for link in export.find_all('a'): - if link.get('title') == 'Citation and references': - url += link.get('href') - print(url) - return url - -def download(url): # Download citation and references file - if url.find('='): - filename = url.rsplit('=', 1)[1] - path = Path(('./files/' + filename)) - if path.is_file(): - print("File already exists") - else: - print("File does not exist") - -def get_citation_info(pub, num_citations, soup): - pub._citations = [] - details = soup.find('ol', class_ = 'cited-content_cbyCitation') - titles = [] - for title in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-title'): - titles.append(title.text.replace('.', '')) - journal_names = [] - for name in details.find_all('span', - 
class_ = 'cited-content_cbyCitation_journal-name'): - journal_names.append(name.text) - doi_urls = [] - for url in details.find_all('a'): - doi_urls.append(url.get('href')) - contributors = [] - for contrib in details.find_all('span', - class_ = 'cited-content_cbyCitation_article-contributors'): - contributors.append(contrib.text) - for i in range(0, int(num_citations)): - pub._citations.append(Citation(titles[i], journal_names[i], - contributors[i], doi_urls[i])) -def print_pub_info(pub): - print(f'''Article title: {pub.title} -Publication date: {pub.publication_date} -DOI-URL: {pub.doi_url} - -Subjects:''') - print(*(pub.subjects), sep = ", ") - print('\nContributors:') - print(*(pub.contributors), sep = ", ") - - if int(pub.num_citations) > 0: - if int(pub.num_citations) == 1: - print(f'\nThis publication is cited by the following publication:\n') - else: - print(f'\nThis publication is cited by the following {pub.num_citations} publications:\n') - for citation in pub._citations: - print(f''' - Title: {citation.title} - Journal: {citation.journal} - Contributors: {citation.contributors} - DOI-URL: {citation.doi_url} - ''') - else: - print('\nThis publication is not cited by any other publication.') - -def input(url): - html_text = req.get(url).text - soup = bs(html_text, 'html.parser') - - pub = get_article_info(soup) - if int(pub.num_citations) > 0: - get_citation_info(pub, int(pub.num_citations), soup) - return pub - -#if len(sys.argv) != 2: -# sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0])) -# exit(1) -#url = sys.argv[1] -#pub = input(url) -#print_pub_info(pub) diff --git a/input/tempdir/pub.py b/input/tempdir/pub.py deleted file mode 100644 index 13b90e8..0000000 --- a/input/tempdir/pub.py +++ /dev/null @@ -1,32 +0,0 @@ -class Publication: - #_registry = [] - #_citations = [] - #_references = [] - - def __init__(self, title, publication_date, contributors, doi_url, - subjects, num_citations): - #self._registry.append(self) - self.title = title - self.publication_date = publication_date - self.contributors = contributors - self.doi_url = doi_url - self.subjects = subjects - self.num_citations = num_citations - self.num_references = num_references - self._citations = [] - self._references = [] - -class Citation: - def __init__(self, title, journal, contributors, doi_url): - self.title = title - self.journal = journal - self.contributors = contributors - self.doi_url = doi_url - -class References: - def __init__(self, title, journal, contributors, doi_url): - self.title = title - self.journal = journal - self.contributors = contributors - self.doi_url = doi_url - diff --git a/input/tempdir/test.py b/input/tempdir/test.py deleted file mode 100755 index bdd12e8..0000000 --- a/input/tempdir/test.py +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python3 - -from input_fj import input, print_pub_info -import sys - -if len(sys.argv) != 3: - sys.stderr.write('Usage: {} <url>\n'.format(sys.argv[0])) - exit(1) -url = sys.argv[1] -url2 = sys.argv[2] -pub = input(url) -print_pub_info(pub) -pub2 = input(url2) -print_pub_info(pub2) - diff --git a/input/test_input_get_publication.py b/input/test_input_get_publication.py deleted file mode 100755 index 941dbc7..0000000 --- a/input/test_input_get_publication.py +++ /dev/null @@ -1,28 +0,0 @@ -import unittest -""" -Testing the Publication fetcher - -Publication 1: 'https://doi.org/10.1021/acs.jcim.1c00203' -Publication 2: 'doi.org/10.1021/acs.jcim.1c00917' -Publication 3: '10.1038/nchem.1781' -Publication 4: '11.12/jaj' -Publication 5: '11.12/' 
-Publication 6: 'https://doi.org/10.1021/acs.jmedchem.0c01332' # Paper is a PDF -""" - - -class TestGetPublication(unittest.TestCase): - - def test_publication1(self): - pass - - def test_publication2(self): - pass - - def test_publication3(self): - pass - - -if __name__=="__main__": - print("test") - unittest.main() \ No newline at end of file -- GitLab
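
Reviewer note, appended after the patch rather than inside it: the reworked `Fetcher.can_use_url` in `input/get/acs.py` now also accepts `pubs.acs.org/doi/...` URLs, tolerates surrounding whitespace and trailing dots, and returns `False` instead of raising when the regex does not match. A minimal sketch of that check, restated standalone so it runs without the package on the import path (the pattern and journal list are copied from the patch; the free-standing helper name is just for illustration):

```python
import re

# Pattern and journal list copied from the patched Fetcher.can_use_url in input/get/acs.py.
SUPPORTED_JOURNALS = ['1021']
DOI_PATTERN = r'^(https?://)?(doi.org/|pubs.acs.org/doi/)?(10.(\d{4})/\w+.\S+)'

def can_use_url(url: str) -> bool:
    """Return True if the DOI registrant code in the url belongs to a supported ACS journal."""
    matched_url = re.match(DOI_PATTERN, url.strip(". \t\r\n"))
    if matched_url is not None:
        # Group 4 captures the four-digit registrant code, e.g. '1021' for ACS.
        return matched_url[4] in SUPPORTED_JOURNALS
    return False

print(can_use_url("https://pubs.acs.org/doi/10.1021/acs.jcim.5b00332"))  # True
print(can_use_url("doi.org/10.1021/acs.jcim.1c00917"))                   # True
print(can_use_url("10.1038/nchem.1781"))                                 # False (Nature, not ACS)
```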
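
The reference and citation loops were also changed: entries without a DOI are now skipped entirely (previously they were stored with the placeholder string "None"), and newlines inside contributor names are flattened to spaces. A rough illustration of the reference branch against hand-made markup; the CSS class names mirror the selectors used in the patch, while the HTML fragment itself is invented for the example:

```python
from bs4 import BeautifulSoup

# Invented HTML that only mimics the selectors the ACS fetcher relies on
# ('ol#references', '.refDoi', '.NLM_article-title', '.NLM_contrib-group').
html = """
<ol id="references">
  <li><span class="refDoi">DOI: 10.1021/acs.jcim.1c00203</span>
      <span class="NLM_article-title">Some cited article</span>
      <span class="NLM_contrib-group">Doe,
 Jane</span></li>
  <li><span class="NLM_article-title">Entry without a DOI</span></li>
</ol>
"""

soup = BeautifulSoup(html, 'html.parser')
references = []
for reference in soup.select('ol#references')[0].select('li'):
    if reference.select('.refDoi') != []:
        # The "DOI: " prefix is 5 characters long, hence the [5:] slice used in the patch.
        ref_doi = "https://doi.org/{}".format(reference.select('.refDoi')[0].text.strip()[5:])
    else:
        # No Doi -> No Paper: the patched loop drops such entries instead of storing "None".
        continue
    ref_title = reference.select('.NLM_article-title')[0].text \
        if reference.select('.NLM_article-title') != [] else None
    ref_contributors = [a.text.replace("\n", " ").replace("\r", "")
                        for a in reference.select('.NLM_contrib-group')]
    references.append((ref_doi, ref_title, ref_contributors))

print(references)
# [('https://doi.org/10.1021/acs.jcim.1c00203', 'Some cited article', ['Doe,  Jane'])]
```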
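
For completeness, the calling convention after this patch, as used by `example_input.py` and the README. The `input.interface` module is referenced by both but is not part of this diff, so its availability is assumed here; the attribute names come from the `Publication` and `Citation` constructors in `input/publication.py`. A small usage sketch, assuming the package is importable from the repository root:

```python
from input.interface import InputInterface as Input

def main(url: str) -> None:
    pub = Input.get_publication(url)
    # Attribute names as defined in input/publication.py.
    print(pub.title)
    print(pub.journal, pub.publication_date)
    print(", ".join(pub.contributors))
    # references/citations hold Citation objects; after this patch the ACS
    # fetcher only records entries that actually carry a DOI.
    for ref in pub.references or []:
        print("  ->", ref.doi_url)

if __name__ == "__main__":
    main("https://pubs.acs.org/doi/10.1021/acs.jcim.5b00332")
```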