eprint_cache/author.py at main · LCAS/eprint_cache · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import shelve
import re
from logging import getLogger, INFO, DEBUG
from flatten_dict import flatten

from figshare_api import FigShare
from doi2bib import doi2bib
from doi_utils import guess_doi_from_crossref


class Author:
    """Represents an author and manages their Figshare article collection.

    This class handles retrieving, processing, and caching article metadata for
    a specific author from the Figshare repository.
    """

    def __init__(self, name, user_id=None, institution_id=None, orcid=None, debug=False, rate_limit_delay=1.0, max_retries=5):
        """Initialize an Author instance.

        Args:
            name: Author's full name (required)
            user_id: Figshare user ID (optional, improves search accuracy)
            institution_id: Institution ID for filtering articles (optional, recommended)
            orcid: Author's ORCID identifier (optional, for reference)
            debug: Enable debug logging (default: False)
            rate_limit_delay: Delay in seconds between API requests (default: 1.0)
            max_retries: Maximum retry attempts for failed API calls (default: 5)
        """
        self.logger = getLogger("Author")
        if debug:
            self.logger.setLevel(DEBUG)
        self.name = name
        self.user_id = user_id
        self.institution_id = institution_id
        self.orcid = orcid
        self.fs = FigShare(rate_limit_delay=rate_limit_delay, max_retries=max_retries)
        self.articles = {}
        self.public_html_prefix = "https://repository.lincoln.ac.uk"
        self.df = None

    def save(self, filename=None):
        """Save author's articles and dataframe to a persistent cache file.

        Args:
            filename: Path to cache file (default: '{author_name}.db')
        """
        if filename is None:
            filename = f"{self.name}.db"
        with shelve.open(filename) as db:
            db['articles'] = self.articles
            db['df'] = self.df

    def load(self, filename=None):
        """Load author's articles and dataframe from a persistent cache file.

        Args:
            filename: Path to cache file (default: '{author_name}.db')
        """
        if filename is None:
            filename = f"{self.name}.db"
        with shelve.open(filename) as db:
            self.articles = db['articles']
            self.df = db['df']


    def _retrieve_figshare(self, use_cache=True):
        """Retrieve articles for this author from Figshare.

        Uses the most precise search method available based on the author metadata:
        - If user_id and/or institution_id are available, uses articles_by_author()
          with filtering for more accurate results
        - Otherwise, falls back to simple name-based search

        Args:
            use_cache: Whether to use cached API results (default: True)
        """
        self.logger.info(f"retrieving articles for {self.name}")

        # Use enhanced search with user_id tracking and institution filtering when available
        if self.user_id or self.institution_id:
            if self.user_id and self.institution_id:
                self.logger.info(f"Using enhanced search for user_id {self.user_id} with institution_id {self.institution_id}")
            elif self.user_id:
                self.logger.info(f"Using enhanced search for user_id {self.user_id}")
            else:
                self.logger.info(f"Using enhanced search with institution_id {self.institution_id}")

            self.articles = self.fs.articles_by_author(
                self.name,
                user_id=self.user_id,
                institution_id=self.institution_id,
                use_cache=use_cache
            )
        else:
            self.logger.info(f"Using basic name search (no user_id or institution_id available)")
            self.articles = self.fs.articles_by_user_name(self.name, use_cache=use_cache)

        self.logger.info(f"found {len(self.articles)} articles for {self.name}")

    def _retrieve_details(self, use_cache=True):
        """Retrieve detailed metadata for each article.

        Fetches full article details including custom fields, tags, categories, etc.

        Args:
            use_cache: Whether to use cached API results (default: True)
        """
        for article in self.articles:
            self.logger.info(f"retrieving details for article {article['id']}")
            article['details'] = self.fs.get_article(article['id'], use_cache=use_cache)

    def _remove_non_repository(self):
        self.logger.info(f"removing non-repository articles out of {len(self.articles)}")
        self.articles = [a for a in self.articles if a['url_public_html'].startswith(self.public_html_prefix)]
        self.logger.info(f"retained {len(self.articles)} articles")

    def _custom_fields_to_dicts(self):
        for article in self.articles:
            if 'details' not in article:
                continue
            if 'custom_fields' not in article['details']:
                continue
            self.logger.debug(f"convert")

            cf = article['details']['custom_fields']
            if type(cf) == list:
                new_cf = {}
                for p in cf:
                    new_cf[p['name']] = p['value']
                article['details']['custom_fields'] = new_cf


    def _guess_doi(self, article):
        """
        Use crossref API to guess the DOI for an article based on the title and authors
        """
        if 'title' not in article or not article['title']:
            self.logger.warning("No title found for article, can't guess DOI")
            return None

        title = article['title']
        author = article['author']

        return guess_doi_from_crossref(title, author)


    def _retrieve_bibtex_from_dois(self):
        if self.df is None:
            self.logger.warning(f"no dataframe found for {self.name}, can't continue")
            return
        doi2bibber = doi2bib()
        # iterate over all rows in the dataframe self.df
        for index, row in self.df.iterrows():
            doi = row['External DOI']
            # Check if DOI is in valid format
            if doi and isinstance(doi, str):
                # Basic DOI validation - should start with 10. followed by numbers/dots/hyphens
                if not doi.startswith('10.') or not len(doi.split('/', 1)) == 2:
                    self.logger.warning(f"Invalid DOI format: {doi}, will try to guess")
                    doi = None
            else:
                self.logger.info(f"No DOI defined in record for article, will try to guess")
                doi = None
            if doi is None:
                doi = self._guess_doi(row)
                if doi is None:
                    self.logger.debug(f"Unable to guess DOI for article, no option left but to skip it")
                    continue
                self.logger.info(f"Guessed DOI for article: {doi}, updating dataframe")
                self.df.at[index, 'External DOI'] = doi
            try:
                bibtex = doi2bibber.get_bibtex_entry(doi)
                # Update the dataframe with the bibtex information
                if bibtex is not None:
                    self.df.at[index, 'bibtex'] = bibtex
                    self.df.at[index, 'bibtex_str'] = doi2bibber.entries_to_str([bibtex])
                    self.logger.info(f"got bibtex for {doi}")
                else:
                    self.logger.warning(f"Couldn't get bibtex for {doi}")

            except Exception as e:
                self.logger.warning(f"Failed to get bibtex for {doi}: {e}")

    def _flatten(self):
        self.logger.info(f"flattening article dicts for {self.name}")
        new_articles = []
        for a in self.articles:
            try:
                new_articles.append(flatten(a, reducer='path'))
            except Exception as e:
                self.logger.warning(f"Failed to flatten article {a}: {e}")
        self.articles = new_articles

    def retrieve(self, use_cache=True):
        self._retrieve_figshare(use_cache=use_cache)
        self._remove_non_repository()
        self._retrieve_details(use_cache=True)
        self._custom_fields_to_dicts()
        self._flatten()
        self._create_dataframe()
        self._retrieve_bibtex_from_dois()

    def _create_dataframe(self):
        if len(self.articles) == 0:
            self.logger.warning(f"no articles found for {self.name}, can't create dataframe")
            self.df = None
            return
        self.df = pd.DataFrame.from_dict(self.articles)
        # add column with author name
        self.df['author'] = self.name

        # add column with online date (as datetime object)
        if 'timeline/firstOnline' in self.df.columns:
            self.df['online_date'] = pd.to_datetime(self.df['timeline/firstOnline'], utc=True)
        else:
            self.logger.warning(f"'timeline/firstOnline' field not found, setting online_date to NaT")
            self.df['online_date'] = pd.NaT

        # add column with online year
        self.df['online_year'] = self.df['online_date'].apply(
            lambda x: x.year if pd.notna(x) else None
        )

        # add column with external DOI, parsed from custom_fields
        if 'details/custom_fields/External DOI' in self.df.columns:
            self.df['External DOI'] = self.df['details/custom_fields/External DOI'].apply(
                lambda x: re.sub(r'^(?:https?://doi\.org/|doi:)', '', x[0], flags=re.IGNORECASE).replace('doi:','')
                if isinstance(x, list) and len(x) > 0 else None
            )
        else:
            self.logger.warning(f"'details/custom_fields/External DOI' field not found, setting External DOI to None")
            self.df['External DOI'] = None

        return self.df