diff --git a/api/crossref/views.py b/api/crossref/views.py index 9baeb37d562..bf92d358ad7 100644 --- a/api/crossref/views.py +++ b/api/crossref/views.py @@ -7,6 +7,7 @@ from api.crossref.permissions import RequestComesFromMailgun from osf.models import Preprint, NotificationTypeEnum +from osf.models.base import Guid from website import settings from website.preprints.tasks import mint_doi_on_crossref_fail @@ -48,6 +49,23 @@ def post(self, request): if record.get('status').lower() == 'success' and doi: msg = record.find('msg').text created = bool(msg == 'Successfully added') + # Unversioned DOIs (no _vN suffix, e.g. 10.31233/osf.io/tnaqp) are routing + # aliases that always resolve to the latest version via OSF's GUID routing. + # Store them as 'doi_unversioned' on the v1 preprint so we can track which + # preprint series have had their unversioned DOI registered. + _, version = Guid.split_guid(guid) if guid else (None, None) + if not version: + logger.info(f'Unversioned DOI confirmed by CrossRef: {doi}') + if created and guid: + v1_preprint = Preprint.objects.filter( + versioned_guids__guid___id=guid, + versioned_guids__version=1, + ).first() + if v1_preprint: + v1_preprint.set_identifier_value(category='doi_unversioned', value=doi) + dois_processed += 1 + continue + legacy_doi = preprint.get_identifier(category='legacy_doi') if created or legacy_doi: # Sets preprint_doi_created and saves the preprint @@ -67,9 +85,14 @@ def post(self, request): if 'Relation target DOI does not exist' in record.find('msg').text: logger.warning('Related publication DOI does not exist, sending metadata again without it...') mint_doi_on_crossref_fail.apply_async(kwargs={'preprint_id': preprint._id}) - # This error occurs when a single preprint is being updated several times in a row with the same metadata [#PLAT-944] - elif 'less or equal to previously submitted version' in record.find('msg').text and record_count == 2: - break + # This error occurs when a single preprint is being updated several times in a row + # with the same metadata [#PLAT-944]. Previously this broke out of the loop when + # record_count == 2 (single DOI submitted twice). Now batches legitimately contain + # 2 records (versioned + unversioned DOI), so we continue instead of break to allow + # the remaining record to be processed. + elif 'less or equal to previously submitted version' in record.find('msg').text: + dois_processed += 1 + continue else: unexpected_errors = True logger.info(f'Creation success email received from CrossRef for preprints: {guids}') diff --git a/api/users/views.py b/api/users/views.py index b920798be86..218fef32cd2 100644 --- a/api/users/views.py +++ b/api/users/views.py @@ -1477,7 +1477,8 @@ def post(self, request, *args, **kwargs): user.date_last_logged_in = timezone.now() user.external_identity[provider][provider_id] = 'VERIFIED' - user.social[provider.lower()] = provider_id + if provider.lower() in OSFUser.SOCIAL_FIELDS: + user.social[provider.lower()] = provider_id del user.email_verifications[token] user.verification_key = generate_verification_key() user.save() diff --git a/api_tests/crossref/views/test_crossref_email_response.py b/api_tests/crossref/views/test_crossref_email_response.py index 196c0debd2a..fb03a2c404a 100644 --- a/api_tests/crossref/views/test_crossref_email_response.py +++ b/api_tests/crossref/views/test_crossref_email_response.py @@ -220,3 +220,91 @@ def test_confirmation_marks_legacy_doi_as_deleted(self, app, url, preprint): app.post(url, context_data) assert preprint.identifiers.get(category='legacy_doi').deleted + + def test_unversioned_doi_confirmation_skips_identifier_update(self, app, url, preprint): + versioned_doi = settings.DOI_FORMAT.format( + prefix=preprint.provider.doi_prefix, guid=preprint._id + ) + preprint.set_identifier_value(category='doi', value=versioned_doi) + + base_guid = preprint.get_guid()._id # no _vN suffix + unversioned_doi = settings.DOI_FORMAT.format( + prefix=preprint.provider.doi_prefix, guid=base_guid + ) + dual_confirmation_xml = """ + + + 1390675999 + {batch_id} + + {versioned_doi} + Successfully updated + + + {unversioned_doi} + Successfully added + + + 2 + 2 + 0 + 0 + + + """.format( + batch_id=preprint._id, + versioned_doi=versioned_doi, + unversioned_doi=unversioned_doi, + ) + + context_data = self.make_mailgun_payload(crossref_response=dual_confirmation_xml) + with capture_notifications(expect_none=True): + app.post(url, context_data) + + preprint.reload() + assert preprint.get_identifier_value('doi') == versioned_doi + assert preprint.get_identifier_value('doi_unversioned') == unversioned_doi + + def test_unversioned_doi_confirmation_update_does_not_store_doi_unversioned(self, app, url, preprint): + versioned_doi = settings.DOI_FORMAT.format( + prefix=preprint.provider.doi_prefix, guid=preprint._id + ) + preprint.set_identifier_value(category='doi', value=versioned_doi) + + base_guid = preprint.get_guid()._id + unversioned_doi = settings.DOI_FORMAT.format( + prefix=preprint.provider.doi_prefix, guid=base_guid + ) + update_confirmation_xml = """ + + + 1390676000 + {batch_id} + + {versioned_doi} + Successfully updated + + + {unversioned_doi} + Successfully updated + + + 2 + 2 + 0 + 0 + + + """.format( + batch_id=preprint._id, + versioned_doi=versioned_doi, + unversioned_doi=unversioned_doi, + ) + + context_data = self.make_mailgun_payload(crossref_response=update_confirmation_xml) + with capture_notifications(expect_none=True): + app.post(url, context_data) + + preprint.reload() + assert preprint.get_identifier_value('doi') == versioned_doi + assert preprint.get_identifier_value('doi_unversioned') is None diff --git a/osf/management/commands/resync_preprint_dois_v1.py b/osf/management/commands/resync_preprint_dois_v1.py new file mode 100644 index 00000000000..31c04cf13f7 --- /dev/null +++ b/osf/management/commands/resync_preprint_dois_v1.py @@ -0,0 +1,263 @@ +import logging +import time + +from django.contrib.contenttypes.models import ContentType +from django.core.management.base import BaseCommand +from django.db.models import Q + +from framework.celery_tasks import app +from osf.models import Preprint, Identifier +from osf.models.base import VersionedGuidMixin +from osf.management.commands.sync_doi_metadata import async_request_identifier_update + +logger = logging.getLogger(__name__) + +# 5-minute pause between rate-limit windows to avoid flooding the Crossref API +# with too many deposit requests in a short period. +RATE_LIMIT_SLEEP = 60 * 5 + + +def get_preprints_needing_v1_doi(provider_id=None): + content_type = ContentType.objects.get_for_model(Preprint) + + already_versioned_ids = Identifier.objects.filter( + content_type=content_type, + category='doi', + deleted__isnull=True, + value__contains=VersionedGuidMixin.GUID_VERSION_DELIMITER, + ).values_list('object_id', flat=True) + + public_query = Q(is_published=True, is_public=True, deleted__isnull=True) + withdrawn_query = Q(date_withdrawn__isnull=False, ever_public=True) + + qs = Preprint.objects.filter( + versioned_guids__version=1, + ).filter( + public_query | withdrawn_query + ).exclude( + id__in=already_versioned_ids + ).exclude( + tags__name='qatest', + tags__system=True, + ).select_related('provider').distinct() + + if provider_id: + qs = qs.filter(provider___id=provider_id) + + return qs + + +def resync_preprint_dois_v1(dry_run=True, batch_size=500, rate_limit=100, provider_id=None): + preprints_to_update = get_preprints_needing_v1_doi(provider_id=provider_id) + + total = preprints_to_update.count() + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'{total} preprints need v1 DOI resync' + + (f' (provider={provider_id})' if provider_id else '') + ) + + if batch_size: + preprints_iterable = preprints_to_update[:batch_size] + else: + preprints_iterable = preprints_to_update.iterator() + + queued = 0 + skipped = 0 + errored = 0 + for record_number, preprint in enumerate(preprints_iterable, 1): + if not preprint.provider.doi_prefix: + logger.warning( + f'Skipping preprint {preprint._id}: ' + f'provider {preprint.provider._id} has no DOI prefix' + ) + skipped += 1 + continue + + if dry_run: + logger.info(f'[DRY RUN] Would resync DOI for preprint {preprint._id}') + queued += 1 + continue + + if rate_limit and not record_number % rate_limit: + logger.info(f'Rate limit reached at {record_number} preprints, sleeping {RATE_LIMIT_SLEEP}s') + time.sleep(RATE_LIMIT_SLEEP) + + try: + async_request_identifier_update.apply_async(kwargs={'preprint_id': preprint._id}) + logger.info(f'Queued DOI resync for preprint {preprint._id}') + queued += 1 + except Exception: + logger.exception(f'Failed to queue DOI resync for preprint {preprint._id}') + errored += 1 + + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'Done: {queued} preprints queued, {skipped} skipped (no DOI prefix), {errored} errored' + ) + if not dry_run and batch_size: + logger.info( + f'Estimated remaining after this batch: ~{max(0, total - queued - skipped - errored)}. ' + f'Re-run this command until 0 preprints remain.' + ) + + +def get_preprints_needing_unversioned_doi(provider_id=None): + content_type = ContentType.objects.get_for_model(Preprint) + + already_has_unversioned = Identifier.objects.filter( + content_type=content_type, + category='doi_unversioned', + deleted__isnull=True, + ).values_list('object_id', flat=True) + + has_versioned_doi = Identifier.objects.filter( + content_type=content_type, + category='doi', + deleted__isnull=True, + value__contains=VersionedGuidMixin.GUID_VERSION_DELIMITER, + ).values_list('object_id', flat=True) + + public_query = Q(is_published=True, is_public=True, deleted__isnull=True) + withdrawn_query = Q(date_withdrawn__isnull=False, ever_public=True) + + qs = Preprint.objects.filter( + versioned_guids__version=1, + id__in=has_versioned_doi, + ).filter( + public_query | withdrawn_query + ).exclude( + id__in=already_has_unversioned + ).exclude( + tags__name='qatest', + tags__system=True, + ).select_related('provider').distinct() + + if provider_id: + qs = qs.filter(provider___id=provider_id) + + return qs + + +def register_missing_unversioned_dois(dry_run=True, batch_size=500, rate_limit=100, provider_id=None): + preprints_to_update = get_preprints_needing_unversioned_doi(provider_id=provider_id) + + total = preprints_to_update.count() + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'{total} preprints need unversioned DOI registration' + + (f' (provider={provider_id})' if provider_id else '') + ) + + if batch_size: + preprints_iterable = preprints_to_update[:batch_size] + else: + preprints_iterable = preprints_to_update.iterator() + + queued = 0 + skipped = 0 + errored = 0 + for record_number, preprint in enumerate(preprints_iterable, 1): + if not preprint.provider.doi_prefix: + logger.warning( + f'Skipping preprint {preprint._id}: ' + f'provider {preprint.provider._id} has no DOI prefix' + ) + skipped += 1 + continue + + if dry_run: + logger.info(f'[DRY RUN] Would register unversioned DOI for preprint {preprint._id}') + queued += 1 + continue + + if rate_limit and not record_number % rate_limit: + logger.info(f'Rate limit reached at {record_number} preprints, sleeping {RATE_LIMIT_SLEEP}s') + time.sleep(RATE_LIMIT_SLEEP) + + try: + async_request_identifier_update.apply_async(kwargs={'preprint_id': preprint._id}) + logger.info(f'Queued unversioned DOI registration for preprint {preprint._id}') + queued += 1 + except Exception: + logger.exception(f'Failed to queue unversioned DOI registration for preprint {preprint._id}') + errored += 1 + + logger.info( + f'{"[DRY RUN] " if dry_run else ""}' + f'Unversioned DOI pass done: {queued} queued, {skipped} skipped, {errored} errored' + ) + if not dry_run and batch_size: + logger.info( + f'Estimated unversioned remaining after this batch: ~{max(0, total - queued - skipped - errored)}. ' + f'Re-run until 0 preprints remain.' + ) + + +@app.task(name='osf.management.commands.resync_preprint_dois_v1', max_retries=0) +def resync_preprint_dois_v1_task(batch_size=500, rate_limit=100, dry_run=False, provider_id=None): + resync_preprint_dois_v1( + dry_run=dry_run, + batch_size=batch_size, + rate_limit=rate_limit, + provider_id=provider_id, + ) + register_missing_unversioned_dois( + dry_run=dry_run, + batch_size=batch_size, + rate_limit=rate_limit, + provider_id=provider_id, + ) + + +class Command(BaseCommand): + help = ( + 'Resync DOIs for version-1 preprints that are missing the versioned DOI suffix (_v1). ' + 'Processes preprints in batches and queues Crossref deposit tasks. ' + 'IMPORTANT: This command must be run repeatedly until it reports 0 preprints remaining, ' + 'as each run only processes a single batch. ' + 'Check remaining count with --dry_run before and after each run.' + ) + + def add_arguments(self, parser): + super().add_arguments(parser) + parser.add_argument( + '--dry_run', + action='store_true', + dest='dry_run', + help='Log what would be done without submitting to Crossref.', + ) + parser.add_argument( + '--batch_size', + '-b', + type=int, + default=500, + help=( + 'Maximum number of preprints to process per run (default: 500). ' + 'The command processes the first N eligible preprints and exits; ' + 're-run the command to continue with the next batch.' + ), + ) + parser.add_argument( + '--rate_limit', + '-r', + type=int, + default=100, + help='Sleep between Crossref submissions every N preprints.', + ) + parser.add_argument( + '--provider', + '-p', + type=str, + default=None, + dest='provider_id', + help='Restrict to a single provider _id (e.g. socarxiv).', + ) + + def handle(self, *args, **options): + resync_preprint_dois_v1( + dry_run=options['dry_run'], + batch_size=options['batch_size'], + rate_limit=options['rate_limit'], + provider_id=options['provider_id'], + ) diff --git a/osf_tests/test_user.py b/osf_tests/test_user.py index 2e5d0631cd4..d564429b992 100644 --- a/osf_tests/test_user.py +++ b/osf_tests/test_user.py @@ -1344,7 +1344,7 @@ def test_add_system_tag(self, user): assert len(user.system_tags) == 1 - tag = Tag.all_tags.get(name=tag_name, system=True) + tag = Tag.all_tags.get(name=tag_name.lower(), system=True) assert tag in user.all_tags.all() def test_add_system_tag_instance(self, user): diff --git a/tests/identifiers/test_crossref.py b/tests/identifiers/test_crossref.py index e05284f303e..4d9c0b43806 100644 --- a/tests/identifiers/test_crossref.py +++ b/tests/identifiers/test_crossref.py @@ -95,6 +95,18 @@ def test_crossref_build_doi(self, crossref_client, preprint): assert crossref_client.build_doi(preprint) == settings.DOI_FORMAT.format(prefix=doi_prefix, guid=preprint._id) + def test_crossref_build_unversioned_doi(self, crossref_client, preprint): + doi_prefix = preprint.provider.doi_prefix + base_guid = preprint.get_guid()._id + + expected = settings.DOI_FORMAT.format(prefix=doi_prefix, guid=base_guid) + assert crossref_client.build_unversioned_doi(preprint) == expected + assert '_v' not in expected + + def test_crossref_build_unversioned_doi_matches_base_guid_not_versioned(self, crossref_client, preprint, preprint_version): + assert crossref_client.build_doi(preprint_version) != crossref_client.build_unversioned_doi(preprint_version) + assert crossref_client.build_unversioned_doi(preprint) == crossref_client.build_unversioned_doi(preprint_version) + def test_crossref_build_doi_versioned(self, crossref_client, preprint_version): doi_prefix = preprint_version.provider.doi_prefix @@ -338,6 +350,57 @@ def test_metadata_for_non_included_relation(self, crossref_client, preprint): root_without_relation = lxml.etree.fromstring(xml_without_relation) assert root_without_relation.find('.//{%s}intra_work_relation' % crossref.CROSSREF_RELATIONS) is None + def test_metadata_includes_unversioned_doi_entry(self, crossref_client, preprint): + crossref_xml = crossref_client.build_metadata(preprint, include_unversioned_doi=True) + root = lxml.etree.fromstring(crossref_xml) + + posted_contents = root.findall('.//{%s}posted_content' % crossref.CROSSREF_NAMESPACE) + assert len(posted_contents) == 2 + + versioned_dois = posted_contents[0].findall('.//{%s}doi' % crossref.CROSSREF_NAMESPACE) + assert len(versioned_dois) == 1 + assert versioned_dois[0].text == crossref_client.build_doi(preprint) + + unversioned_dois = posted_contents[1].findall('.//{%s}doi' % crossref.CROSSREF_NAMESPACE) + assert len(unversioned_dois) == 1 + assert unversioned_dois[0].text == crossref_client.build_unversioned_doi(preprint) + assert '_v' not in unversioned_dois[0].text + + unversioned_resource = posted_contents[1].find('.//{%s}resource' % crossref.CROSSREF_NAMESPACE) + base_guid = preprint.get_guid()._id + assert unversioned_resource.text == settings.DOMAIN + base_guid + + def test_metadata_unversioned_doi_uses_latest_version_metadata(self, crossref_client, preprint, preprint_version): + crossref_xml = crossref_client.build_metadata(preprint, include_unversioned_doi=True) + root = lxml.etree.fromstring(crossref_xml) + + posted_contents = root.findall('.//{%s}posted_content' % crossref.CROSSREF_NAMESPACE) + assert len(posted_contents) == 2 + + unversioned_resource = posted_contents[1].find('.//{%s}resource' % crossref.CROSSREF_NAMESPACE) + base_guid = preprint.get_guid()._id + assert unversioned_resource.text == settings.DOMAIN + base_guid + + def test_metadata_unversioned_doi_has_no_version_relations(self, crossref_client, preprint, preprint_version): + crossref_xml = crossref_client.build_metadata(preprint_version, include_unversioned_doi=True) + root = lxml.etree.fromstring(crossref_xml) + + posted_contents = root.findall('.//{%s}posted_content' % crossref.CROSSREF_NAMESPACE) + unversioned_content = posted_contents[1] + + relations = unversioned_content.findall('.//{%s}intra_work_relation' % crossref.CROSSREF_RELATIONS) + is_version_of_relations = [ + r for r in relations if r.get('relationship-type') == 'isVersionOf' + ] + assert len(is_version_of_relations) == 0 + + def test_metadata_bulk_does_not_include_unversioned_doi(self, crossref_client, preprint, preprint_version): + crossref_xml = crossref_client.build_metadata([preprint, preprint_version]) + root = lxml.etree.fromstring(crossref_xml) + + posted_contents = root.findall('.//{%s}posted_content' % crossref.CROSSREF_NAMESPACE) + assert len(posted_contents) == 2 # one per preprint, no extras + def test_metadata_for_affiliated_institutions(self, crossref_client, preprint): institution = InstitutionFactory() institution.ror_uri = 'http://ror.org/WHATisITgoodFOR/' diff --git a/tests/identifiers/test_resync_preprint_dois_v1.py b/tests/identifiers/test_resync_preprint_dois_v1.py new file mode 100644 index 00000000000..05c5996fb73 --- /dev/null +++ b/tests/identifiers/test_resync_preprint_dois_v1.py @@ -0,0 +1,187 @@ +import pytest +from unittest import mock +from django.utils import timezone + +from osf.models import Preprint +from osf_tests.factories import PreprintFactory, PreprintProviderFactory +from osf.management.commands.resync_preprint_dois_v1 import ( + get_preprints_needing_v1_doi, + resync_preprint_dois_v1, +) +from website import settings + +pytestmark = pytest.mark.django_db + + +@pytest.fixture() +def provider(): + p = PreprintProviderFactory() + p.doi_prefix = '10.31219' + p.save() + return p + + +@pytest.fixture() +def preprint(provider): + pp = PreprintFactory(provider=provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id) + pp.set_identifier_values(doi=old_doi, save=True) + return pp + + +@pytest.fixture() +def preprint_with_v1_doi(provider): + pp = PreprintFactory(provider=provider, is_published=True) + v1_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp._id) + pp.set_identifier_values(doi=v1_doi, save=True) + return pp + + +class TestGetPreprrintsNeedingV1Doi: + + def test_includes_public_preprint_without_versioned_doi(self, preprint): + qs = get_preprints_needing_v1_doi() + assert preprint in qs + + def test_excludes_preprint_with_versioned_doi(self, preprint_with_v1_doi): + qs = get_preprints_needing_v1_doi() + assert preprint_with_v1_doi not in qs + + def test_excludes_preprint_with_no_doi_if_private(self, provider): + private_preprint = PreprintFactory(provider=provider, is_published=False) + private_preprint.is_public = False + private_preprint.save() + qs = get_preprints_needing_v1_doi() + assert private_preprint not in qs + + def test_includes_withdrawn_preprint_with_ever_public(self, provider): + pp = PreprintFactory(provider=provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id) + pp.set_identifier_values(doi=old_doi, save=True) + pp.date_withdrawn = timezone.now() + pp.ever_public = True + pp.save() + qs = get_preprints_needing_v1_doi() + assert pp in qs + + def test_excludes_withdrawn_preprint_never_public(self, provider): + pp = PreprintFactory(provider=provider, is_published=False) + Preprint.objects.filter(pk=pp.pk).update(date_withdrawn=timezone.now()) + qs = get_preprints_needing_v1_doi() + assert pp not in qs + + def test_excludes_version_2_preprint(self, preprint): + from tests.utils import capture_notifications + with capture_notifications(): + v2 = PreprintFactory.create_version(preprint, is_published=True, set_doi=False) + old_doi = settings.DOI_FORMAT.format(prefix=preprint.provider.doi_prefix, guid=v2.get_guid()._id) + v2.set_identifier_values(doi=old_doi, save=True) + qs = get_preprints_needing_v1_doi() + assert v2 not in qs + + def test_excludes_qatest_tagged_preprint(self, preprint): + preprint.add_system_tag('qatest') + qs = get_preprints_needing_v1_doi() + assert preprint not in qs + + def test_excludes_deleted_preprint(self, preprint): + preprint.deleted = timezone.now() + preprint.save() + qs = get_preprints_needing_v1_doi() + assert preprint not in qs + + def test_provider_filter_limits_results(self, preprint, provider): + other_provider = PreprintProviderFactory() + other_provider.doi_prefix = '10.12345' + other_provider.save() + other_preprint = PreprintFactory(provider=other_provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=other_provider.doi_prefix, guid=other_preprint.get_guid()._id) + other_preprint.set_identifier_values(doi=old_doi, save=True) + + qs = get_preprints_needing_v1_doi(provider_id=provider._id) + assert preprint in qs + assert other_preprint not in qs + + def test_preprint_with_no_doi_identifier_is_included(self, provider): + pp = PreprintFactory(provider=provider, is_published=True, set_doi=False) + qs = get_preprints_needing_v1_doi() + assert pp in qs + + +class TestResyncPreprintDoisV1: + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_dry_run_does_not_queue_tasks(self, mock_task, preprint): + resync_preprint_dois_v1(dry_run=True) + mock_task.apply_async.assert_not_called() + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_live_run_queues_task_for_each_preprint(self, mock_task, preprint): + resync_preprint_dois_v1(dry_run=False, rate_limit=0) + mock_task.apply_async.assert_called_once_with(kwargs={'preprint_id': preprint._id}) + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_batch_size_limits_processed_count(self, mock_task, provider): + preprints = [] + for _ in range(5): + pp = PreprintFactory(provider=provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id) + pp.set_identifier_values(doi=old_doi, save=True) + preprints.append(pp) + + resync_preprint_dois_v1(dry_run=False, batch_size=2, rate_limit=0) + assert mock_task.apply_async.call_count == 2 + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_skips_provider_without_doi_prefix(self, mock_task, provider): + no_prefix_provider = PreprintProviderFactory() + no_prefix_provider.doi_prefix = '' + no_prefix_provider.save() + pp = PreprintFactory(provider=no_prefix_provider, is_published=True) + old_doi = '10.000/old-doi' + pp.set_identifier_values(doi=old_doi, save=True) + + resync_preprint_dois_v1(dry_run=False, rate_limit=0) + queued_ids = [ + call.kwargs['kwargs']['preprint_id'] + for call in mock_task.apply_async.call_args_list + ] + assert pp._id not in queued_ids + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_provider_filter_is_applied(self, mock_task, preprint, provider): + other_provider = PreprintProviderFactory() + other_provider.doi_prefix = '10.99999' + other_provider.save() + other_pp = PreprintFactory(provider=other_provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=other_provider.doi_prefix, guid=other_pp.get_guid()._id) + other_pp.set_identifier_values(doi=old_doi, save=True) + + resync_preprint_dois_v1(dry_run=False, rate_limit=0, provider_id=provider._id) + + queued_ids = [ + call.kwargs['kwargs']['preprint_id'] + for call in mock_task.apply_async.call_args_list + ] + assert preprint._id in queued_ids + assert other_pp._id not in queued_ids + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_already_versioned_doi_is_not_queued(self, mock_task, preprint_with_v1_doi): + resync_preprint_dois_v1(dry_run=False, rate_limit=0) + queued_ids = [ + call.kwargs['kwargs']['preprint_id'] + for call in mock_task.apply_async.call_args_list + ] + assert preprint_with_v1_doi._id not in queued_ids + + @mock.patch('osf.management.commands.resync_preprint_dois_v1.time.sleep') + @mock.patch('osf.management.commands.resync_preprint_dois_v1.async_request_identifier_update') + def test_rate_limit_triggers_sleep(self, mock_task, mock_sleep, provider): + for _ in range(3): + pp = PreprintFactory(provider=provider, is_published=True) + old_doi = settings.DOI_FORMAT.format(prefix=provider.doi_prefix, guid=pp.get_guid()._id) + pp.set_identifier_values(doi=old_doi, save=True) + + resync_preprint_dois_v1(dry_run=False, rate_limit=2) + mock_sleep.assert_called_once() diff --git a/website/identifiers/clients/crossref.py b/website/identifiers/clients/crossref.py index 8f496ce363b..c1d0aa41ece 100644 --- a/website/identifiers/clients/crossref.py +++ b/website/identifiers/clients/crossref.py @@ -41,13 +41,18 @@ def build_doi(self, preprint): prefix = preprint.provider.doi_prefix return settings.DOI_FORMAT.format(prefix=prefix, guid=preprint._id) - def build_metadata(self, preprint, include_relation=True): + def build_unversioned_doi(self, preprint): + prefix = preprint.provider.doi_prefix + return settings.DOI_FORMAT.format(prefix=prefix, guid=preprint.get_guid()._id) + + def build_metadata(self, preprint, include_relation=True, include_unversioned_doi=False): """Return the crossref metadata XML document for a given preprint as a string for DOI minting purposes :param preprint: the preprint, or list of preprints to build metadata for """ if isinstance(preprint, (list, QuerySet)): preprints = preprint + include_unversioned_doi = False # not supported for bulk batches else: preprints = [preprint] @@ -74,6 +79,9 @@ def build_metadata(self, preprint, include_relation=True): for preprint in preprints: body.append(self.build_posted_content(preprint, element, include_relation)) + if include_unversioned_doi: + body.append(self.build_unversioned_posted_content(preprints[0], element)) + root = element.doi_batch( head, body, @@ -82,10 +90,12 @@ def build_metadata(self, preprint, include_relation=True): root.attrib['{%s}schemaLocation' % XSI] = CROSSREF_SCHEMA_LOCATION return lxml.etree.tostring(root) - def build_posted_content(self, preprint, element, include_relation): + def build_posted_content(self, preprint, element, include_relation, doi_override=None, resource_override=None): """Build the element for a single preprint preprint - preprint to build posted_content for element - namespace element to use when building parts of the XML structure + doi_override - if provided, use this DOI value instead of the preprint's own DOI + resource_override - if provided, use this URL as the instead of the default """ from osf.models import SpamStatus @@ -138,7 +148,7 @@ def build_posted_content(self, preprint, element, include_relation): preprint_versions = preprint.get_preprint_versions( versioned_guids__version__lt=preprint.version, include_rejected=False, - ) + ) if include_relation else [] if preprint_versions: for previous_version in preprint_versions: @@ -157,15 +167,27 @@ def build_posted_content(self, preprint, element, include_relation): posted_content.append(relations_program) minted_doi = preprint.get_identifier_value('doi') - doi = minted_doi or self.build_doi(preprint) + doi = doi_override or minted_doi or self.build_doi(preprint) + resource_url = resource_override if resource_override is not None else settings.DOMAIN + preprint._id doi_data = [ element.doi(doi), - element.resource(settings.DOMAIN + preprint._id) + element.resource(resource_url) ] posted_content.append(element.doi_data(*doi_data)) return posted_content + def build_unversioned_posted_content(self, preprint, element): + latest = preprint.get_guid().referent + base_guid = latest.get_guid()._id + return self.build_posted_content( + latest, + element, + include_relation=False, + doi_override=self.build_unversioned_doi(latest), + resource_override=settings.DOMAIN + base_guid, + ) + def _process_crossref_name(self, contributor): # Adapted from logic used in `api/citations/utils.py` # If the user has a family and given name, use those @@ -249,7 +271,7 @@ def _build_url(self, **query): def create_identifier(self, preprint, category, include_relation=True): if category == 'doi': - metadata = self.build_metadata(preprint, include_relation) + metadata = self.build_metadata(preprint, include_relation, include_unversioned_doi=True) doi = self.build_doi(preprint) username, password = self.get_credentials() logger.info(f'Sending metadata for DOI {doi}:\n{metadata}') diff --git a/website/settings/defaults.py b/website/settings/defaults.py index fbe9b939ae1..d2d62cfcb60 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -711,6 +711,11 @@ class CeleryConfig: 'schedule': crontab(minute=0, hour=5), # Daily 12 a.m 'kwargs': {'dry_run': False}, }, + 'resync_preprint_dois_v1': { + 'task': 'osf.management.commands.resync_preprint_dois_v1', + 'schedule': crontab(minute=0, hour=5), # Daily 12 a.m EDT + 'kwargs': {'dry_run': False}, + }, }