Source code for academic_observatory_workflows.unpaywall_telescope.tests.test_tasks

# Copyright 2020 Curtin University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Author: Tuan Chien, James Diprose

import os
import unittest
from unittest.mock import patch
import tempfile

from airflow import AirflowException
import jsonlines
import pendulum
import vcr

from observatory_platform.sandbox.test_utils import SandboxTestCase
from academic_observatory_workflows.config import project_path
from academic_observatory_workflows.unpaywall_telescope.release import Changefile
from academic_observatory_workflows.unpaywall_telescope.tasks import (
    snapshot_url,
    get_snapshot_file_name,
    changefiles_url,
    changefile_download_url,
    get_unpaywall_changefiles,
    unpaywall_filename_to_datetime,
    unpaywall_transform_file,
    unpaywall_transform_row,
    UNPAYWALL_BASE_URL,
)


# Folder holding the recorded vcr cassettes used by the tests in this module.
FIXTURES_FOLDER = project_path("unpaywall_telescope", "tests", "fixtures")
class TestUnpaywallUtils(SandboxTestCase):
    """Tests for the Unpaywall telescope's Changefile class and its URL / filename helpers."""

    def test_changefile(self):
        """Check the Changefile path properties and its to_dict / from_dict round trip."""

        class MockRelease:
            """Stand-in release object exposing only the three folder properties."""

            @property
            def download_folder(self):
                return "/path/to/download/"

            @property
            def extract_folder(self):
                return "/path/to/extract/"

            @property
            def transform_folder(self):
                return "/path/to/transform/"

        cf = Changefile(
            "changed_dois_with_versions_2020-03-11T005336.jsonl.gz",
            pendulum.datetime(2020, 3, 11, 0, 53, 36),
            changefile_release=MockRelease(),
        )

        # download_file_path
        self.assertEqual(
            "/path/to/download/changed_dois_with_versions_2020-03-11T005336.jsonl.gz", cf.download_file_path
        )

        # extract_file_path
        self.assertEqual("/path/to/extract/changed_dois_with_versions_2020-03-11T005336.jsonl", cf.extract_file_path)

        # transform_file_path
        self.assertEqual(
            "/path/to/transform/changed_dois_with_versions_2020-03-11T005336.jsonl", cf.transform_file_path
        )

        # from_dict
        dict_ = dict(filename=cf.filename, changefile_date=cf.changefile_date.isoformat())
        self.assertEqual(cf, Changefile.from_dict(dict_))

        # to_dict
        self.assertEqual(dict_, cf.to_dict())

    def test_snapshot_url(self):
        """The snapshot URL is the base URL plus the snapshot feed path and the API key."""

        url = snapshot_url(UNPAYWALL_BASE_URL, "my-api-key")
        self.assertEqual("https://api.unpaywall.org/feed/snapshot?api_key=my-api-key", url)

    def test_changefiles_url(self):
        """The changefiles URL requests the daily interval and carries the API key."""

        url = changefiles_url(UNPAYWALL_BASE_URL, "my-api-key")
        self.assertEqual("https://api.unpaywall.org/feed/changefiles?interval=day&api_key=my-api-key", url)

    def test_changefile_download_url(self):
        """The download URL embeds the changefile name in the daily-feed path."""

        url = changefile_download_url(
            UNPAYWALL_BASE_URL, "changed_dois_with_versions_2020-03-11T005336.jsonl.gz", "my-api-key"
        )
        self.assertEqual(
            "https://api.unpaywall.org/daily-feed/changefile/changed_dois_with_versions_2020-03-11T005336.jsonl.gz?api_key=my-api-key",
            url,
        )

    def test_unpaywall_filename_to_datetime(self):
        """Datetimes are parsed from snapshot and changefile names, with or without a time part."""

        # Snapshot filename
        filename = "unpaywall_snapshot_2023-04-25T083002.jsonl.gz"
        dt = unpaywall_filename_to_datetime(filename)
        self.assertEqual(pendulum.datetime(2023, 4, 25, 8, 30, 2), dt)

        # Changefile filename
        filename = "changed_dois_with_versions_2020-03-11T005336.jsonl.gz"
        dt = unpaywall_filename_to_datetime(filename)
        self.assertEqual(pendulum.datetime(2020, 3, 11, 0, 53, 36), dt)

        # Filename without time component: expected to resolve to midnight
        filename = "changed_dois_with_versions_2020-03-11.jsonl.gz"
        dt = unpaywall_filename_to_datetime(filename)
        self.assertEqual(pendulum.datetime(2020, 3, 11, 0, 0, 0), dt)

    @patch("observatory_platform.url_utils.get_http_text_response")
    def test_get_unpaywall_changefiles(self, m_get_http_text_response):
        """Only jsonl changefiles are expected back, ordered from oldest to newest."""

        # Don't use vcr here because the actual returned data contains API keys and it is a lot of data.
        # The payload is one JSON document, split into adjacent literals (one per entry) for readability.
        # It lists the files newest first and ends with a csv entry that is absent from the expected result.
        m_get_http_text_response.return_value = (
            '{"list":['
            '{"date":"2023-04-25","filename":"changed_dois_with_versions_2023-04-25T080001.jsonl.gz","filetype":"jsonl","last_modified":"2023-04-25T08:03:12","lines":310346,"size":143840367,"url":"https://api.unpaywall.org/daily-feed/changefile/changed_dois_with_versions_2023-04-25T080001.jsonl.gz?api_key=my-api-key"},'
            '{"date":"2023-04-24","filename":"changed_dois_with_versions_2023-04-24T080001.jsonl.gz","filetype":"jsonl","last_modified":"2023-04-24T08:04:49","lines":220800,"size":112157260,"url":"https://api.unpaywall.org/daily-feed/changefile/changed_dois_with_versions_2023-04-24T080001.jsonl.gz?api_key=my-api-key"},'
            '{"date":"2023-04-23","filename":"changed_dois_with_versions_2023-04-23T080001.jsonl.gz","filetype":"jsonl","last_modified":"2023-04-23T08:03:54","lines":213140,"size":105247617,"url":"https://api.unpaywall.org/daily-feed/changefile/changed_dois_with_versions_2023-04-23T080001.jsonl.gz?api_key=my-api-key"},'
            '{"date":"2023-02-24","filename":"changed_dois_with_versions_2023-02-24.jsonl.gz","filetype":"jsonl","last_modified":"2023-03-21T01:51:18","lines":5,"size":6301,"url":"https://api.unpaywall.org/daily-feed/changefile/changed_dois_with_versions_2023-02-24.jsonl.gz?api_key=my-api-key"},'
            '{"date":"2020-03-11","filename":"changed_dois_with_versions_2020-03-11T005336.csv.gz","filetype":"csv","last_modified":"2020-03-11T01:27:04","lines":1806534,"size":195900034,"url":"https://api.unpaywall.org/daily-feed/changefile/changed_dois_with_versions_2020-03-11T005336.csv.gz?api_key=my-api-key"}'
            "]}"
        )

        expected_changefiles = [
            Changefile("changed_dois_with_versions_2023-02-24.jsonl.gz", pendulum.datetime(2023, 2, 24)),
            Changefile(
                "changed_dois_with_versions_2023-04-23T080001.jsonl.gz", pendulum.datetime(2023, 4, 23, 8, 0, 1)
            ),
            Changefile(
                "changed_dois_with_versions_2023-04-24T080001.jsonl.gz", pendulum.datetime(2023, 4, 24, 8, 0, 1)
            ),
            Changefile(
                "changed_dois_with_versions_2023-04-25T080001.jsonl.gz", pendulum.datetime(2023, 4, 25, 8, 0, 1)
            ),
        ]

        url = changefiles_url(UNPAYWALL_BASE_URL, "my-api-key")
        changefiles = get_unpaywall_changefiles(url)
        self.assertEqual(expected_changefiles, changefiles)

    def test_get_snapshot_file_name(self):
        """Replay recorded responses for the success, missing-location and bad-key cases."""

        # This cassette was run with a valid api key, which is not saved into the cassette or code
        # Set UNPAYWALL_API_KEY to use the real api key
        api_key = os.getenv("UNPAYWALL_API_KEY", "my-api-key")

        with vcr.use_cassette(
            os.path.join(FIXTURES_FOLDER, "get_snapshot_file_name_success.yaml"),
            filter_query_parameters=["api_key"],
        ):
            filename = get_snapshot_file_name(UNPAYWALL_BASE_URL, api_key)
            self.assertEqual("unpaywall_snapshot_2023-04-25T083002.jsonl.gz", filename)

        with vcr.use_cassette(
            os.path.join(FIXTURES_FOLDER, "get_snapshot_file_name_missing_location.yaml"),
            filter_query_parameters=["api_key"],
        ):
            with self.assertRaisesRegex(AirflowException, 'Missing "location"'):
                get_snapshot_file_name(UNPAYWALL_BASE_URL, api_key)

        # An invalid API key
        with vcr.use_cassette(
            os.path.join(FIXTURES_FOLDER, "get_snapshot_file_name_failure.yaml"),
            filter_query_parameters=["api_key"],
        ):
            with self.assertRaisesRegex(AirflowException, "Unexpected status code"):
                get_snapshot_file_name(UNPAYWALL_BASE_URL, "invalid-api-key")
class TestUnpaywallTransform(unittest.TestCase):
    """Tests for the row-level and file-level Unpaywall transform functions."""

    def test_transform_file(self):
        """Transform a one-row JSON Lines file and check the cleaned output row.

        The assertions expect hyphenated keys to come back with underscores, "null"
        strings and None entries to be dropped from lists, and a "null" string value
        to become None.
        """

        with tempfile.NamedTemporaryFile(mode="w+") as f_in, tempfile.NamedTemporaryFile(mode="r") as f_out:
            # Sample input row
            input_data = {"author-list": ["Alice", "null", None], "null-str": "null", "nested": {"some-key": "value"}}

            # Write to input
            with jsonlines.Writer(f_in) as writer:
                writer.write(input_data)
            f_in.flush()  # Flush because the temp file is still open

            # Transform
            unpaywall_transform_file(f_in.name, f_out.name)

            # Read output
            with jsonlines.Reader(f_out) as reader:
                output = list(reader)

            # Check result
            self.assertEqual(len(output), 1)
            row = output[0]
            self.assertEqual(row["author_list"], ["Alice"])
            self.assertIsNone(row["null_str"])
            self.assertEqual(row["nested"]["some_key"], "value")

    def test_transform_row(self):
        """Transform a single nested row and compare it field by field with the expected values."""

        input_row = {
            "title": "Example",
            "author-list": ["Alice", None, "Bob"],
            "publication-info": {"journal-name": "Science", "publisher": "null", "identifiers": [None, "123", "null"]},
            "null-field": "null",
        }

        expected_output = {
            "title": "Example",
            "author_list": ["Alice", "Bob"],
            "publication_info": {
                "journal_name": "Science",
                "publisher": None,
                "identifiers": ["123"],
            },
            "null_field": None,
        }

        result = unpaywall_transform_row(input_row)

        self.assertEqual(result["title"], expected_output["title"])
        self.assertEqual(result["author_list"], expected_output["author_list"])
        self.assertEqual(
            result["publication_info"]["journal_name"], expected_output["publication_info"]["journal_name"]
        )
        self.assertEqual(result["publication_info"]["publisher"], expected_output["publication_info"]["publisher"])
        self.assertEqual(result["publication_info"]["identifiers"], expected_output["publication_info"]["identifiers"])
        self.assertEqual(result["null_field"], expected_output["null_field"])