From 282b7de59c792db3f019bc9ecef2f23954541abe Mon Sep 17 00:00:00 2001 From: Wolf Date: Thu, 12 Dec 2024 15:00:40 +0000 Subject: [PATCH 1/3] Playwight --- .github/workflows/cicd.yml | 3 + .github/workflows/generate-release.yml | 3 + .github/workflows/generate-test-release.yml | 3 + CITATION.cff | 4 +- README.md | 28 +++- setup.py | 2 +- test.py | 33 +++++ tests/{testconf.py => conftest.py} | 149 ++++++++++---------- tests/test_pypi_extractor.py | 58 +++----- wolfsoftware/pypi_extractor/pypi.py | 69 ++++++--- 10 files changed, 219 insertions(+), 133 deletions(-) create mode 100644 test.py rename tests/{testconf.py => conftest.py} (69%) diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index 74f5674..f506892 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -202,6 +202,9 @@ jobs: - name: Install Pytest run: pip install pytest pytest-mock + - name: Setup PlayWright + run: playwright install && playwright install-deps + - name: Run Pytest run: pytest --no-header -vv diff --git a/.github/workflows/generate-release.yml b/.github/workflows/generate-release.yml index 687efdc..290196d 100644 --- a/.github/workflows/generate-release.yml +++ b/.github/workflows/generate-release.yml @@ -76,6 +76,9 @@ jobs: - name: Install Pytest run: pip install pytest pytest-mock + - name: Setup PlayWright + run: playwright install && playwright install-deps + - name: Run Pytest run: pytest --no-header -vv diff --git a/.github/workflows/generate-test-release.yml b/.github/workflows/generate-test-release.yml index 8d73889..5a3260a 100644 --- a/.github/workflows/generate-test-release.yml +++ b/.github/workflows/generate-test-release.yml @@ -75,6 +75,9 @@ jobs: - name: Install Pytest run: pip install pytest pytest-mock + - name: Setup PlayWright + run: playwright install && playwright install-deps + - name: Run Pytest run: pytest --no-header -vv diff --git a/CITATION.cff b/CITATION.cff index c3245ae..451e733 100644 --- a/CITATION.cff +++ 
b/CITATION.cff @@ -3,8 +3,8 @@ message: If you use this software, please cite it using these metadata. title: PyPi Extractor abstract: Extract package information for a given user in PyPi. type: software -version: 0.1.2 -date-released: 2024-06-26 +version: 0.1.3 +date-released: 2024-12-12 repository-code: https://github.com/DevelopersToolbox/pypi-extractor-package keywords: - "Wolf Software" diff --git a/README.md b/README.md index 770e9c7..d25e899 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,22 @@ PyPI Extractor is a Python package designed to fetch and process detailed inform Python Package Index (PyPI). This package is particularly useful for users who want to retrieve and analyze metadata for packages maintained by a specific PyPI user. +## Significant Update From 0.1.3 + +pypi.org no longer allows you to scrape details using the requests package, or any package that does not support JavaScript. To resolve this, we have +updated this package to utilise [Playwright](https://pypi.org/project/playwright/) when retrieving a list of packages for a given user. While we have +attempted to automate as much as possible, you might want to do some of the work manually. + +Playwright needs two commands to be run in order for it to function correctly: + +``` +playwright install +playwright install-deps +``` + +We have added an `auto_install` option to the main class so that you can instruct the package to do the install for you; this helps when installing the +package in a fully automated way, e.g. Puppet or similar. + ## Features - Retrieve a list of packages maintained by a specific PyPI user. @@ -116,11 +132,13 @@ print(package_details) A class to fetch and process package details for a given PyPI user. -##### `__init__(self, username: str)` +##### `__init__(self, username: str, verbose: bool, auto_install: bool)` - Initializes the `PyPiExtractor` with a username. - Parameters: - `username` (str): The PyPI username. 
+ - `verbose` (bool): Verbose output (Default: False) + - `auto_install` (bool): Auto install PlayWright dependencies (Default: False) - Raises: - `PyPiExtractorError`: If the username is not provided. @@ -132,6 +150,14 @@ A class to fetch and process package details for a given PyPI user. - Raises: - `PyPiExtractorError`: If the username is not provided. +##### `enable_verbose(self)` + +- Enable verbose mode. + +##### `enable_auto_install(self)` + +- Enable auto install. + ##### `get_user_packages(self) -> list` - Fetches the list of packages for the given PyPI user. diff --git a/setup.py b/setup.py index 869e7b7..2f3cafe 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='wolfsoftware.pypi-extractor', - version='0.1.2', + version='0.1.3', author='Wolf Software', author_email='pypi@wolfsoftware.com', description='Extract package information for a given user in PyPi.', diff --git a/test.py b/test.py new file mode 100644 index 0000000..eb50dca --- /dev/null +++ b/test.py @@ -0,0 +1,33 @@ +import json + +from wolfsoftware.pypi_extractor import PyPiExtractor, PyPiExtractorError + + +def get_package_list(username="wolfsoftware"): + """ + Retrieves a list of packages for the specified user. + + Args: + username (str): Username to fetch the PyPi packages for. + + Returns: + list: A sorted list of package names. 
+ """ + pypi_info = PyPiExtractor(verbose=True) + pypi_info.set_username(username) + + try: + packages_details = pypi_info.get_all_packages_details() + return(packages_details) + except PyPiExtractorError as e: + print(f"An error occurred while fetching packages: {e.message}") + return [] + + +def main(): + packages = get_package_list() + print(packages) + + +if __name__ == "__main__": + main() diff --git a/tests/testconf.py b/tests/conftest.py similarity index 69% rename from tests/testconf.py rename to tests/conftest.py index 5404db8..ab24880 100644 --- a/tests/testconf.py +++ b/tests/conftest.py @@ -16,32 +16,56 @@ import requests +def raise_error(*args, **kwargs): + """Raise an error if the real playwright gets used.""" + raise RuntimeError("Real Playwright should not be invoked!") + + @pytest.fixture -def mock_get_user_packages_success() -> Generator[Union[MagicMock, AsyncMock], Any, None]: - """Fixture to mock requests.get for get_user_packages success case.""" - with patch('requests.get') as mock_get: - mock_response = Mock() - mock_response.raise_for_status.return_value = None - mock_response.text = ''' - -

Package1

-

Description1

-
- -

Package2

-

Description2

-
- ''' - mock_get.return_value = mock_response - yield mock_get +def mock_playwright() -> Generator[MagicMock, None, None]: + """Mock the Playwright sync API.""" + with patch('wolfsoftware.pypi_extractor.pypi.sync_playwright') as mock_sync_playwright: + mock_playwright_instance = MagicMock() + mock_browser = MagicMock() + mock_context = MagicMock() + mock_page = MagicMock() + + # Mock page.goto() and page.wait_for_selector() + mock_page.goto.return_value = None + mock_page.wait_for_selector.return_value = None + + # Mock page.query_selector_all() to return simulated package elements + def mock_query_selector_all(selector): + """Handle mocking the right data.""" + if selector == 'a.package-snippet': + return [ + MagicMock(query_selector=MagicMock(side_effect=[ + MagicMock(inner_text=MagicMock(return_value="Package1")), + MagicMock(inner_text=MagicMock(return_value="Description1")), + ])), + MagicMock(query_selector=MagicMock(side_effect=[ + MagicMock(inner_text=MagicMock(return_value="Package2")), + MagicMock(inner_text=MagicMock(return_value="Description2")), + ])), + ] + return [] + mock_page.query_selector_all.side_effect = mock_query_selector_all + + mock_context.new_page.return_value = mock_page + mock_browser.new_context.return_value = mock_context + mock_playwright_instance.chromium.launch.return_value = mock_browser + mock_sync_playwright.return_value.__enter__.return_value = mock_playwright_instance + yield mock_sync_playwright @pytest.fixture -def mock_get_user_packages_error() -> Generator[Union[MagicMock, AsyncMock], Any, None]: - """Fixture to mock requests.get for get_user_packages error case.""" - with patch('requests.get') as mock_get: - mock_get.side_effect = requests.RequestException("Request error") - yield mock_get +def mock_playwright_error() -> Generator[MagicMock, None, None]: + """Fixture to mock Playwright with an error scenario.""" + with patch('wolfsoftware.pypi_extractor.pypi.sync_playwright') as mock_sync_playwright: + 
mock_playwright_instance = MagicMock() + mock_playwright_instance.chromium.launch.side_effect = Exception("Playwright error") + mock_sync_playwright.return_value.__enter__.return_value = mock_playwright_instance + yield mock_sync_playwright @pytest.fixture @@ -155,24 +179,13 @@ def mock_get_package_details_error() -> Generator[Union[MagicMock, AsyncMock], A @pytest.fixture -def mock_get_all_packages_details_success() -> Generator[Union[MagicMock, AsyncMock], Any, None]: - """Fixture to mock requests.get for get_all_packages_details success case.""" +def mock_get_all_packages_details_success() -> Generator[MagicMock, None, None]: + """Mock requests.get for get_all_packages_details success case.""" with patch('requests.get') as mock_get: - mock_response_user = Mock() + # Mock response for the user packages API + mock_response_user = MagicMock() mock_response_user.raise_for_status.return_value = None - mock_response_user.text = ''' - -

Package1

-

Description1

-
- -

Package2

-

Description2

-
- ''' - mock_response_package1 = Mock() - mock_response_package1.raise_for_status.return_value = None - mock_response_package1.json.return_value = { + mock_response_user.json.return_value = { 'info': { 'name': 'Package1', 'version': '1.0.0', @@ -186,37 +199,30 @@ def mock_get_all_packages_details_success() -> Generator[Union[MagicMock, AsyncM 'requires_python': '>=3.6', }, 'releases': { - '0.9.0': [ - { - 'upload_time': '2021-01-01T00:00:00', - 'upload_time_iso_8601': '2021-01-01T00:00:00Z', - 'python_version': 'py3', - 'url': 'https://example.com', - 'filename': 'package-0.9.0.tar.gz', - 'packagetype': 'sdist', - 'md5_digest': 'abc123', - 'digests': {'sha256': 'def456'}, - 'size': 12345 - } - ], '1.0.0': [ { 'upload_time': '2021-06-01T00:00:00', 'upload_time_iso_8601': '2021-06-01T00:00:00Z', 'python_version': 'py3', - 'url': 'https://example.com', + 'url': 'https://example.com/package-1.0.0.tar.gz', 'filename': 'package-1.0.0.tar.gz', 'packagetype': 'sdist', - 'md5_digest': 'ghi789', - 'digests': {'sha256': 'jkl012'}, - 'size': 23456 + 'md5_digest': 'abc123', + 'digests': {'sha256': 'def456'}, + 'size': 12345 } - ], + ] }, 'requires_dist': ['requests', 'beautifulsoup4'], 'urls': [{'url': 'https://example.com/package-1.0.0.tar.gz'}], } - mock_response_package2 = Mock() + + # Simulate two different package details responses + mock_response_package1 = MagicMock() + mock_response_package1.raise_for_status.return_value = None + mock_response_package1.json.return_value = mock_response_user.json.return_value + + mock_response_package2 = MagicMock() mock_response_package2.raise_for_status.return_value = None mock_response_package2.json.return_value = { 'info': { @@ -226,41 +232,30 @@ def mock_get_all_packages_details_success() -> Generator[Union[MagicMock, AsyncM 'author': 'Author2', 'author_email': 'author2@example.com', 'license': 'MIT', - 'home_page': 'https://example.com', - 'keywords': 'example, package', + 'home_page': 'https://example.com/package2', + 'keywords': 
'example, package2', 'classifiers': ['Development Status :: 5 - Production/Stable'], 'requires_python': '>=3.6', }, 'releases': { - '1.0.0': [ - { - 'upload_time': '2021-01-01T00:00:00', - 'upload_time_iso_8601': '2021-01-01T00:00:00Z', - 'python_version': 'py3', - 'url': 'https://example.com', - 'filename': 'package-1.0.0.tar.gz', - 'packagetype': 'sdist', - 'md5_digest': 'abc123', - 'digests': {'sha256': 'def456'}, - 'size': 12345 - } - ], '2.0.0': [ { - 'upload_time': '2021-06-01T00:00:00', - 'upload_time_iso_8601': '2021-06-01T00:00:00Z', + 'upload_time': '2022-06-01T00:00:00', + 'upload_time_iso_8601': '2022-06-01T00:00:00Z', 'python_version': 'py3', - 'url': 'https://example.com', + 'url': 'https://example.com/package-2.0.0.tar.gz', 'filename': 'package-2.0.0.tar.gz', 'packagetype': 'sdist', 'md5_digest': 'ghi789', 'digests': {'sha256': 'jkl012'}, 'size': 23456 } - ], + ] }, 'requires_dist': ['requests', 'beautifulsoup4'], 'urls': [{'url': 'https://example.com/package-2.0.0.tar.gz'}], } - mock_get.side_effect = [mock_response_user, mock_response_package1, mock_response_package2] + + # Simulate the sequence of requests + mock_get.side_effect = [mock_response_package1, mock_response_package2] yield mock_get diff --git a/tests/test_pypi_extractor.py b/tests/test_pypi_extractor.py index d4b9aa0..22daadf 100644 --- a/tests/test_pypi_extractor.py +++ b/tests/test_pypi_extractor.py @@ -10,13 +10,6 @@ import pytest from wolfsoftware.pypi_extractor import PyPiExtractor, PyPiExtractorError # pylint: disable=unused-import, no-name-in-module -from .testconf import ( # noqa: F401 pylint: disable=unused-import - mock_get_user_packages_success, - mock_get_user_packages_error, - mock_get_package_details_success, - mock_get_package_details_error, - mock_get_all_packages_details_success -) def test_version() -> None: @@ -78,38 +71,29 @@ def test_set_username_with_invalid_value() -> None: pypi_info.set_username("") -def 
test_get_user_packages_success(mock_get_user_packages_success) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument - """ - Test get_user_packages method for a successful case. - - This test uses the mock_get_user_packages_success fixture to mock requests.get method - to return a successful response and verifies that the get_user_packages method returns - the expected list of packages. - """ - pypi_info = PyPiExtractor("testuser") - packages: List = pypi_info.get_user_packages() - - assert len(packages) == 2 # nosec: B101 - assert packages[0]['name'] == "Package1" # nosec: B101 - assert packages[0]['summary'] == "Description1" # nosec: B101 - assert packages[1]['name'] == "Package2" # nosec: B101 - assert packages[1]['summary'] == "Description2" # nosec: B101 +@pytest.mark.usefixtures("mock_playwright") +def test_get_user_packages_success() -> None: + """Test the get_user_packages method for a successful case.""" + pypi_extractor = PyPiExtractor("testuser") + packages = pypi_extractor.get_user_packages() + assert len(packages) == 2 + assert packages[0]['name'] == "Package1" + assert packages[0]['summary'] == "Description1" + assert packages[1]['name'] == "Package2" + assert packages[1]['summary'] == "Description2" -def test_get_user_packages_error(mock_get_user_packages_error) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument - """ - Test get_user_packages method when there is an error. - This test uses the mock_get_user_packages_error fixture to mock requests.get method - to raise an exception and verifies that the get_user_packages method raises a PyPiExtractorError. 
- """ - pypi_info = PyPiExtractor("testuser") - - with pytest.raises(PyPiExtractorError, match="Error fetching user profile: Request error"): - pypi_info.get_user_packages() +@pytest.mark.usefixtures("mock_playwright_error") +def test_get_user_packages_error() -> None: + """Test the get_user_packages method when Playwright fails.""" + pypi_extractor = PyPiExtractor("testuser") + with pytest.raises(PyPiExtractorError, match="Error fetching user profile with Playwright"): + pypi_extractor.get_user_packages() -def test_get_package_details_success(mock_get_package_details_success) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument +@pytest.mark.usefixtures("mock_get_package_details_success") +def test_get_package_details_success() -> None: """ Test get_package_details method for a successful case. @@ -136,7 +120,8 @@ def test_get_package_details_success(mock_get_package_details_success) -> None: assert details['older_versions'][0]['version'] == "0.9.0" # nosec: B101 -def test_get_package_details_error(mock_get_package_details_error) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument +@pytest.mark.usefixtures("mock_get_package_details_error") +def test_get_package_details_error() -> None: """ Test get_package_details method when there is an error. @@ -149,7 +134,8 @@ def test_get_package_details_error(mock_get_package_details_error) -> None: # n pypi_info.get_package_details("Package1") -def test_get_all_packages_details_success(mock_get_all_packages_details_success) -> None: # noqa: F811 pylint: disable=redefined-outer-name, unused-argument +@pytest.mark.usefixtures("mock_playwright", "mock_get_all_packages_details_success") +def test_get_all_packages_details_success() -> None: """ Test get_all_packages_details method for a successful case. 
diff --git a/wolfsoftware/pypi_extractor/pypi.py b/wolfsoftware/pypi_extractor/pypi.py index 1b07b4b..0882e83 100644 --- a/wolfsoftware/pypi_extractor/pypi.py +++ b/wolfsoftware/pypi_extractor/pypi.py @@ -7,8 +7,11 @@ from typing import Any, Dict, List, Optional import json +import subprocess # nosec: B404 + import requests -from bs4 import BeautifulSoup + +from playwright.sync_api import sync_playwright from .exceptions import PyPiExtractorError @@ -21,7 +24,7 @@ class PyPiExtractor: username (Optional[str]): The PyPI username whose packages are to be fetched. """ - def __init__(self, username: Optional[str] = None) -> None: + def __init__(self, username: Optional[str] = None, verbose: Optional[bool] = False, auto_install: Optional[bool] = False) -> None: """ Initialize the PyPIPackageInfo. The username can be set during initialization or later using the set_username method. @@ -29,6 +32,8 @@ def __init__(self, username: Optional[str] = None) -> None: username (Optional[str]): The PyPI username. Default is None. 
""" self.username: Optional[str] = username + self.verbose: Optional[bool] = verbose + self.auto_install: Optional[bool] = auto_install def set_username(self, username: str) -> None: """ @@ -44,6 +49,31 @@ def set_username(self, username: str) -> None: raise PyPiExtractorError("Username must be provided") self.username = username + def enable_verbose(self) -> None: + """Enable verbose output.""" + self.verbose = True + + def enable_auto_install(self) -> None: + """Enable auto_install.""" + self.auto_install = True + + def ensure_playwright_browsers_and_deps(self) -> None: + """Ensure Playwright browsers and system dependencies are installed silently.""" + if self.auto_install: + try: + # Install Playwright browsers silently + subprocess.run(["playwright", "install"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # nosec: B603 B607 + if self.verbose: + print("Playwright browsers installed successfully.") + + # Install system-level dependencies silently (Linux only) + subprocess.run(["playwright", "install-deps"], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) # nosec: B603 B607 + if self.verbose: + print("System dependencies installed successfully.") + except subprocess.CalledProcessError as e: + print(f"Error during Playwright setup: {e}") + raise + def get_user_packages(self) -> List[Dict[str, str]]: """ Fetch the list of packages for the given PyPI user. @@ -52,27 +82,34 @@ def get_user_packages(self) -> List[Dict[str, str]]: list: A list of dictionaries containing package names and summaries. Raises: - PyPIPackageInfoError: If the username is not set or if there is an error fetching or parsing the user profile. + PyPiExtractorError: If the username is not set or if there is an error fetching or parsing the user profile. 
""" if not self.username: raise PyPiExtractorError("Username must be set before fetching packages") profile_url: str = "https://pypi.org/user/" + self.username + "/" + packages: List[Dict[str, str]] = [] + try: - response: requests.Response = requests.get(profile_url, timeout=10) - response.raise_for_status() - except requests.RequestException as e: - raise PyPiExtractorError(f"Error fetching user profile: {e}") from e + self.ensure_playwright_browsers_and_deps() - soup = BeautifulSoup(response.text, 'html.parser') - packages: List[Dict[str, str]] = [] - for project in soup.find_all('a', class_='package-snippet'): - try: - package_name: str = project.find('h3', class_='package-snippet__title').text.strip() - summary: str = project.find('p', class_='package-snippet__description').text.strip() - packages.append({'name': package_name, 'summary': summary}) - except AttributeError as e: - raise PyPiExtractorError(f"Error parsing package details: {e}") from e + with sync_playwright() as p: + browser: Any = p.chromium.launch(headless=True) + context: Any = browser.new_context() + page: Any = context.new_page() + + page.goto(profile_url) + page.wait_for_selector('.package-snippet') + + elements: Any = page.query_selector_all('a.package-snippet') + for element in elements: + package_name: Any = element.query_selector('h3.package-snippet__title').inner_text().strip() + summary: Any = element.query_selector('p.package-snippet__description').inner_text().strip() + packages.append({'name': package_name, 'summary': summary}) + + browser.close() + except Exception as e: + raise PyPiExtractorError(f"Error fetching user profile with Playwright: {e}") from e return packages From af1ff8ac7de6e664f2fb509c49cf9f5594b911eb Mon Sep 17 00:00:00 2001 From: Wolf Date: Thu, 12 Dec 2024 15:09:03 +0000 Subject: [PATCH 2/3] cleanup --- requirements.txt | 1 + test.py | 33 --------------------------------- 2 files changed, 1 insertion(+), 33 deletions(-) delete mode 100644 test.py diff --git 
a/requirements.txt b/requirements.txt index 74c270f..d88d7e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests==2.32.3 beautifulsoup4==4.12.3 +playwright==1.49.1 diff --git a/test.py b/test.py deleted file mode 100644 index eb50dca..0000000 --- a/test.py +++ /dev/null @@ -1,33 +0,0 @@ -import json - -from wolfsoftware.pypi_extractor import PyPiExtractor, PyPiExtractorError - - -def get_package_list(username="wolfsoftware"): - """ - Retrieves a list of packages for the specified user. - - Args: - username (str): Username to fetch the PyPi packages for. - - Returns: - list: A sorted list of package names. - """ - pypi_info = PyPiExtractor(verbose=True) - pypi_info.set_username(username) - - try: - packages_details = pypi_info.get_all_packages_details() - return(packages_details) - except PyPiExtractorError as e: - print(f"An error occurred while fetching packages: {e.message}") - return [] - - -def main(): - packages = get_package_list() - print(packages) - - -if __name__ == "__main__": - main() From 52b1bc3064bacd098e202f41e83b3a2e54dcdbf0 Mon Sep 17 00:00:00 2001 From: Wolf Date: Thu, 12 Dec 2024 15:11:50 +0000 Subject: [PATCH 3/3] fix bandit issues --- tests/test_pypi_extractor.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_pypi_extractor.py b/tests/test_pypi_extractor.py index 22daadf..b83a66c 100644 --- a/tests/test_pypi_extractor.py +++ b/tests/test_pypi_extractor.py @@ -75,13 +75,13 @@ def test_set_username_with_invalid_value() -> None: def test_get_user_packages_success() -> None: """Test the get_user_packages method for a successful case.""" pypi_extractor = PyPiExtractor("testuser") - packages = pypi_extractor.get_user_packages() + packages: List[Dict[str, str]] = pypi_extractor.get_user_packages() - assert len(packages) == 2 - assert packages[0]['name'] == "Package1" - assert packages[0]['summary'] == "Description1" - assert packages[1]['name'] == "Package2" - assert 
packages[1]['summary'] == "Description2" + assert len(packages) == 2 # nosec: B101 + assert packages[0]['name'] == "Package1" # nosec: B101 + assert packages[0]['summary'] == "Description1" # nosec: B101 + assert packages[1]['name'] == "Package2" # nosec: B101 + assert packages[1]['summary'] == "Description2" # nosec: B101 @pytest.mark.usefixtures("mock_playwright_error")