From 221b71ba7f0f7f2c5465081815608e95424747e1 Mon Sep 17 00:00:00 2001 From: Juro Oravec Date: Thu, 23 Jan 2025 23:29:02 +0100 Subject: [PATCH 1/6] feat: add HTML parser initial implementation --- .github/FUNDING.yml | 2 + .github/workflows/publish.yml | 179 +++++++++++ .github/workflows/tests.yml | 51 ++++ .gitignore | 96 ++++++ CHANGELOG.md | 11 + CODE_OF_CONDUCT.md | 76 +++++ Cargo.lock | 296 +++++++++++++++++++ Cargo.toml | 19 ++ README.md | 118 ++++++++ __init__.pyi | 36 +++ py.typed | 0 pyproject.toml | 96 ++++++ requirements-ci.in | 2 + requirements-ci.txt | 16 + requirements-dev.in | 2 + requirements-dev.txt | 16 + src/html_parser.rs | 540 ++++++++++++++++++++++++++++++++++ src/lib.rs | 11 + tests/benchmark.py | 97 ++++++ tests/test_html_parser.py | 176 +++++++++++ 20 files changed, 1840 insertions(+) create mode 100644 .github/FUNDING.yml create mode 100644 .github/workflows/publish.yml create mode 100644 .github/workflows/tests.yml create mode 100644 .gitignore create mode 100644 CHANGELOG.md create mode 100644 CODE_OF_CONDUCT.md create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 README.md create mode 100644 __init__.pyi create mode 100644 py.typed create mode 100644 pyproject.toml create mode 100644 requirements-ci.in create mode 100644 requirements-ci.txt create mode 100644 requirements-dev.in create mode 100644 requirements-dev.txt create mode 100644 src/html_parser.rs create mode 100644 src/lib.rs create mode 100644 tests/benchmark.py create mode 100644 tests/test_html_parser.py diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..97ca688 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,2 @@ +github: ["EmilStenstrom"] + diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..fb535f8 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,179 @@ +# This file is autogenerated by maturin v1.8.1 +# To update, run +# +# maturin 
generate-ci github +# +name: Publish to PyPI + +on: + push: + tags: + - '*' + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +permissions: + contents: read + +jobs: + linux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-22.04 + target: x86_64 + - runner: ubuntu-22.04 + target: x86 + - runner: ubuntu-22.04 + target: aarch64 + - runner: ubuntu-22.04 + target: armv7 + - runner: ubuntu-22.04 + target: s390x + - runner: ubuntu-22.04 + target: ppc64le + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: auto + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-linux-${{ matrix.platform.target }} + path: dist + + musllinux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-22.04 + target: x86_64 + - runner: ubuntu-22.04 + target: x86 + - runner: ubuntu-22.04 + target: aarch64 + - runner: ubuntu-22.04 + target: armv7 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: musllinux_1_2 + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-musllinux-${{ matrix.platform.target }} + path: dist + + windows: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: windows-latest + target: x64 + - runner: windows-latest + target: x86 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + architecture: ${{ matrix.platform.target }} + - name: Build wheels + uses: 
PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-windows-${{ matrix.platform.target }} + path: dist + + macos: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: macos-13 + target: x86_64 + - runner: macos-14 + target: aarch64 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-macos-${{ matrix.platform.target }} + path: dist + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: wheels-sdist + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} + needs: [linux, musllinux, windows, macos, sdist] + permissions: + # Use to sign the release artifacts + id-token: write + # Used to upload release artifacts + contents: write + # Used to generate artifact attestation + attestations: write + steps: + - uses: actions/download-artifact@v4 + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-path: 'wheels-*/*' + - name: Publish to PyPI + if: ${{ startsWith(github.ref, 'refs/tags/') }} + uses: PyO3/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --non-interactive --skip-existing wheels-*/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 
100644 index 0000000..0142e31 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,51 @@ +name: Run tests + +on: + push: + branches: + - 'main' + - 'dev' + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] + os: [ubuntu-20.04, windows-latest] + + steps: + - uses: actions/checkout@v4 + + # First check Rust tests + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + components: rustfmt, clippy + + - name: Cache Rust dependencies + uses: Swatinem/rust-cache@v2 + + - name: Run Rust tests + run: cargo test + + # After Rust tests pass, run Python tests next + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements-ci.txt + + - name: Build Python package + run: maturin develop + + - name: Run Python tests + run: pytest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6bae0a7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,96 @@ +/target + +# Byte-compiled / optimized / DLL files +__pycache__/ +.pytest_cache/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg +include/ +man/ +venv/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +*.sqlite3 + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# VSCode +.vscode + +# Poetry +# lock file is not needed for development +# as project supports variety of Django versions +poetry.lock + +# PyCharm +.idea/ + +# Python environment +.venv/ +.DS_Store +.python-version +site +.direnv/ +.envrc + +# JS, NPM Dependency directories +node_modules/ +jspm_packages/ + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# Rope +.ropeproject \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e1a9e5e --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +# Release notes + +## v1.0.0 + +Initial release. + +#### Feat + +- Parser can be configured to add attributes to the HTML elements. +- Parser optionally captures what attributes were set on HTML elements + identified by a specific attribute. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..ac35f2e --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at emil@emilstenstrom.se. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..aa2ac46 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,296 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "djc_core_html_parser" +version = "1.0.0" +dependencies = [ + "pyo3", + "quick-xml", +] + +[[package]] +name = "indoc" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] 
+name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "proc-macro2" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "pyo3" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e681a6cfdc4adcc93b4d3cf993749a4552018ee0a9b65fc0ccfad74352c72a38" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "parking_lot", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "076c73d0bc438f7a4ef6fdd0c3bb4732149136abd952b110ac93e4edb13a6ba5" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e53cee42e77ebe256066ba8aa77eff722b3bb91f3419177cf4cd0f304d3284d9" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfeb4c99597e136528c6dd7d5e3de5434d1ceaf487436a3f03b2d56b6fc9efd1" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = 
"0.19.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "947dc12175c254889edc0c02e399476c2f652b4b9ebd123aa655c224de259536" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "quick-xml" +version = "0.37.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003" +dependencies = [ + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +dependencies = [ + "bitflags", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + +[[package]] +name = "unicode-ident" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" + 
+[[package]] +name = "unindent" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..7eff0db --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "djc_core_html_parser" +version = "1.0.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +name = "djc_core_html_parser" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.19.0", features = ["extension-module"] } +quick-xml = "0.37.2" + +# https://ohadravid.github.io/posts/2023-03-rusty-python +[profile.release] +debug = true # Debug symbols for profiler. +lto = true # Link-time optimization. +codegen-units = 1 # Slower compilation but faster code. diff --git a/README.md b/README.md new file mode 100644 index 0000000..d43894e --- /dev/null +++ b/README.md @@ -0,0 +1,118 @@ +# djc-core-html-parser + +HTML parser used by [django-components](https://github.com/django-components/django-components). Written in Rust, exposed as a Python package with [maturin](https://www.maturin.rs/). + +This implementation was found to be 40-50x faster than our Python implementation, taking ~90ms to parse 5 MB of HTML. + +## Installation + +```sh +pip install djc-core-html-parser +``` + +## Usage + +```python +from djc_core_html_parser import transform_html + +html = '

Hello

' +result, _ = transform_html( + html, + # Add attributes to the root elements + root_attributes=['data-root-id'], + # Add attributes to all elements + all_attributes=['data-v-123'], +) +``` + +To save ourselves from re-parsing the HTML, `transform_html` returns not just the transformed HTML, but also a dictionary as the second item. + +This dictionary contains a record of which HTML attributes were written to which elements. + +To populate this dictionary, you need to set `watch_on_attribute` to an attribute name. + +Then, during the HTML transformation, we check each element for this attribute. And if the element HAS this attribute, we: + +1. Get the value of said attribute +2. Record the attributes that were added to the element, using the value of the watched attribute as the key. + +```python +from djc_core_html_parser import transform_html + +html = """ +
+

+ Hello +

+
+""" + +result, captured = transform_html( + html, + # Add attributes to the root elements + root_attributes=['data-root-id'], + # Add attributes to all elements + all_attributes=['data-djc-tag'], + # Watch for this attribute on elements + watch_on_attribute='data-watch-id', +) + +print(captured) +# { +# '123': ['data-root-id', 'data-djc-tag'], +# '456': ['data-djc-tag'], +# } +``` + +## Development + +1. Set up Python env + + ```sh + python -m venv .venv + ``` + +2. Install dependencies + + ```sh + pip install -r requirements-dev.txt + ``` + + The dev requirements also include `maturin` which is used for packaging a Rust project + as a Python package. + +3. Install Rust + + See https://www.rust-lang.org/tools/install + +4. Run Rust tests + + ```sh + cargo test + ``` + +5. Build the Python package + + ```sh + maturin develop + ``` + + To build the production-optimized package, use `maturin develop --release`. + +6. Run Python tests + + ```sh + pytest + ``` + + > NOTE: When running Python tests, you need to run `maturin develop` first. + +## Deployment + +Deployment is done automatically via GitHub Actions. + +To publish a new version of the package, you need to: + +1. Bump the version in `pyproject.toml` and `Cargo.toml` +2. Open a PR and merge it to `main`. +3. Create a new tag on the `main` branch with the new version number (e.g. `v1.0.0`), or create a new release in the GitHub UI. diff --git a/__init__.pyi b/__init__.pyi new file mode 100644 index 0000000..9e2b7ad --- /dev/null +++ b/__init__.pyi @@ -0,0 +1,36 @@ +from typing import List, Dict, Optional + +def transform_html( + html: str, + root_attributes: List[str], + all_attributes: List[str], + expand_empty_elements: Optional[bool] = None, + check_end_names: Optional[bool] = None, + watch_on_attribute: Optional[str] = None, +) -> tuple[str, Dict[str, List[str]]]: + """ + Transform HTML by adding attributes to root and all elements. + + Args: + html (str): The HTML string to transform. 
Can be a fragment or full document. + root_attributes (List[str]): List of attribute names to add to root elements only. + all_attributes (List[str]): List of attribute names to add to all elements. + expand_empty_elements (Optional[bool]): Whether to expand self-closing tags into open/close pairs. Defaults to None. + check_end_names (Optional[bool]): Whether to validate matching of end tags. Defaults to None. + watch_on_attribute (Optional[str]): If set, captures which attributes were added to elements with this attribute. + + Returns: + A tuple containing: + - The transformed HTML string + - A dictionary mapping captured attribute values to lists of attributes that were added + to those elements. Only returned if watch_on_attribute is set, otherwise empty dict. + + Example: + >>> html = '

Hello

' + >>> transform_html(html, ['data-root-id'], ['data-v-123']) + '

Hello

' + + Raises: + ValueError: If the HTML is malformed or cannot be parsed. + """ + ... diff --git a/py.typed b/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..bda53c0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,96 @@ +[build-system] +requires = ["maturin>=1.8,<2.0"] +build-backend = "maturin" + +[project] +name = "djc_core_html_parser" +version = "1.0.0" +requires-python = ">=3.8, <4.0" +description = "HTML parser used by django-components written in Rust." +keywords = ["django", "components", "html"] +readme = "README.md" +authors = [ + {name = "Juro Oravec", email = "juraj.oravec.josefson@gmail.com"}, +] +classifiers = [ + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [] +license = {text = "MIT"} + +# See https://docs.pypi.org/project_metadata/#icons +[project.urls] +Homepage = "https://github.com/django-components/djc-core-html-parser/" +Changelog = "https://github.com/django-components/djc-core-html-parser/blob/main/CHANGELOG.md" +Issues = "https://github.com/django-components/djc-core-html-parser/issues" +Donate = "https://github.com/sponsors/EmilStenstrom" + +[tool.maturin] +features = ["pyo3/extension-module"] + +[tool.black] +line-length = 119 +include = '\.pyi?$' +exclude = ''' +/( + \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | activate + | _build + | buck-out + | build + | dist +)/ +''' + +[tool.isort] +profile = "black" +line_length = 119 +multi_line_output = 3 +include_trailing_comma = "True" 
+known_first_party = "djc_core_html_parser" + +[tool.flake8] +ignore = ['E302', 'W503'] +max-line-length = 119 +exclude = [ + 'migrations', + '__pycache__', + 'manage.py', + 'settings.py', + 'env', + '.env', + '.venv', + '.tox', + 'build', +] + +[tool.mypy] +check_untyped_defs = true +ignore_missing_imports = true +exclude = [ + 'build', +] + +[[tool.mypy.overrides]] +module = "djc_core_html_parser.*" +disallow_untyped_defs = true + + +[tool.pytest.ini_options] +testpaths = [ + "tests" +] diff --git a/requirements-ci.in b/requirements-ci.in new file mode 100644 index 0000000..33832cb --- /dev/null +++ b/requirements-ci.in @@ -0,0 +1,2 @@ +maturin +pytest \ No newline at end of file diff --git a/requirements-ci.txt b/requirements-ci.txt new file mode 100644 index 0000000..0607ebc --- /dev/null +++ b/requirements-ci.txt @@ -0,0 +1,16 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements-ci.in +# +iniconfig==2.0.0 + # via pytest +maturin==1.8.1 + # via -r requirements-ci.in +packaging==24.2 + # via pytest +pluggy==1.5.0 + # via pytest +pytest==8.3.4 + # via -r requirements-ci.in diff --git a/requirements-dev.in b/requirements-dev.in new file mode 100644 index 0000000..bd354f4 --- /dev/null +++ b/requirements-dev.in @@ -0,0 +1,2 @@ +maturin +pytest diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..ea6c360 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,16 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements-dev.in +# +iniconfig==2.0.0 + # via pytest +maturin==1.8.1 + # via -r requirements-dev.in +packaging==24.2 + # via pytest +pluggy==1.5.0 + # via pytest +pytest==8.3.4 + # via -r requirements-dev.in diff --git a/src/html_parser.rs b/src/html_parser.rs new file mode 100644 index 0000000..7adbd1e --- /dev/null +++ b/src/html_parser.rs @@ -0,0 +1,540 @@ +use 
pyo3::exceptions::PyValueError; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyTuple}; +use quick_xml::events::{BytesStart, Event}; +use quick_xml::reader::Reader; +use quick_xml::writer::Writer; +use std::collections::HashSet; +use std::io::Cursor; + +// List of HTML5 void elements. These can be written as `` or ``, +//e.g. `
`, ``, ``, etc. +const VOID_ELEMENTS: [&str; 14] = [ + "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", + "track", "wbr", +]; + +/// Transform HTML by adding attributes to the elements. +/// +/// Args: +/// html (str): The HTML string to transform. Can be a fragment or full document. +/// root_attributes (List[str]): List of attribute names to add to root elements only. +/// all_attributes (List[str]): List of attribute names to add to all elements. +/// expand_empty_elements (bool, optional): Whether to expand self-closing tags into open/close pairs. Defaults to true. +/// check_end_names (bool, optional): Whether to validate matching of end tags. Defaults to false. +/// watch_on_attribute (str, optional): If set, captures which attributes were added to elements with this attribute. +/// +/// Returns: +/// Tuple[str, Dict[str, List[str]]]: A tuple containing: +/// - The transformed HTML string +/// - A dictionary mapping captured attribute values to lists of attributes that were added +/// to those elements. Only returned if watch_on_attribute is set, otherwise empty dict. +/// +/// Example: +/// >>> html = '

Hello

' +/// >>> html, captured = transform_html(html, ['data-root-id'], ['data-v-123'], watch_on_attribute='data-id') +/// >>> print(captured) +/// {'123': ['data-root-id', 'data-v-123']} +/// +/// Raises: +/// ValueError: If the HTML is malformed or cannot be parsed. +#[pyfunction] +#[pyo3( + text_signature = "(html, root_attributes, all_attributes, *, expand_empty_elements=True, check_end_names=False, watch_on_attribute=None)" +)] +pub fn transform_html( + py: Python, + html: &str, + root_attributes: Vec, + all_attributes: Vec, + expand_empty_elements: Option, + check_end_names: Option, + watch_on_attribute: Option, +) -> PyResult { + let config = HtmlTransformerConfig::new( + root_attributes, + all_attributes, + expand_empty_elements.unwrap_or(true), + check_end_names.unwrap_or(false), + watch_on_attribute, + ); + + match transform(&config, html) { + Ok((html, captured)) => { + // Convert captured attributes to a Python dictionary + let captured_dict = PyDict::new(py); + for (id, attrs) in captured { + captured_dict.set_item(id, attrs)?; + } + + let result = PyTuple::new(py, &[html.into_py(py), captured_dict.into_py(py)]); + Ok(result.into()) + } + Err(e) => Err(PyValueError::new_err(e.to_string())), + } +} + +/// Configuration for HTML transformation +pub struct HtmlTransformerConfig { + root_attributes: Vec, + all_attributes: Vec, + void_elements: HashSet, + expand_empty_elements: bool, + check_end_names: bool, + watch_on_attribute: Option, +} + +impl HtmlTransformerConfig { + pub fn new( + root_attributes: Vec, + all_attributes: Vec, + expand_empty_elements: bool, + check_end_names: bool, + watch_on_attribute: Option, + ) -> Self { + let void_elements = VOID_ELEMENTS.iter().map(|&s| s.to_string()).collect(); + + HtmlTransformerConfig { + root_attributes, + all_attributes, + void_elements, + expand_empty_elements, + check_end_names, + watch_on_attribute, + } + } +} + +/// Add attributes to a HTML start tag (e.g. `
`) based on the configuration +fn add_attributes( + config: &HtmlTransformerConfig, + element: &mut BytesStart, + is_root: bool, + captured_attributes: &mut Vec<(String, Vec)>, +) { + let mut added_attrs = Vec::new(); + + // Add root attributes if this is a root element + if is_root { + for attr in &config.root_attributes { + element.push_attribute((attr.as_str(), "")); + added_attrs.push(attr.clone()); + } + } + + // Add attributes that should be applied to all elements + for attr in &config.all_attributes { + element.push_attribute((attr.as_str(), "")); + added_attrs.push(attr.clone()); + } + + // If we're watching for a specific attribute, check if this element has it + if let Some(watch_attr) = &config.watch_on_attribute { + if let Some(attr_value) = element + .attributes() + .find(|a| { + if let Ok(attr) = a { + String::from_utf8_lossy(attr.key.as_ref()) == *watch_attr + } else { + false + } + }) + .and_then(|a| a.ok()) + .map(|a| String::from_utf8_lossy(a.value.as_ref()).into_owned()) + { + captured_attributes.push((attr_value, added_attrs)); + } + } +} + +/// Main entrypoint. Transform HTML by adding attributes to the elements. 
+pub fn transform( + config: &HtmlTransformerConfig, + html: &str, +) -> Result<(String, Vec<(String, Vec)>), Box> { + let mut reader = Reader::from_str(html); + let reader_config = reader.config_mut(); + reader_config.expand_empty_elements = config.expand_empty_elements; + reader_config.check_end_names = config.check_end_names; + + // We transform the HTML by reading it and writing it simultaneously + let mut writer = Writer::new(Cursor::new(Vec::new())); + let mut captured_attributes = Vec::new(); + + // Track the nesting depth of elements to identify root elements (depth == 0) + let mut depth: i32 = 0; + + // Read the HTML event by event + loop { + match reader.read_event() { + // Start tag + Ok(Event::Start(e)) => { + let tag_name = String::from_utf8_lossy(e.name().as_ref()) + .to_string() + .to_lowercase(); + let mut elem = e.into_owned(); + add_attributes(config, &mut elem, depth == 0, &mut captured_attributes); + + // For void elements, write as Empty event + if config.void_elements.contains(&tag_name) { + writer.write_event(Event::Empty(elem))?; + } else { + writer.write_event(Event::Start(elem))?; + depth += 1; + } + } + + // End tag + Ok(Event::End(e)) => { + let tag_name = String::from_utf8_lossy(e.name().as_ref()) + .to_string() + .to_lowercase(); + + // Skip end tags for void elements + if !config.void_elements.contains(&tag_name) { + writer.write_event(Event::End(e))?; + depth -= 1; + } + } + + // Empty element (AKA void or self-closing tag, e.g. `
`) + Ok(Event::Empty(e)) => { + let mut elem = e.into_owned(); + add_attributes(config, &mut elem, depth == 0, &mut captured_attributes); + writer.write_event(Event::Empty(elem))?; + } + + // End of file + Ok(Event::Eof) => break, + // Other events (e.g. comments, processing instructions, etc.) + Ok(e) => writer.write_event(e)?, + Err(e) => return Err(Box::new(e)), + } + } + + // Convert the transformed HTML to a string + let result = String::from_utf8(writer.into_inner().into_inner())?; + Ok((result, captured_attributes)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_transformation() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-all".to_string()], + true, + false, + None, + ); + + let input = "

Hello

"; + let (result, _) = transform(&config, input).unwrap(); + + assert!(result.contains("data-root")); + assert!(result.contains("data-all")); + } + + #[test] + fn test_multiple_roots() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-all".to_string()], + true, + false, + None, + ); + + let input = "
First
Second"; + let (result, _) = transform(&config, input).unwrap(); + + // Both root elements should have data-root + assert_eq!(result.matches("data-root").count(), 2); + // All elements should have data-all + assert_eq!(result.matches("data-all").count(), 2); + } + + #[test] + fn test_complex_html() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-all".to_string(), "data-v-123".to_string()], + true, + false, + None, + ); + + let input = r#" +
+
+

Hello & Welcome

+ +
+
+
+

Article 1

+

Some text with bold and emphasis

+ Test Image +
+
+
+
+

© 2024

+
+ "#; + + let (result, _) = transform(&config, input).unwrap(); + + // Check root elements have root attributes + assert!(result.contains( + r#"
"# + )); + assert!(result.contains(r#"
"#)); + + // Check nested elements have all_attributes but not root_attributes + assert!(result.contains(r#"

"#)); + assert!(result.contains(r#"

")); + } + + #[test] + fn test_html_head_with_meta() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + true, + false, + None, + ); + + let input = r#" + + + Test Page + + + "#; + + let (result, _) = transform(&config, input).unwrap(); + + // Check that it parsed successfully + assert!(result.contains(r#"Test Page"#)); + assert!(result.contains(r#"")); + assert!(!result.contains("")); + assert!(result.contains("/>")); + } + + #[test] + fn test_config_expand_empty_elements() { + // Test with expand_empty_elements = false + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + false, // Don't expand empty elements + false, + None, + ); + + let test_cases = [ + // Non-void elements should stay self-closing when expand_empty_elements is false + ( + "
", + "
" + ), + ( + "

", + "

" + ), + ( + "

", + "
" + ), + // Void elements should always be self-closing regardless of config + ( + "

", + "

" + ), + ]; + + for (input, expected) in test_cases { + let (result, _) = transform(&config, input).unwrap(); + assert_eq!(result, expected); + } + + // Compare with expand_empty_elements = true + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + true, // Expand empty elements + false, + None, + ); + + let expanded_cases = [ + ("
", "
"), + ("

", "

"), + // Void elements should still be self-closing + ( + "
", + "
", + ), + ]; + + for (input, expected) in expanded_cases { + let (result, _) = transform(&config, input).unwrap(); + assert_eq!(result, expected); + } + } + + #[test] + fn test_config_check_end_names() { + // Test with check_end_names = false (lenient mode) + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + true, + false, // Don't check end names + None, + ); + + // These should parse successfully with check_end_names = false + let lenient_cases = [ + "

Hello

", // Mismatched nesting + "
Text", // Wrong closing tag + "

Text", // Non-matching end tag + ]; + + for input in lenient_cases { + assert!(transform(&config, input).is_ok()); + } + + // Test with check_end_names = true (strict mode) + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + true, + true, // Check end names + None, + ); + + // These should fail with check_end_names = true + for input in lenient_cases { + assert!(transform(&config, input).is_err()); + } + + // But well-formed HTML should still work + let valid_input = "

Hello

"; + assert!(transform(&config, valid_input).is_ok()); + } + + #[test] + fn test_watch_attribute() { + let config = HtmlTransformerConfig::new( + vec!["data-root".to_string()], + vec!["data-v-123".to_string()], + true, + false, + Some("data-id".to_string()), + ); + + let input = r#" +
+

Regular element

+ Nested element + +
"#; + + let (result, captured) = transform(&config, input).unwrap(); + + println!("result: {}", result); + println!("captured: {:?}", captured); + + // Verify HTML transformation worked + assert!(result.contains(r#"
"#)); + assert!(result.contains(r#""#)); + assert!(result.contains(r#""#)); + + // Verify attribute capturing + assert_eq!(captured.len(), 3); + assert!(captured.iter().any(|(id, attrs)| id == "123" + && attrs.contains(&"data-root".to_string()) + && attrs.contains(&"data-v-123".to_string()))); + assert!(captured + .iter() + .any(|(id, attrs)| id == "456" && attrs.contains(&"data-v-123".to_string()))); + assert!(captured + .iter() + .any(|(id, attrs)| id == "789" && attrs.contains(&"data-v-123".to_string()))); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..c02548b --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,11 @@ +use pyo3::prelude::*; +use pyo3::types::PyModule; + +mod html_parser; + +/// A Python module implemented in Rust for high-performance HTML transformation. +#[pymodule] +fn djc_core_html_parser(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_function(wrap_pyfunction!(html_parser::transform_html, m)?)?; + Ok(()) +} diff --git a/tests/benchmark.py b/tests/benchmark.py new file mode 100644 index 0000000..fda4bda --- /dev/null +++ b/tests/benchmark.py @@ -0,0 +1,97 @@ +from statistics import mean, stdev +import time + +from djc_core_html_parser import transform_html + + +def generate_large_html(num_elements: int = 1000) -> str: + """Generate a large HTML document with various features for benchmarking.""" + elements = [] + for i in range(num_elements): + # Mix of different elements and features + if i % 5 == 0: + # Void element with multiple attributes + elements.append(f'Image {i}') + elif i % 5 == 1: + # Nested divs with attributes + elements.append( + f""" +
+
+

Content {i}

+ +
+
+ """ + ) + elif i % 5 == 2: + # Script tag with content + elements.append( + f""" + + """ + ) + elif i % 5 == 3: + # CDATA section + elements.append( + f""" + + ]]> + """ + ) + else: + # Regular element with attributes + elements.append( + f""" +
+

Heading {i}

+

Paragraph {i}

+
+ """ + ) + + return f""" + + + + Benchmark Page + + + + {''.join(elements)} + + + """ + + +# Generate test HTML +HTML_SIZE = 27_000 # Set to 11_000 for 2MB +NUM_ITER = 2 +html = generate_large_html(HTML_SIZE) +print(f"\nBenchmarking with HTML size: {len(html) // 1_000} KB") + +root_attributes = ["data-root-id"] +all_attributes = ["data-v-123"] + +# Test transform +modify_times = [] +for i in range(NUM_ITER): # Run N iterations + + start = time.perf_counter() + transform_html(html, root_attributes, all_attributes, watch_on_attribute="data-id") + modify_time = time.perf_counter() - start + modify_times.append(modify_time) + +print("\nTransform:") +print(f" Total: {sum(modify_times):.3f}s") +print(f" Min: {min(modify_times):.3f}s") +print(f" Max: {max(modify_times):.3f}s") +print(f" Avg: {mean(modify_times):.3f}s") +print(f" Std: {stdev(modify_times):.3f}s") diff --git a/tests/test_html_parser.py b/tests/test_html_parser.py new file mode 100644 index 0000000..58c1a8e --- /dev/null +++ b/tests/test_html_parser.py @@ -0,0 +1,176 @@ +# This same set of tests is also found in django-components, to ensure that +# this implementation can be replaced with the django-components' pure-python implementation + +from djc_core_html_parser import transform_html +from typing import Dict, List + + +def test_basic_transformation(): + html = "

Hello

" + result, _ = transform_html(html, ["data-root"], ["data-all"]) + expected = '

Hello

' + assert result == expected + + +def test_multiple_roots(): + html = "
First
Second" + result, _ = transform_html(html, ["data-root"], ["data-all"]) + expected = '
First
Second' + assert result == expected + + +def test_complex_html(): + html = """ +
+
+

Hello & Welcome

+ +
+
+
+

Article 1

+

Some text with bold and emphasis

+ Test Image +
+
+
+
+

© 2024

+
+ """ + + result, _ = transform_html(html, ["data-root"], ["data-all", "data-v-123"]) + expected = """ +
+
+

Hello & Welcome

+ +
+
+
+

Article 1

+

Some text with bold and emphasis

+ Test Image +
+
+
+
+

© 2024

+
+ """ # noqa: E501 + assert result == expected + + +def test_void_elements(): + test_cases = [ + ('', ''), + ('', ''), + ("


", '


'), + ('Test', 'Test'), + ] + + for input_html, expected in test_cases: + result, _ = transform_html(input_html, ["data-root"], ["data-v-123"]) + assert result == expected + + +def test_html_head_with_meta(): + html = """ + + + Test Page + + + """ + + result, _ = transform_html(html, ["data-root"], ["data-v-123"]) + expected = """ + + + Test Page + + + """ + assert result == expected + + +def test_expand_empty_elements(): + # Test with expand_empty_elements=False + test_cases = [ + # Non-void elements should stay self-closing when expand_empty_elements is false + ("
", '
'), + ("

", '

'), + ("

", '
'), + # Void elements should always be self-closing regardless of config + ("

", '

'), + ] + + for input_html, expected in test_cases: + result, _ = transform_html(input_html, ["data-root"], ["data-v-123"], expand_empty_elements=False) + assert result == expected + + # Compare with expand_empty_elements=True + expanded_cases = [ + ("
", '
'), + ("

", '

'), + # Void elements should still be self-closing + ("
", '
'), + ] + + for input_html, expected in expanded_cases: + result, _ = transform_html(input_html, ["data-root"], ["data-v-123"], expand_empty_elements=True) + assert result == expected + + +def test_watch_attribute(): + html = """ +
+

Regular element

+ Nested element + +
""" + + result: str + captured: Dict[str, List[str]] + result, captured = transform_html(html, ["data-root"], ["data-v-123"], watch_on_attribute="data-id") + expected = """ +
+

Regular element

+ Nested element + +
""" + assert result == expected + + # Verify attribute capturing + assert len(captured) == 3 + + # Root element should have both root and all attributes + assert "123" in captured + assert "data-root" in captured["123"] + assert "data-v-123" in captured["123"] + + # Non-root elements should only have all attributes + assert "456" in captured + assert captured["456"] == ["data-v-123"] + assert "789" in captured + assert captured["789"] == ["data-v-123"] + + +def test_whitespace_preservation(): + html = """
+

Hello World

+ Text with spaces +
""" + + result, _ = transform_html(html, ["data-root"], ["data-all"]) + expected = """
+

Hello World

+ Text with spaces +
""" + assert result == expected From 3b7e4354648872c989f566db140f263464d92f77 Mon Sep 17 00:00:00 2001 From: Juro Oravec Date: Thu, 23 Jan 2025 23:33:27 +0100 Subject: [PATCH 2/6] ci: create virtual env --- .github/workflows/tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0142e31..dceb6bd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -41,6 +41,9 @@ jobs: - name: Install Python dependencies run: | + # NOTE: maturin requires a virtual environment to be active + python -m venv .venv + source .venv/bin/activate python -m pip install --upgrade pip python -m pip install -r requirements-ci.txt From d7233399b0f26f39a14397db0682fa0c5c61627b Mon Sep 17 00:00:00 2001 From: Juro Oravec Date: Thu, 23 Jan 2025 23:38:45 +0100 Subject: [PATCH 3/6] refactor: source venv --- .github/workflows/tests.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index dceb6bd..9e50c68 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -48,7 +48,11 @@ jobs: python -m pip install -r requirements-ci.txt - name: Build Python package - run: maturin develop + run: | + source .venv/bin/activate + maturin develop - name: Run Python tests - run: pytest + run: | + source .venv/bin/activate + pytest From a10f49665f9f329758ee41c12994a76d023097fd Mon Sep 17 00:00:00 2001 From: Juro Oravec Date: Thu, 23 Jan 2025 23:46:50 +0100 Subject: [PATCH 4/6] refactor: ues language-specific command to load Python env --- .github/workflows/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9e50c68..161a84e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -43,16 +43,16 @@ jobs: run: | # NOTE: maturin requires a virtual environment to be active python -m venv .venv - source 
.venv/bin/activate + ${{ runner.os == 'Windows' && '.venv\Scripts\activate' || 'source .venv/bin/activate' }} python -m pip install --upgrade pip python -m pip install -r requirements-ci.txt - name: Build Python package run: | - source .venv/bin/activate + ${{ runner.os == 'Windows' && '.venv\Scripts\activate' || 'source .venv/bin/activate' }} maturin develop - name: Run Python tests run: | - source .venv/bin/activate + ${{ runner.os == 'Windows' && '.venv\Scripts\activate' || 'source .venv/bin/activate' }} pytest From 07401668a8d5492cda25e732411a7532714af8af Mon Sep 17 00:00:00 2001 From: Juro Oravec Date: Fri, 24 Jan 2025 10:33:35 +0100 Subject: [PATCH 5/6] refactor: revert the feature of expanding self-closing tags --- __init__.pyi | 2 - src/html_parser.rs | 79 +-------------------------------------- tests/test_html_parser.py | 28 -------------- 3 files changed, 1 insertion(+), 108 deletions(-) diff --git a/__init__.pyi b/__init__.pyi index 9e2b7ad..2fb21ee 100644 --- a/__init__.pyi +++ b/__init__.pyi @@ -4,7 +4,6 @@ def transform_html( html: str, root_attributes: List[str], all_attributes: List[str], - expand_empty_elements: Optional[bool] = None, check_end_names: Optional[bool] = None, watch_on_attribute: Optional[str] = None, ) -> tuple[str, Dict[str, List[str]]]: @@ -15,7 +14,6 @@ def transform_html( html (str): The HTML string to transform. Can be a fragment or full document. root_attributes (List[str]): List of attribute names to add to root elements only. all_attributes (List[str]): List of attribute names to add to all elements. - expand_empty_elements (Optional[bool]): Whether to expand self-closing tags into open/close pairs. Defaults to None. check_end_names (Optional[bool]): Whether to validate matching of end tags. Defaults to None. watch_on_attribute (Optional[str]): If set, captures which attributes were added to elements with this attribute. 
diff --git a/src/html_parser.rs b/src/html_parser.rs index 7adbd1e..7acf32d 100644 --- a/src/html_parser.rs +++ b/src/html_parser.rs @@ -20,7 +20,6 @@ const VOID_ELEMENTS: [&str; 14] = [ /// html (str): The HTML string to transform. Can be a fragment or full document. /// root_attributes (List[str]): List of attribute names to add to root elements only. /// all_attributes (List[str]): List of attribute names to add to all elements. -/// expand_empty_elements (bool, optional): Whether to expand self-closing tags into open/close pairs. Defaults to true. /// check_end_names (bool, optional): Whether to validate matching of end tags. Defaults to false. /// watch_on_attribute (str, optional): If set, captures which attributes were added to elements with this attribute. /// @@ -40,21 +39,19 @@ const VOID_ELEMENTS: [&str; 14] = [ /// ValueError: If the HTML is malformed or cannot be parsed. #[pyfunction] #[pyo3( - text_signature = "(html, root_attributes, all_attributes, *, expand_empty_elements=True, check_end_names=False, watch_on_attribute=None)" + text_signature = "(html, root_attributes, all_attributes, *, check_end_names=False, watch_on_attribute=None)" )] pub fn transform_html( py: Python, html: &str, root_attributes: Vec, all_attributes: Vec, - expand_empty_elements: Option, check_end_names: Option, watch_on_attribute: Option, ) -> PyResult { let config = HtmlTransformerConfig::new( root_attributes, all_attributes, - expand_empty_elements.unwrap_or(true), check_end_names.unwrap_or(false), watch_on_attribute, ); @@ -79,7 +76,6 @@ pub struct HtmlTransformerConfig { root_attributes: Vec, all_attributes: Vec, void_elements: HashSet, - expand_empty_elements: bool, check_end_names: bool, watch_on_attribute: Option, } @@ -88,7 +84,6 @@ impl HtmlTransformerConfig { pub fn new( root_attributes: Vec, all_attributes: Vec, - expand_empty_elements: bool, check_end_names: bool, watch_on_attribute: Option, ) -> Self { @@ -98,7 +93,6 @@ impl HtmlTransformerConfig { 
root_attributes, all_attributes, void_elements, - expand_empty_elements, check_end_names, watch_on_attribute, } @@ -154,7 +148,6 @@ pub fn transform( ) -> Result<(String, Vec<(String, Vec)>), Box> { let mut reader = Reader::from_str(html); let reader_config = reader.config_mut(); - reader_config.expand_empty_elements = config.expand_empty_elements; reader_config.check_end_names = config.check_end_names; // We transform the HTML by reading it and writing it simultaneously @@ -226,7 +219,6 @@ mod tests { let config = HtmlTransformerConfig::new( vec!["data-root".to_string()], vec!["data-all".to_string()], - true, false, None, ); @@ -243,7 +235,6 @@ mod tests { let config = HtmlTransformerConfig::new( vec!["data-root".to_string()], vec!["data-all".to_string()], - true, false, None, ); @@ -262,7 +253,6 @@ mod tests { let config = HtmlTransformerConfig::new( vec!["data-root".to_string()], vec!["data-all".to_string(), "data-v-123".to_string()], - true, false, None, ); @@ -315,7 +305,6 @@ mod tests { let config = HtmlTransformerConfig::new( vec!["data-root".to_string()], vec!["data-v-123".to_string()], - true, false, None, ); @@ -369,7 +358,6 @@ mod tests { let config = HtmlTransformerConfig::new( vec!["data-root".to_string()], vec!["data-v-123".to_string()], - true, false, None, ); @@ -395,75 +383,12 @@ mod tests { assert!(result.contains("/>")); } - #[test] - fn test_config_expand_empty_elements() { - // Test with expand_empty_elements = false - let config = HtmlTransformerConfig::new( - vec!["data-root".to_string()], - vec!["data-v-123".to_string()], - false, // Don't expand empty elements - false, - None, - ); - - let test_cases = [ - // Non-void elements should stay self-closing when expand_empty_elements is false - ( - "
", - "
" - ), - ( - "

", - "

" - ), - ( - "

", - "
" - ), - // Void elements should always be self-closing regardless of config - ( - "

", - "

" - ), - ]; - - for (input, expected) in test_cases { - let (result, _) = transform(&config, input).unwrap(); - assert_eq!(result, expected); - } - - // Compare with expand_empty_elements = true - let config = HtmlTransformerConfig::new( - vec!["data-root".to_string()], - vec!["data-v-123".to_string()], - true, // Expand empty elements - false, - None, - ); - - let expanded_cases = [ - ("
", "
"), - ("

", "

"), - // Void elements should still be self-closing - ( - "
", - "
", - ), - ]; - - for (input, expected) in expanded_cases { - let (result, _) = transform(&config, input).unwrap(); - assert_eq!(result, expected); - } - } - #[test] fn test_config_check_end_names() { // Test with check_end_names = false (lenient mode) let config = HtmlTransformerConfig::new( vec!["data-root".to_string()], vec!["data-v-123".to_string()], - true, false, // Don't check end names None, ); @@ -483,7 +408,6 @@ mod tests { let config = HtmlTransformerConfig::new( vec!["data-root".to_string()], vec!["data-v-123".to_string()], - true, true, // Check end names None, ); @@ -503,7 +427,6 @@ mod tests { let config = HtmlTransformerConfig::new( vec!["data-root".to_string()], vec!["data-v-123".to_string()], - true, false, Some("data-id".to_string()), ); diff --git a/tests/test_html_parser.py b/tests/test_html_parser.py index 58c1a8e..42ad632 100644 --- a/tests/test_html_parser.py +++ b/tests/test_html_parser.py @@ -100,34 +100,6 @@ def test_html_head_with_meta(): assert result == expected -def test_expand_empty_elements(): - # Test with expand_empty_elements=False - test_cases = [ - # Non-void elements should stay self-closing when expand_empty_elements is false - ("
", '
'), - ("

", '

'), - ("

", '
'), - # Void elements should always be self-closing regardless of config - ("

", '

'), - ] - - for input_html, expected in test_cases: - result, _ = transform_html(input_html, ["data-root"], ["data-v-123"], expand_empty_elements=False) - assert result == expected - - # Compare with expand_empty_elements=True - expanded_cases = [ - ("
", '
'), - ("

", '

'), - # Void elements should still be self-closing - ("
", '
'), - ] - - for input_html, expected in expanded_cases: - result, _ = transform_html(input_html, ["data-root"], ["data-v-123"], expand_empty_elements=True) - assert result == expected - - def test_watch_attribute(): html = """
From 1499d58cb455393feb94597f11b7f6aeb6160fa1 Mon Sep 17 00:00:00 2001 From: Juro Oravec Date: Fri, 24 Jan 2025 10:41:21 +0100 Subject: [PATCH 6/6] refactor: rename transform_html to set_html_attributes --- README.md | 10 +++++----- __init__.pyi | 4 ++-- src/html_parser.rs | 4 ++-- src/lib.rs | 2 +- tests/benchmark.py | 4 ++-- tests/test_html_parser.py | 16 ++++++++-------- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d43894e..e28e25f 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,10 @@ pip install djc-core-html-parser ## Usage ```python -from djc_core_html_parser import transform_html +from djc_core_html_parser import set_html_attributes html = '

Hello

' -result, _ = transform_html( +result, _ = set_html_attributes( html, # Add attributes to the root elements root_attributes=['data-root-id'], @@ -25,7 +25,7 @@ result, _ = transform_html( ) ``` -To save ourselves from re-parsing the HTML, `transform_html` returns not just the transformed HTML, but also a dictionary as the second item. +To save ourselves from re-parsing the HTML, `set_html_attributes` returns not just the transformed HTML, but also a dictionary as the second item. This dictionary contains a record of which HTML attributes were written to which elemenents. @@ -37,7 +37,7 @@ Then, during the HTML transformation, we check each element for this attribute. 2. Record the attributes that were added to the element, using the value of the watched attribute as the key. ```python -from djc_core_html_parser import transform_html +from djc_core_html_parser import set_html_attributes html = """
@@ -47,7 +47,7 @@ html = """
""" -result, captured = transform_html( +result, captured = set_html_attributes( html, # Add attributes to the root elements root_attributes=['data-root-id'], diff --git a/__init__.pyi b/__init__.pyi index 2fb21ee..eeeff7c 100644 --- a/__init__.pyi +++ b/__init__.pyi @@ -1,6 +1,6 @@ from typing import List, Dict, Optional -def transform_html( +def set_html_attributes( html: str, root_attributes: List[str], all_attributes: List[str], @@ -25,7 +25,7 @@ def transform_html( Example: >>> html = '

Hello

' - >>> transform_html(html, ['data-root-id'], ['data-v-123']) + >>> set_html_attributes(html, ['data-root-id'], ['data-v-123']) '

Hello

' Raises: diff --git a/src/html_parser.rs b/src/html_parser.rs index 7acf32d..5bb7eef 100644 --- a/src/html_parser.rs +++ b/src/html_parser.rs @@ -31,7 +31,7 @@ const VOID_ELEMENTS: [&str; 14] = [ /// /// Example: /// >>> html = '

Hello

' -/// >>> html, captured = transform_html(html, ['data-root-id'], ['data-v-123'], watch_on_attribute='data-id') +/// >>> html, captured = set_html_attributes(html, ['data-root-id'], ['data-v-123'], watch_on_attribute='data-id') /// >>> print(captured) /// {'123': ['data-root-id', 'data-v-123']} /// @@ -41,7 +41,7 @@ const VOID_ELEMENTS: [&str; 14] = [ #[pyo3( text_signature = "(html, root_attributes, all_attributes, *, check_end_names=False, watch_on_attribute=None)" )] -pub fn transform_html( +pub fn set_html_attributes( py: Python, html: &str, root_attributes: Vec, diff --git a/src/lib.rs b/src/lib.rs index c02548b..050fa3e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,6 @@ mod html_parser; /// A Python module implemented in Rust for high-performance HTML transformation. #[pymodule] fn djc_core_html_parser(_py: Python, m: &PyModule) -> PyResult<()> { - m.add_function(wrap_pyfunction!(html_parser::transform_html, m)?)?; + m.add_function(wrap_pyfunction!(html_parser::set_html_attributes, m)?)?; Ok(()) } diff --git a/tests/benchmark.py b/tests/benchmark.py index fda4bda..1cf02b1 100644 --- a/tests/benchmark.py +++ b/tests/benchmark.py @@ -1,7 +1,7 @@ from statistics import mean, stdev import time -from djc_core_html_parser import transform_html +from djc_core_html_parser import set_html_attributes def generate_large_html(num_elements: int = 1000) -> str: @@ -85,7 +85,7 @@ def generate_large_html(num_elements: int = 1000) -> str: for i in range(NUM_ITER): # Run N iterations start = time.perf_counter() - transform_html(html, root_attributes, all_attributes, watch_on_attribute="data-id") + set_html_attributes(html, root_attributes, all_attributes, watch_on_attribute="data-id") modify_time = time.perf_counter() - start modify_times.append(modify_time) diff --git a/tests/test_html_parser.py b/tests/test_html_parser.py index 42ad632..57cf3b1 100644 --- a/tests/test_html_parser.py +++ b/tests/test_html_parser.py @@ -1,20 +1,20 @@ # This same set of tests is also 
found in django-components, to ensure that # this implementation can be replaced with the django-components' pure-python implementation -from djc_core_html_parser import transform_html +from djc_core_html_parser import set_html_attributes from typing import Dict, List def test_basic_transformation(): html = "

Hello

" - result, _ = transform_html(html, ["data-root"], ["data-all"]) + result, _ = set_html_attributes(html, ["data-root"], ["data-all"]) expected = '

Hello

' assert result == expected def test_multiple_roots(): html = "
First
Second" - result, _ = transform_html(html, ["data-root"], ["data-all"]) + result, _ = set_html_attributes(html, ["data-root"], ["data-all"]) expected = '
First
Second' assert result == expected @@ -42,7 +42,7 @@ def test_complex_html(): """ - result, _ = transform_html(html, ["data-root"], ["data-all", "data-v-123"]) + result, _ = set_html_attributes(html, ["data-root"], ["data-all", "data-v-123"]) expected = """
@@ -76,7 +76,7 @@ def test_void_elements(): ] for input_html, expected in test_cases: - result, _ = transform_html(input_html, ["data-root"], ["data-v-123"]) + result, _ = set_html_attributes(input_html, ["data-root"], ["data-v-123"]) assert result == expected @@ -89,7 +89,7 @@ def test_html_head_with_meta(): """ - result, _ = transform_html(html, ["data-root"], ["data-v-123"]) + result, _ = set_html_attributes(html, ["data-root"], ["data-v-123"]) expected = """ @@ -110,7 +110,7 @@ def test_watch_attribute(): result: str captured: Dict[str, List[str]] - result, captured = transform_html(html, ["data-root"], ["data-v-123"], watch_on_attribute="data-id") + result, captured = set_html_attributes(html, ["data-root"], ["data-v-123"], watch_on_attribute="data-id") expected = """

Regular element

@@ -140,7 +140,7 @@ def test_whitespace_preservation(): Text with spaces
""" - result, _ = transform_html(html, ["data-root"], ["data-all"]) + result, _ = set_html_attributes(html, ["data-root"], ["data-all"]) expected = """

Hello World

Text with spaces