#!/usr/bin/env python

# Crawls the Dataflow documentation and generates requirements files for various Beam/Python combinations.
# Requirements: `requests`, `beautifulsoup4`

import requests
import bs4
import re
import os


# Beam SDK versions (major.minor) for which main() writes a requirements file;
# entries for any other Beam version are skipped.
BEAM_VERSIONS = {
    "2.24",
    "2.25",
    "2.26",
    "2.27",
    "2.28",
}


def grab_sdk_worker_dependencies():
    """Yield the package tables found on the Dataflow worker-dependencies page.

    Downloads the page, then visits every ``<section>`` whose ``id`` starts
    with ``py`` followed by digits. The Beam version is taken from the last
    ``-``-separated token of the section's ``<h4>`` id; the Python version is
    taken the same way from the ``<h3>`` id of each nested ``<section>``. Both
    are truncated to ``major.minor``.

    Yields:
        dict: one per nested section, with keys ``'python_version'`` (str),
        ``'beam_version'`` (str) and ``'packages'`` (dict mapping the text of
        each row's first ``<td>`` to the text of the following ``<td>``).

    Raises:
        requests.HTTPError: if the page responds with an error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    # Without an explicit timeout `requests` waits indefinitely on a stalled
    # connection, which would hang the script.
    page = requests.get(
        "https://cloud.google.com/dataflow/docs/concepts/sdk-worker-dependencies",
        timeout=30,
    )
    page.raise_for_status()

    bs = bs4.BeautifulSoup(page.text, features='html.parser')

    for section in bs.find_all('section'):
        if not re.match(r"py\d+", section.get('id', "")):
            continue

        beam_ver = section.h4['id'].split("-")[-1]
        beam_ver = ".".join(beam_ver.split(".")[:2])  # only major_minor

        for sub_section in section.find_all('section'):
            python_version = sub_section.h3['id'].split("-")[-1]
            python_version = ".".join(python_version.split(".")[:2])  # only major_minor

            packages = {}
            for tr in sub_section.find_all('tr'):
                name_cell = tr.td
                if name_cell is None:
                    # header row (only <th> cells) or a row without data cells
                    continue
                # `next_sibling` may be a whitespace text node when the markup
                # is pretty-printed; ask explicitly for the next <td> instead.
                version_cell = name_cell.find_next_sibling('td')
                if version_cell is None:
                    # no version column in this row: nothing to record
                    continue
                # Strip surrounding whitespace so it cannot leak into the
                # generated `name==version` lines.
                package_name = name_cell.text.strip()
                version = version_cell.text.strip()
                packages[package_name] = version

            yield {
                'python_version': python_version,
                'beam_version': beam_ver,
                'packages': packages,
            }


def main():
    """Write one requirements file per selected Beam/Python combination.

    Collects everything yielded by ``grab_sdk_worker_dependencies()`` and
    prints the distinct Python and Beam versions found. Then, for every entry
    whose Beam version is in ``BEAM_VERSIONS`` and whose Python version does
    not start with ``"2"``, writes ``beam<beam>_py<python>.txt`` into the
    directory of this script. Each file holds a short comment header followed
    by one ``name==version`` line per package, sorted by package name.
    """
    all_worker_dependencies = list(grab_sdk_worker_dependencies())
    print("python versions", sorted({deps['python_version'] for deps in all_worker_dependencies}))
    print("beam versions", sorted({deps['beam_version'] for deps in all_worker_dependencies}))

    # Output files are placed in the directory that contains this script.
    work_dir = os.path.dirname(__file__)

    for worker_dependencies in all_worker_dependencies:
        beam_version = worker_dependencies['beam_version']
        if beam_version not in BEAM_VERSIONS:
            print("skip beam version", beam_version)
            continue

        python_version = worker_dependencies['python_version']
        if python_version.startswith("2"):
            # Python 2 entries are skipped without a message.
            continue

        pkgs = worker_dependencies['packages']

        out_file_name = f"beam{beam_version}_py{python_version}.txt"

        # Explicit encoding keeps the output independent of the platform locale.
        with open(os.path.join(work_dir, out_file_name), 'w', encoding='utf-8') as f:
            print(f.name)

            f.write("# list of preinstalled dependencies on Dataflow workers\n")
            f.write("# this file was generated by `grab_beam_worker_deps.py`\n")
            f.write(f"# beam sdk version: {beam_version}\n")
            f.write(f"# python version: {python_version}\n")
            f.write("\n")

            # Distinct names here: the original reused the outer loop variable.
            for package_name, package_version in sorted(pkgs.items()):
                f.write(f"{package_name}=={package_version}\n")


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()