From 14c7afaa58b12d0d0f3b011a7acde9b77bd227c7 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Fri, 2 Jan 2026 14:34:31 -0600 Subject: [PATCH] Fix GitHub CI timeout issue for link checker (#937) * Fix GitHub CI timeout issue for link checker * update problematic links --- .github/workflows/check-links.yml | 6 ++++++ .../bpe-from-scratch-simple.ipynb | 4 ++-- ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb | 4 ++-- conftest.py | 17 +++++++++++++++++ 4 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 conftest.py diff --git a/.github/workflows/check-links.yml b/.github/workflows/check-links.yml index 7ceb28f..a16c69f 100644 --- a/.github/workflows/check-links.yml +++ b/.github/workflows/check-links.yml @@ -27,12 +27,18 @@ jobs: uv add pytest-check-links - name: Check links + env: + CHECK_LINKS_TIMEOUT: "10" run: | source .venv/bin/activate pytest --check-links ./ \ --check-links-ignore "https://platform.openai.com/*" \ --check-links-ignore "https://openai.com/*" \ --check-links-ignore "https://arena.lmsys.org" \ + --check-links-ignore "https?://localhost(:\\d+)?/.*" \ + --check-links-ignore "https?://127[.]0[.]0[.]1(:\\d+)?/.*" \ + --check-links-ignore "https://mng\\.bz/.*" \ + --check-links-ignore "https://github\\.com/.*" \ --check-links-ignore "https://unsloth.ai/blog/gradient" \ --check-links-ignore "https://www.reddit.com/r/*" \ --check-links-ignore "https://code.visualstudio.com/*" \ diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch-simple.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch-simple.ipynb index c007d85..dd795f6 100644 --- a/ch02/05_bpe-from-scratch/bpe-from-scratch-simple.ipynb +++ b/ch02/05_bpe-from-scratch/bpe-from-scratch-simple.ipynb @@ -36,7 +36,7 @@ "- This is a standalone notebook implementing the popular byte pair encoding (BPE) tokenization algorithm, which is used in models like GPT-2 to GPT-4, Llama 3, etc., from scratch for educational purposes\n", "- For more details about the purpose of tokenization, please 
refer to [Chapter 2](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/01_main-chapter-code/ch02.ipynb); this code here is bonus material explaining the BPE algorithm\n", "- The original BPE tokenizer that OpenAI implemented for training the original GPT models can be found [here](https://github.com/openai/gpt-2/blob/master/src/encoder.py)\n", - "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip Gage\n", + "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](https://github.com/tpn/pdfs/blob/master/A%20New%20Algorithm%20for%20Data%20Compression%20(1994).pdf)\" by Philip Gage\n", "- Most projects, including Llama 3, nowadays use OpenAI's open-source [tiktoken library](https://github.com/openai/tiktoken) due to its computational performance; it allows loading pretrained GPT-2 and GPT-4 tokenizers, for example (the Llama 3 models were trained using the GPT-4 tokenizer as well)\n", "- The difference between the implementations above and my implementation in this notebook, besides it being is that it also includes a function for training the tokenizer (for educational purposes)\n", "- There's also an implementation called [minBPE](https://github.com/karpathy/minbpe) with training support, which is maybe more performant (my implementation here is focused on educational purposes); in contrast to `minbpe` my implementation additionally allows loading the original OpenAI tokenizer vocabulary and merges" @@ -253,7 +253,7 @@ "id": "8c0d4420-a4c7-4813-916a-06f4f46bc3f0", "metadata": {}, "source": [ - "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip Gage\n", + "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data 
Compression](https://github.com/tpn/pdfs/blob/master/A%20New%20Algorithm%20for%20Data%20Compression%20(1994).pdf)\" by Philip Gage\n", "- Before we get to the actual code implementation, the form that is used for LLM tokenizers today can be summarized as follows:" ] }, diff --git a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb index e8bce18..cad47a8 100644 --- a/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb +++ b/ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb @@ -36,7 +36,7 @@ "- This is a standalone notebook implementing the popular byte pair encoding (BPE) tokenization algorithm, which is used in models like GPT-2 to GPT-4, Llama 3, etc., from scratch for educational purposes\n", "- For more details about the purpose of tokenization, please refer to [Chapter 2](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/01_main-chapter-code/ch02.ipynb); this code here is bonus material explaining the BPE algorithm\n", "- The original BPE tokenizer that OpenAI implemented for training the original GPT models can be found [here](https://github.com/openai/gpt-2/blob/master/src/encoder.py)\n", - "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip Gage\n", + "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](https://github.com/tpn/pdfs/blob/master/A%20New%20Algorithm%20for%20Data%20Compression%20(1994).pdf)\" by Philip Gage\n", "- Most projects, including Llama 3, nowadays use OpenAI's open-source [tiktoken library](https://github.com/openai/tiktoken) due to its computational performance; it allows loading pretrained GPT-2 and GPT-4 tokenizers, for example (the Llama 3 models were trained using the GPT-4 tokenizer as well)\n", "- The difference between the implementations above and my implementation in this notebook, besides it being is that it also 
includes a function for training the tokenizer (for educational purposes)\n",
 "- There's also an implementation called [minBPE](https://github.com/karpathy/minbpe) with training support, which is maybe more performant (my implementation here is focused on educational purposes); in contrast to `minbpe` my implementation additionally allows loading the original OpenAI tokenizer vocabulary and BPE \"merges\" (additionally, Hugging Face tokenizers are also capable of training and loading various tokenizers; see [this GitHub discussion](https://github.com/rasbt/LLMs-from-scratch/discussions/485) by a reader who trained a BPE tokenizer on the Nepali language for more info)"
@@ -245,7 +245,7 @@
 "id": "8c0d4420-a4c7-4813-916a-06f4f46bc3f0",
 "metadata": {},
 "source": [
- "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](http://www.pennelynn.com/Documents/CUJ/HTML/94HTML/19940045.HTM)\" by Philip Gage\n",
+ "- The BPE algorithm was originally described in 1994: \"[A New Algorithm for Data Compression](https://github.com/tpn/pdfs/blob/master/A%20New%20Algorithm%20for%20Data%20Compression%20(1994).pdf)\" by Philip Gage\n",
 "- Before we get to the actual code implementation, the form that is used for LLM tokenizers today can be summarized as described in the following sections."
 ]
 },
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..4127f71
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,37 @@
+import functools
+import os
+import requests
+
+
+def pytest_configure(config):
+    """Apply a default HTTP timeout while link checking is enabled.
+
+    Does nothing unless ``config.option.check_links`` is truthy. Otherwise
+    ``requests.sessions.Session.request`` is wrapped so that calls made
+    without an explicit ``timeout`` use ``CHECK_LINKS_TIMEOUT`` seconds
+    (default 10). An unset, empty, malformed, or non-positive value falls
+    back to the default rather than aborting the pytest run.
+    """
+    if not getattr(config.option, "check_links", False):
+        return
+
+    default_timeout = 10.0
+    try:
+        timeout = float(os.environ.get("CHECK_LINKS_TIMEOUT", "10"))
+    except ValueError:
+        # e.g. CHECK_LINKS_TIMEOUT="" coming from an unset CI variable
+        timeout = default_timeout
+    if not 0 < timeout < float("inf"):
+        # Reject zero, negative, nan and inf; keep the documented default.
+        timeout = default_timeout
+
+    original_request = requests.sessions.Session.request
+
+    @functools.wraps(original_request)
+    def request_with_timeout(self, method, url, **kwargs):
+        # Respect a timeout the caller chose; only fill in a missing one.
+        if kwargs.get("timeout") is None:
+            kwargs["timeout"] = timeout
+        return original_request(self, method, url, **kwargs)
+
+    requests.sessions.Session.request = request_with_timeout