
2026 0411 Searchable published URLs

As of today, counting links in unpublished drafts, I have linked to 2,256 unique offsite URLs in page content.1

I know this because I made an automatic index of every URL on this site, instantly searchable by anchor text, linking page, and the URL itself.

If you’ve got a site of your own, it’s pretty cool to be able to search back through all your outgoing links. Here’s how I designed and built it.

Design

I really like live filtering as I type, but I had to iterate a bit before it worked well.

The filtering itself is super fast, but the DOM updates needed optimizing from my initial approach. At a few hundred entries, creating and deleting a DOM element for each result on every keystroke is lightning fast, but at over 2,000 entries the delay on the first few keystrokes is detestable. I was surprised at how awful a 30ms stutter felt. The current design keeps all the rows in the DOM all the time and toggles their `hidden` attribute based on the search query, which is much faster than changing the number of elements and makes for a much nicer incremental search experience.
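The gist of the toggle approach, in miniature, with plain objects standing in for DOM rows (a simplified sketch, not the actual page script — the URLs here are made up):

```javascript
// Build row objects once, up front; filtering only flips a `hidden`
// flag, analogous to setting `tr.hidden` on prebuilt <tr> elements.
const rows = ["example.com/a", "example.org/b", "example.net/ab"].map(
  (url) => ({ url, hidden: false }),
);

// Returns how many rows are visible after applying the query.
function filter(query) {
  let shown = 0;
  for (const row of rows) {
    const visible = query === "" || row.url.includes(query);
    row.hidden = !visible; // the DOM version sets tr.hidden here
    if (visible) shown++;
  }
  return shown;
}
```

In the real script the flag write becomes `tr.hidden = !visible`, so the browser only repaints visibility instead of rebuilding the table.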

In the future, if this site grew to 10x the number of URLs, I might have to change the approach to render less, or the first few keystrokes will feel slow again. Animation might help there, making it feel like it’s working “instantly” without actually rendering instantly.

I played with indexing all of this site’s own URLs too, but I can’t think of a use case for it (I already have search). It’s controlled in my code by INCLUDE_SITE_URLS, a constant that is set to false (at least as of this moment).

Implementation

Hugo builds a /urls.json file, and JavaScript on the page retrieves that file to populate the list of URLs.
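Concretely, each entry in the generated /urls.json has the shape described in the template comments below — here’s an illustrative entry (all values made up):

```json
{
  "url": "https://example.com/some-article/",
  "titles": ["a great article", "that article I keep citing"],
  "pages": [
    { "url": "https://mysite.example/blog/post-one/", "title": "Post One" }
  ]
}
```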

Hugo template to generate /urls.json
{{/*  Build a URL inventory for the site.
      -*- mode: go -*-

      For every href found in rendered page content across the site,
      records which pages link to it and what anchor text was used.

      Fields per entry:
        url     - the href, normalized to trailing slash where applicable
        titles  - unique anchor text strings used when linking to this URL
        pages   - URLs of pages on this site that contain a link to this URL

      Ignores the following sections (same as lunr index):
        - warchive
        - twarchive
*/}}
{{- $ignoreSections := slice "warchive" "twarchive" -}}
{{- $titlesScratch := newScratch -}}
{{- $pagesScratch := newScratch -}}
{{- $pageTitleScratch := newScratch -}}
{{- $urlSlice := slice -}}
{{- $baseURL := strings.TrimSuffix "/" $.Site.BaseURL -}}

{{- range $page := $.Site.RegularPages -}}
  {{- $firstSectionName := strings.TrimPrefix "/" $page.FirstSection.Path -}}
  {{- if collections.In $ignoreSections $firstSectionName -}}
    {{- continue -}}
  {{- end -}}

  {{/* Find all anchor elements in rendered HTML content.
       Pattern: <a ...href="url"...>text without nested tags</a>
       We restrict text to [^<]* so we match simple anchors reliably. */}}
  {{- $links := findRE `<a\s[^>]*href="[^"#][^"]*"[^>]*>[^<]*</a>` $page.Content -}}
  {{- range $link := $links -}}

    {{/* Extract href via string split (avoids RE2 capture group limits) */}}
    {{- $hrefParts := split $link `href="` -}}
    {{- if lt (len $hrefParts) 2 -}}{{- continue -}}{{- end -}}
    {{- $rawUrl := index (split (index $hrefParts 1) `"`) 0 -}}

    {{/* Skip empty, fragment-only, javascript:, mailto: URLs */}}
    {{- if or
        (not $rawUrl)
        (strings.HasPrefix $rawUrl "#")
        (strings.HasPrefix $rawUrl "javascript:")
        (strings.HasPrefix $rawUrl "mailto:")
        (strings.HasPrefix $rawUrl "data:")
    -}}
      {{- continue -}}
    {{- end -}}

    {{/* Resolve relative URLs to absolute */}}
    {{- $url := $rawUrl -}}
    {{- if not (or
        (strings.HasPrefix $url "http://")
        (strings.HasPrefix $url "https://")
        (strings.HasPrefix $url "//")
        (strings.HasPrefix $url "/")
    ) -}}
      {{/* Relative URL: prepend the directory of the current page */}}
      {{- $pageDir := $page.RelPermalink -}}
      {{- if strings.HasPrefix $url "./" -}}
        {{- $url = printf "%s%s" $pageDir (strings.TrimPrefix "./" $url) -}}
      {{- else if strings.HasPrefix $url "../" -}}
        {{/* Strip one path component from pageDir for each ../ */}}
        {{- $stripped := strings.TrimSuffix "/" $pageDir -}}
        {{- $stripped = strings.TrimSuffix (path.Base $stripped) $stripped -}}
        {{- $stripped = strings.TrimSuffix "/" $stripped -}}
        {{- $url = printf "%s/%s" $stripped (strings.TrimPrefix "../" $url) -}}
      {{- else -}}
        {{- $url = printf "%s%s" $pageDir $url -}}
      {{- end -}}
    {{- end -}}

    {{/* Make site-relative URLs absolute */}}
    {{- if strings.HasPrefix $url "/" -}}
      {{- $url = printf "%s%s" $baseURL $url -}}
    {{- end -}}

    {{/* Normalize: add trailing slash to page-like URLs (no file extension, no query) */}}
    {{- if and
        (not (strings.HasSuffix $url "/"))
        (not (strings.Contains $url "?"))
        (not (strings.Contains $url "#"))
    -}}
      {{/* Get the last path segment to check for a file extension */}}
      {{- $lastSeg := path.Base $url -}}
      {{- $hasExt := strings.Contains $lastSeg "." -}}
      {{- if not $hasExt -}}
        {{- $url = printf "%s/" $url -}}
      {{- end -}}
    {{- end -}}

    {{/* Strip any trailing fragment that crept in */}}
    {{- $url = index (split $url "#") 0 -}}

    {{/* Extract anchor text */}}
    {{- $text := replaceRE `<[^>]+>` "" $link | strings.TrimSpace | htmlUnescape -}}
    {{- if not $text -}}
      {{- $text = $url -}}
    {{- end -}}

    {{/* Accumulate into index */}}
    {{- $urlSlice = $urlSlice | append $url -}}
    {{- if $titlesScratch.Get $url -}}
      {{- $titlesScratch.Set $url ($titlesScratch.Get $url | append $text) -}}
    {{- else -}}
      {{- $titlesScratch.Set $url (slice $text) -}}
    {{- end -}}
    {{- if $pagesScratch.Get $url -}}
      {{- $pagesScratch.Set $url ($pagesScratch.Get $url | append $page.Permalink) -}}
    {{- else -}}
      {{- $pagesScratch.Set $url (slice $page.Permalink) -}}
    {{- end -}}
    {{- $pageTitleScratch.Set $page.Permalink ($page.Title | htmlUnescape) -}}

  {{- end -}}
{{- end -}}

{{/* Build final result.
     When a URL appears both with and without trailing slash, merge into the slash version. */}}
{{- $result := slice -}}
{{- $seenUrls := newScratch -}}
{{- range $url := ($urlSlice | uniq) -}}

  {{/* Determine the canonical (with-slash) and the alternate (without-slash) forms */}}
  {{- $canonical := "" -}}
  {{- $alternate := "" -}}
  {{- if strings.HasSuffix $url "/" -}}
    {{- $canonical = $url -}}
    {{- $alternate = strings.TrimSuffix "/" $url -}}
  {{- else -}}
    {{- $canonical = printf "%s/" $url -}}
    {{- $alternate = $url -}}
  {{- end -}}

  {{/* Skip if already processed under this canonical */}}
  {{- if $seenUrls.Get $canonical -}}
    {{- continue -}}
  {{- end -}}
  {{- $seenUrls.Set $canonical true -}}

  {{/* Merge titles from both forms */}}
  {{- $titles := slice -}}
  {{- range $t := ($titlesScratch.Get $canonical | default slice) -}}
    {{- $titles = $titles | append $t -}}
  {{- end -}}
  {{- range $t := ($titlesScratch.Get $alternate | default slice) -}}
    {{- $titles = $titles | append $t -}}
  {{- end -}}

  {{/* Merge pages from both forms, dedup by permalink, build {url,title} objects */}}
  {{- $pagePermalinks := slice -}}
  {{- range $p := ($pagesScratch.Get $canonical | default slice) -}}
    {{- $pagePermalinks = $pagePermalinks | append $p -}}
  {{- end -}}
  {{- range $p := ($pagesScratch.Get $alternate | default slice) -}}
    {{- $pagePermalinks = $pagePermalinks | append $p -}}
  {{- end -}}
  {{- $pages := slice -}}
  {{- range $p := ($pagePermalinks | uniq) -}}
    {{- $pageTitle := $pageTitleScratch.Get $p | default $p -}}
    {{- $pages = $pages | append (dict "url" $p "title" $pageTitle) -}}
  {{- end -}}

  {{- $result = $result | append (dict
    "url"    $canonical
    "titles" ($titles | uniq)
    "pages"  ($pages | uniq)
  ) -}}
{{- end -}}

{{- $jsonifyArgs := dict -}}
{{- if eq hugo.Environment "development" -}}
  {{- $jsonifyArgs = dict "indent" "  " -}}
{{- end -}}
{{- $result | jsonify $jsonifyArgs -}}
Page JavaScript
/* URL inventory page script
 * Loads /urls.json and renders a searchable table.
 *
 * Search syntax (case-insensitive, space-separated terms):
 *   foo           full-text match across url, titles, and backlinks
 *   url:foo       match only the url column
 *   title:foo     match only the titles column
 *   backlinks:foo match only the backlinks column
 *   -foo          negate any term
 *   "foo bar"     treat as a single term (allows spaces, colons, hyphens inside)
 *
 * All terms must match (AND logic). Negation applies per-term.
 */

(function () {
  "use strict";

  const URLS_JSON = "/urls.json";
  const INCLUDE_SITE_URLS = false;
  const TELEMETRY = true;
  const siteOrigin = window.location.origin;

  /* ------------------------------------------------------------------ *
   * Query parser
   * ------------------------------------------------------------------ */

  /** Split raw input into raw token strings, respecting double-quoted spans. */
  function tokenize(raw) {
    const tokens = [];
    const re = /"([^"]*)"|(\S+)/g;
    let m;
    while ((m = re.exec(raw)) !== null) {
      tokens.push(m[1] !== undefined ? m[1] : m[2]);
    }
    return tokens;
  }

  /** Parse a single raw token string into a structured token object. */
  function parseToken(tok) {
    let negated = false;
    let str = tok;

    if (str.startsWith("-") && str.length > 1) {
      negated = true;
      str = str.slice(1);
    }

    let column = null;
    const colMatch = str.match(/^(url|title|backlinks):(.+)$/i);
    if (colMatch) {
      column = colMatch[1].toLowerCase();
      str = colMatch[2];
    }

    return { negated, column, value: str.toLowerCase() };
  }

  function parseQuery(raw) {
    return tokenize(raw)
      .map(parseToken)
      .filter((t) => t.value.length > 0);
  }

  /* ------------------------------------------------------------------ *
   * Filtering
   * ------------------------------------------------------------------ */

  function entryMatches(entry, tokens) {
    for (const tok of tokens) {
      let hit;
      switch (tok.column) {
        case "url":
          hit = entry._url.includes(tok.value);
          break;
        case "title":
          hit = entry._titles.includes(tok.value);
          break;
        case "backlinks":
          hit = entry._pages.includes(tok.value);
          break;
        default:
          hit =
            entry._url.includes(tok.value) ||
            entry._titles.includes(tok.value) ||
            entry._pages.includes(tok.value);
      }
      if (tok.negated ? hit : !hit) return false;
    }
    return true;
  }

  /* ------------------------------------------------------------------ *
   * Helpers
   * ------------------------------------------------------------------ */

  const time = TELEMETRY ? console.time : () => {};
  const timeEnd = TELEMETRY ? console.timeEnd : () => {};

  function stripOrigin(url) {
    return url.startsWith(siteOrigin + "/")
      ? url.slice(siteOrigin.length)
      : url;
  }

  // Decode HTML entities (e.g. &rsquo; → '). Uses textarea which is safe from XSS.
  const _decodeEl = document.createElement("textarea");
  function decodeHtml(str) {
    _decodeEl.innerHTML = str;
    return _decodeEl.value;
  }

  function makeUl(items) {
    if (!items || items.length === 0) return document.createTextNode("");
    const ul = document.createElement("ul");
    for (const item of items) {
      const li = document.createElement("li");
      li.textContent = decodeHtml(item);
      ul.appendChild(li);
    }
    return ul;
  }

  function makeLinkedUl(items) {
    if (!items || items.length === 0) return document.createTextNode("");
    const ul = document.createElement("ul");
    for (const item of items) {
      const li = document.createElement("li");
      const a = document.createElement("a");
      a.href = item.url;
      a.textContent = decodeHtml(item.title || stripOrigin(item.url));
      li.appendChild(a);
      ul.appendChild(li);
    }
    return ul;
  }

  /* ------------------------------------------------------------------ *
   * Render — build rows once, then toggle hidden on filter
   * ------------------------------------------------------------------ */

  // Each element: { entry, tr }
  let allRows = [];

  function prepareEntry(e) {
    return {
      ...e,
      _url: e.url.toLowerCase(),
      _titles: e.titles.join("\n").toLowerCase(),
      _pages: e.pages
        .map((p) => p.url + "\n" + p.title)
        .join("\n")
        .toLowerCase(),
    };
  }

  function buildRows(entries) {
    const tbody = document.getElementById("urls-tbody");
    const frag = document.createDocumentFragment();
    allRows = entries.map((entry) => {
      const tr = document.createElement("tr");

      tr.appendChild(
        (() => {
          const td = document.createElement("td");
          const a = document.createElement("a");
          a.href = entry.url;
          a.textContent = stripOrigin(entry.url);
          td.appendChild(a);
          return td;
        })(),
      );

      const tdTitles = document.createElement("td");
      tdTitles.appendChild(makeUl(entry.titles));
      tr.appendChild(tdTitles);

      const tdPages = document.createElement("td");
      tdPages.appendChild(makeLinkedUl(entry.pages));
      tr.appendChild(tdPages);

      frag.appendChild(tr);
      return { entry, tr };
    });
    tbody.appendChild(frag);
  }

  function filter(query) {
    time("filter");

    time("filter:parse");
    const tokens = parseQuery(query || "");
    timeEnd("filter:parse");

    const countEl = document.getElementById("urls-count");
    let shown = 0;

    time("filter:dom");
    for (const { entry, tr } of allRows) {
      const visible = tokens.length === 0 || entryMatches(entry, tokens);
      tr.hidden = !visible;
      if (visible) shown++;
    }
    timeEnd("filter:dom");

    countEl.textContent = `${shown} of ${allRows.length} shown`;

    timeEnd("filter");
  }

  /* ------------------------------------------------------------------ *
   * Init
   * ------------------------------------------------------------------ */

  function init() {
    const searchEl = document.getElementById("urls-search");
    const countEl = document.getElementById("urls-count");

    countEl.textContent = "Loading…";

    searchEl.focus();

    fetch(URLS_JSON)
      .then((r) => {
        if (!r.ok) throw new Error(`HTTP ${r.status}`);
        return r.json();
      })
      .then((data) => {
        if (!INCLUDE_SITE_URLS) {
          data = data.filter(
            (e) =>
              !e.url.startsWith(siteOrigin + "/") && !e.url.startsWith("/"),
          );
        }
        buildRows(data.map(prepareEntry));
        filter(searchEl.value);
        searchEl.addEventListener("input", () => filter(searchEl.value));
      })
      .catch((err) => {
        countEl.textContent = "Error loading " + URLS_JSON + ": " + err.message;
      });
  }

  if (document.readyState === "loading") {
    document.addEventListener("DOMContentLoaded", init);
  } else {
    init();
  }
})();

This requires setting some options in the site configuration:

Hugo config.yml
outputFormats:
  # ... snip ...
  urls:
    basename: urls
    isPlainText: true
    mediaType: application/json

outputs:
  home:
    - HTML
    - urls
    # ... snip ...

  1. This number will be lower for the public site without drafts, and also doesn’t include links generated by templates from data files, like those in Books and References. ↩︎
