Context Navigation

← Previous Changeset
Next Changeset →

Changeset 3423919

Timestamp:

12/19/2025 06:10:18 PM (3 weeks ago)

Author:

dizzysoft

Message:

Version 0.9.0

Location:

Files:

: 1 added
: 3 edited
: 5 copied

tags/0.9.0 (added)
tags/0.9.0/LICENSE (copied) (copied from crawler-record/trunk/LICENSE)
tags/0.9.0/crawler-record.php (copied) (copied from crawler-record/trunk/crawler-record.php) (41 diffs)
tags/0.9.0/languages (copied) (copied from crawler-record/trunk/languages)
tags/0.9.0/readme.txt (copied) (copied from crawler-record/trunk/readme.txt) (2 diffs)
tags/0.9.0/uninstall.php (copied) (copied from crawler-record/trunk/uninstall.php) (1 diff)
trunk/crawler-record.php (modified) (41 diffs)
trunk/readme.txt (modified) (2 diffs)
trunk/uninstall.php (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

crawler-record/tags/0.9.0/crawler-record.php

-                      r3400651
+                      r3423919
  * Plugin URI:  https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/
  * Description: Are the most common search and chat (AI/LLM) bots able to access the pages on your website? Crawler Record can tell you the last time each of the most common search/chat bots visited -and which pages at which they looked.
  * Version:     0.8.0
+ * Version:     0.9.0
  * Requires at least: 6.0
  * Tested up to: 6.8
+ * Tested up to: 6.9
  * Requires PHP: 7.4
  * Author:      dizzysoft
 …
+}
+define( 'CRAWLER_RECORD_VERSION', '0.8.0' );
+define( 'CRAWLER_RECORD_VERSION', '0.9.0' );
+define( 'DZCR_ADMIN_SLUG', 'dzcr-crawler-record' );
 class Crawler_Record {
+    /**
+     * Holds a list of all bot groups and their UA patterns.
+     *
+     * Why:  The agent list is small, stable, and rarely changes; storing
+     *       it in code removes the need for a custom database table
+     *       and keeps migration simple.  The `apply_filters()` wrapper
+     *       lets developers add or replace bots without touching this file.
+     *
+     * @var array
+     */
+    // https://www.searchenginejournal.com/ai-crawler-user-agents-list/558130/
     private $dzcr_default_agent_groups = [
         'Google' => [
 …
         ],
+        'Meta' => [
+            'doc'   => 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/',
+            'items' => [
+                'meta_externalagent' => [
+                    'label'   => 'Meta-ExternalAgent',
+                    'pattern' => '#meta-externalagent/\d+(?:\.\d+)?#i',
+                ],
+                'meta_webindexer' => [
+                    'label'   => 'Meta-WebIndexer',
+                    'pattern' => '#meta-webindexer/\d+(?:\.\d+)?#i',
+                ],
+            ],
+        ],
+        'Apple' => [
+            'doc'   => 'https://support.apple.com/en-us/119829',
+            'items' => [
+                'applebot' => [
+                    'label'   => 'Applebot',
+                    'pattern' => '#Applebot/\d+(?:\.\d+)?#i',
+                ],
+                // Robots-only (does not crawl pages)
+                'applebot_extended' => [
+                    'label'   => 'Applebot-Extended (AI)',
+                    'pattern' => '#(?!.)#',
+                ],
+            ],
+        ],
         'DuckDuckGo' => [
             'doc'   => 'https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot',
 …
     ];
-    /**
-     * Cache of the resolved, filtered agent list.
+     *
-     * Why:  Resolving the agent list once per request saves expensive work
-     *       (filtering & validation).  Stored in memory for the duration of
-     *       the request; no persistence needed.
+     *
-     * @var array|null
-     */
     private $dzcr_agents_effective_cache = null;
-    /**
-     * Cache of the flat list of UA patterns keyed by the agent key.
+     *
-     * Why:  Many look-ups need a simple key > regex map; building it once avoids
-     *       iterating through the nested agent group structure each time.
+     *
-     * @var array|null
-     */
     private $dzcr_agents_flat_cache      = null;
-    /**
-     * Prefix used for per-post Meta keys that store the last timestamp a bot
-     * visited the post.
+     *
-     * Why:  Namespacing the meta key prevents collisions with other plugins.
-     *       The prefix is combined with the unique agent key.
+     *
-     * @var string
-     */
     private $dzcr_meta_prefix     = '_dzcr_last_seen_';
-    /**
-     * Prefix used for site-wide last-seen Meta keys (stored as options).
+     *
-     * Why:  Allows quick access to the most recent time *any* page was seen by
-     *       each bot without scanning the entire database.
+     *
-     * @var string
-     */
     private $dzcr_site_prefix     = '_dzcr_site_last_';
-    /**
-     * Prefix used for per-URL last-seen keys (stored as options).
+     *
-     * Why:  URLs are not tied to a post, so options keep arbitrary
-     *       key/value pairs without cluttering postmeta.
+     *
-     * @var string
-     */
     private $dzcr_url_prefix      = '_dzcr_url_last_';
-    /**
-     * Prefix used for the last-post-ID per bot (stored as options).
+     *
-     * Why:  This is a fast index: if the last post ID is known we can fetch
-     *       its meta immediately, avoiding a full table scan.
+     *
-     * @var string
-     */
     private $dzcr_lastpost_prefix = '_dzcr_last_post_';
-    /**
-     * Class constructor.
+     *
-     * Why:  Hooks the core functionality into WordPress:
-     *   - `template_redirect` records when a crawler requests a page.
-     *   - `add_meta_boxes` adds the per-post "Crawler Record" box.
-     *   - `admin_bar_menu` inserts a quick toolbar entry for logged-in users.
-     *   - `admin_menu` registers the settings page in the admin sidebar.
-     *   - `plugin_action_links_*` adds Settings/Documentation links on the
-     *     Plugins screen.
+     *
-     * This is the entry point of the plugin – everything else is called
-     * indirectly via these actions.
-     */
     public function __construct() {
         add_action( 'template_redirect', [ $this, 'maybe_record_last_seen' ] );
 …
     public function enqueue_admin_assets() {
         $screen = function_exists('get_current_screen') ? get_current_screen() : null;
         if ( ! $screen || $screen->id !== 'toplevel_page_crawler-record' ) {
+        if ( ! $screen || $screen->id !== 'toplevel_page_' . DZCR_ADMIN_SLUG ) {
             return; // only on the plugin admin page
+        }
 …
             'https://player.vimeo.com/api/player.js',
             [],
             $ver,
+            CRAWLER_RECORD_VERSION,
             true
         );
 …
+    }
+    /**
+     * Retrieves the effective agent groups after filters, validation,
+     * and custom ordering have been applied.
+     *
+     * Why:  The default list is processed once per request.  This method
+     *       returns a validated array that can be cached for all other
+     *       look‑ups (`get_agent_groups()`).  The validation step removes
+     *       malformed entries and guarantees the presence of a `doc`
+     *       URL and a `label` for each item.
+     *
+     * @return array The resolved, validated agent groups.
+    /**
+     * Lazily resolves the complete agent‑group tree, applying the `dzcr_agent_groups`
+     * and `dzcr_agents_order` filters, and caches the result for the lifetime of the request.
+     *
+     * The returned array maps group names to an array containing a `doc` URL and a list of
+     * individual bot definitions (`label` and `pattern`).  The data is filtered so that
+     * malformed entries (missing patterns or labels) are removed automatically.
+     *
+     * @return array<string,array<string,mixed>> The fully‑validated and ordered agent groups.
      */
     private function get_agent_groups() {
 …
     /**
+     * Returns a flattened map of bot keys to regex pattern strings.
+     *
+     * Why:  The recording logic (`maybe_record_last_seen`) needs a
+     *       simple `foreach ($agent_key => $pattern)` loop.  Flattening
+     *       removes the overhead of walking the nested group structure
+     *       for every request.
+     *
+     * @return array Key → regex string.
+     * Returns a flattened map of every bot key to its compiled regular‑expression pattern.
+     *
+     * The function iterates over the effective agent groups obtained via {@see get_agent_groups()}
+     * and builds a key => pattern map.  The result is cached for the current request.
+     *
+     * @return array<string,string> Bot key → regex pattern.
      */
     private function get_agents_flat() {
 …
     /**
+     * Looks up the human‑readable label for a bot key.
+     *
+     * Why:  The UI shows the label, not the internal key.  The helper
+     *       centralises the lookup and provides a default in case the
+     *       key is unknown.
+     *
+     * @param string $key Internal bot identifier.
+     * @return string Human‑readable label.
+     * Looks up a human‑readable label for a bot key in the current agent‑group configuration.
+     *
+     * If the key cannot be found, its raw value is returned as a fallback.  This helper is
+     * used to keep UI code terse while still presenting friendly names to site editors.
+     *
+     * @param string $key The internal identifier for a bot (e.g. 'googlebot_desktop').
+     * @return string The user‑friendly label.
      */
     private function get_label_for_key( $key ) {
 …
     /**
+     * Builds the canonical URL of the current front‑end request.
+     *
+     * Why:  Recorded timestamps are stored per‑URL, so we need an exact,
+     *       normalised URL.  The function normalises the scheme
+     *       (`http/https`), host, path, and applies the
+     *       `dzcr_normalize_url` filter (useful for stripping tracking
+     *       parameters or converting to a canonical form).
+     *
+     * @return string Full URL (or empty string in admin/invalid state).
+     */
+    private function current_url() {
+        if ( is_admin() ) {
+     * Generates the canonical, normalized permalink for the queried singular post.
+     *
+     * The function guarantees the returned URL does **not** contain a query string,
+     * a fragment identifier, or an empty path.  The result is suitable for storage
+     * in post meta and for robots‑txt matching.
+     *
+     * @param int $post_id By reference, receives the resolved post ID or 0 if no singular.
+     * @return string The sanitized absolute URL, or an empty string when no query can be resolved.
+     */
+    private function current_url_for_post( &$post_id = 0 ) {
+        if ( is_admin() || ! is_singular() ) {
             return '';
+        }
+        if ( ! isset( $_SERVER['HTTP_HOST'], $_SERVER['REQUEST_URI'] ) ) {
+        $post_id = (int) get_queried_object_id();
+        if ( ! $post_id ) {
             return '';
+        }
+        $scheme = ( ! empty( $_SERVER['HTTPS'] ) && 'off' !== $_SERVER['HTTPS'] ) ? 'https' : 'http';
+        $host   = sanitize_text_field( wp_unslash( $_SERVER['HTTP_HOST'] ) );
+        $uri    = esc_url_raw( wp_unslash( $_SERVER['REQUEST_URI'] ) );
+        if ( '' === $host ) {
+        $url = get_permalink( $post_id );
+        if ( ! $url ) {
             return '';
+        }
+        $url = $scheme . '://' . $host . $uri;
+        $url = apply_filters( 'dzcr_normalize_url', $url );
+        if ( strlen( $url ) > 2048 ) {
+            $url = substr( $url, 0, 2048 );
+        }
+        return $url;
+    }
+    /**
+     * Records the last “seen” timestamp when a crawler requests a page.
+     *
+     * Why:  This method is the heart of the plugin.  It is called on
+     *       `template_redirect` and performs the following actions:
+     *
+     *   1. Skips previews, feeds, REST, admin, AJAX, and cron requests.
+     *   2. Only processes GET/HEAD requests, which are the standard
+     *      HTTP verbs used by crawlers.
+     *   3. Trims and sanitises the UA string (max 512 chars) and
+     *      normalises it to avoid runaway regexes.
+     *   4. Throttles writes using a 10‑minute window (configurable via
+     *      `dzcr_throttle_window`) to avoid spamming the DB when the same
+     *      bot visits the page repeatedly.
+     *   5. Stores three pieces of information:
+     *        - Per‑post meta (`_dzcr_last_seen_[bot]`).
+     *        - Per‑URL options (`_dzcr_url_last_[bot]_[hash]`) plus companion
+     *          `_url_` and `_ua_` options.
+     *        - Site‑wide option (`_dzcr_site_last_[bot]`).
+     *
+     * The method also updates the “last UA string” meta/option so the
+     * UI can show exactly what the bot sent.
+     *
+        // Normalize: strip query string and fragment
+        $parts = wp_parse_url( $url );
+        if ( empty( $parts['host'] ) || empty( $parts['path'] ) ) {
+            return '';
+        }
+        $scheme = $parts['scheme'] ?? 'https';
+        return $scheme . '://' . $parts['host'] . $parts['path'];
+    }
+    /**
+     * Core routine that records bot visit timestamps when the current request matches a known crawler.
+     *
+     * The method is executed early in the front‑end rendering cycle (`template_redirect`).
+     * 1. It exits immediately for admin screens, REST requests, feeds, `HEAD` requests, or
+     *    when the `User‑Agent` header is missing or too long.
+     * 2. The UA string is matched against the compiled regex list.  Only the first matching
+     *    rule is considered.
+     * 3. A per‑post meta key and a global option are updated if the throttle window
+     *    (10 minutes by default) allows it.  The method also records the exact UA string
+     *    that triggered the update so editors can inspect it later.
+     *
+     * @param int $post_id Optional post ID when the query resolves to a single post.  Passed by value.
      * @return void
+     * @throws RuntimeException If the PHP `set_transient` failures occur (unlikely, but reported).
      */
     public function maybe_record_last_seen() {
 …
         $now     = microtime( true );
+        $post_id = is_singular() ? (int) get_queried_object_id() : 0;
         $url     = $this->current_url();
         $urlhash = $url ? md5( $url ) : '';
+        $post_id = 0;
+        $url     = $this->current_url_for_post( $post_id );
         $throttle = (int) apply_filters( 'dzcr_throttle_window', 10 * MINUTE_IN_SECONDS );
 …
+            }
+            // Per URL
+            if ( $urlhash ) {
+                $opt_key     = $this->dzcr_url_prefix . $key . '_' . $urlhash;
+                $opt_url_key = $this->dzcr_url_prefix . $key . '_url_' . $urlhash;
+                $opt_ua_key  = $this->dzcr_url_prefix . $key . '_ua_'  . $urlhash; // store last UA for this URL+agent
+                $t_url       = 'dzcr_seen_url_' . $key . '_' . $urlhash;
+                if ( ! $throttle || ! get_transient( $t_url ) ) {
+                    $prev = (float) get_option( $opt_key, 0 );
+                    if ( $now > $prev ) {
+                        update_option( $opt_key, (string) $now, false );
+                        update_option( $opt_url_key, $url, false );
+                        update_option( $opt_ua_key,  $ua,  false );
+                    }
+                    if ( $throttle ) {
+                        set_transient( $t_url, 1, $throttle );
+                    }
+            // Site-wide latest real post/page
+            if ( $post_id && $url ) {
+                $latest_key = 'dzcr_latest_url_' . $key;
+                $prev       = get_option( $latest_key, [] );
+                $prev_ts    = isset( $prev['ts'] ) ? (float) $prev['ts'] : 0.0;
+                if ( $now > $prev_ts ) {
+                    update_option(
+                        $latest_key,
+                        [
+                            'ts'  => (string) $now,
+                            'url' => $url,
+                            'ua'  => $ua,
+                        ],
+                        false
+                    );
+                }
+            }
 …
     /**
+     * Safely matches a UA string against a stored pattern or literal value.
+     *
+     * Why:  Direct use of `preg_match()` on untrusted user‑agent strings
+     *       can trigger PHP warnings/errors (e.g. malformed regex).
+     *       This wrapper:
+     *   - Ensures the pattern begins with a `#` and ends with `#i`.
+     *   - Suppresses `preg_match()` errors with the `@` operator.
+     *   - Validates that the regex is syntactically correct.
+     *   - Falls back to literal `hash_equals()` comparison when a plain string
+     *     is supplied.
+     *
+     * @param string $ua             The UA string from the request.
+     * @param string $pattern_or_exact  Either a regex string or a literal.
+     * @return bool True if the UA matches the pattern or string.
+     * Safely tests whether a given User‑Agent string matches a pattern.
+     *
+     * Handles both plain strings (exact comparison) and regular‑expressions by inspecting
+     * the first character.  The function avoids the costly `preg_match` errors through
+     * the `@` error‑control operator and checks the regex error code.
+     *
+     * @param string $ua The User‑Agent string to test.
+     * @param string $pattern_or_exact Either a regex (enclosed in `#...#i`) or a literal string.
+     * @return bool True if the UA satisfies the pattern; otherwise false.
      */
     private function ua_matches_safe( $ua, $pattern_or_exact ) {
 …
     /**
+     * Registers the “Crawler Record” meta‑box that appears on the post
+     * editing screen.
+     *
+     * Why:  Post editors need instant insight into which crawlers have
+     *       visited the post and their access status.  The meta‑box
+     *       appears in the “normal” context with high priority,
+     *       making it highly visible without adding extra menu items.
+     * Registers the “Crawler Record” meta‑box on the post editing screen.
+     *
+     * The meta‑box is added to the *Advanced* context so that it does not clutter the editor
+     * for non‑technical users.  It is only visible to capable editors (users with
+     * `edit_post` capability).
+     *
      */
     public function register_meta_box() {
 …
     /**
+     * Renders the per‑post “Crawler Record” meta‑box.
+     *
+     * Why:  The box shows a table of all bots, each with:
+     *   - A human‑readable label.
+     *   - The last time the bot accessed the post (`Last Seen`).
+     *   - A status badge that tells you whether the URL is allowed by
+     *     `robots.txt`.
+     *   - A collapsible section that reveals the exact UA string
+     *     the bot used when it visited.
+     *
+     * The function also prints a warning if the site is globally
+     * blocking all crawlers (WordPress “Discourage search‑engine” setting).
+     *
+     * @param WP_Post $post Current post object.
+     * Renders the contents of the “Crawler Record” meta‑box.
+     *
+     * Generates a table that lists every tracked agent, its last‑seen timestamp, and a
+     * robots‑txt allow/blocked status.  The UI includes a collapsible `details` element
+     * that shows the exact User‑Agent string used for the most recent hit.
+     *
+     * @param WP_Post $post The post object currently being edited.
      * @return void
      */
 …
                 'diag'   => $diag,
                 'ua'     => $last_ua,
                 'na'     => ( 'google_extended' === $key ), // treat Google-Extended as non-crawler
+                'na' => $this->is_robots_only_agent( $key ),
             ];
+        }
 …
     /**
+     * Adds a “Crawler Record” entry to the front‑end admin bar.
+     *
+     * Why:  Site admins often browse the front‑end while logged in.
+     *       The toolbar provides a one‑click shortcut to the
+     *       detailed bot‑status page and shows a quick list of
+     *       timestamps for each bot, whether looking at a single page,
+     *       a specific URL, or the entire site.
+     *
+     * @param WP_Admin_Bar $wp_admin_bar Admin‑bar object.
+     * Adds a “Crawler Record” node to the front‑end admin bar with context‑aware children.
+     *
+     * The node is only shown to logged‑in users viewing the front‑end (not the admin).  Depending
+     * on whether the current page is singular or generic, the children list will display
+     * per‑post or site‑wide last‑seen data for each crawler.
+     *
+     * @param WP_Admin_Bar $wp_admin_bar The admin bar instance.
      * @return void
      */
 …
+        }
         $admin_page_url = admin_url( 'admin.php?page=cls-crawler-record' );
+        $admin_page_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG );
         // Parent link
 …
                 foreach ( $wrap['items'] as $key => $_def ) {
                     if ( 'google_extended' === $key ) {
                         continue; // do not show Google-Extended in admin bar
+                    if ( $this->is_robots_only_agent( $key ) ) {
+                        continue;
+                    }
                     $label = $this->get_label_for_key( $key );
 …
     /**
+     * Registers the plugin’s top‑level admin page.
+     *
+     * Why:  The settings page is a convenient place to:
+     *   - See all bots and their global timestamps.
+     *   - Inspect the last page each bot has seen.
+     *   - View robots.txt diagnostics for each bot.
+     * Registers the top‑level “Crawler Record” admin submenu page.
+     *
+     * The submenu is created under the **Settings** root so that it is consistently
+     * reachable regardless of the user role or the presence of other plugins.
+     *
      * @return void
 …
             __( 'Crawler Record', 'crawler-record' ),
             'manage_options',
             'cls-crawler-record',
+            DZCR_ADMIN_SLUG,
             [ $this, 'render_admin_page' ],
             'dashicons-search',
 …
     /**
+     * Renders the entire admin page.
+     *
+     * Why:  The page shows a per‑bot table containing:
+     *   - The bot’s human label.
+     *   - The most recent global timestamp.
+     *   - The link to the most recent page the bot visited.
+     *   - A status badge indicating robots.txt permissibility.
+     *
+     * The table also has a useful “Important” notice if
+     * WordPress is blocking all crawlers.
+     * Renders the public dashboard for all tracked agents.
+     *
+     * The page lists each bot’s last‑seen timestamp, the last URL it visited, and the
+     * result of a robots‑txt evaluation for the site’s home path.  A small embedded video
+     * tutorial is shown in the right‑hand column.
+     *
      * @return void
 …
                     : $this->robots_status_for_agent_path( $key, $home_path, $diag );
                 $show_last_fields_as_na = ( 'google_extended' === $key );
+                $show_last_fields_as_na = $this->is_robots_only_agent( $key );
                 // Agent cell (expandable UA string unless NA)
 …
     /**
+     * Checks if the site is configured to “Discourage search engines”.
+     *
+     * Why:  If WordPress’s global setting is enabled, all crawlers are
+     *       blocked regardless of `robots.txt`.  The function lets us
+     *       quickly short‑circuit further checks.
+     *
+     * @return bool True if the site is discouraging crawlers.
+     * Determines if WordPress is currently discouraging search engines (`blog_public` = 0).
+     *
+     * The helper is used to short‑circuit logic that would otherwise record activity,
+     * and to display a warning banner on admin screens.
+     *
+     * @return bool True when site‑wide crawling is disabled.
      */
     private function is_site_discouraged() {
 …
     /**
+     * Returns a status array for the “blocked by WordPress” case.
+     *
+     * Why:  Used when `is_site_discouraged()` is true; the admin UI
+     *       needs a consistent structure (`state` & `reason`) for
+     *       rendering the status badge.
+     *
+     * @return array `state` (blocked) and a human‑readable reason.
+     * Builds a standard “blocked by WordPress setting” status array.
+     *
+     * Returned from {@see robots_status_for_agent_path()} when the site is
+     * configured to discourage search engines.
+     *
+     * @return array{state:string,reason:string} Blocking information.
      */
     private function forced_block_status() {
 …
     /**
+     * Maps an internal bot key to the list of UA strings that the bot
+     * uses (used to choose the “most representative” UA for robots.txt
+     * checks).
+     *
+     * Why:  Different bots have distinct patterns for desktop/mobile,
+     *       legacy, and AI‑bot forms.  This helper returns the set of
+     *       tokens that must be matched against the `robots.txt` filter
+     *       logic.
+     *
+     * @param string $key Internal bot key.
+     * @return array List of tokens.
+     * Maps an internal bot key to the canonical User‑Agent string(s) that can appear
+     * in the site’s robots.txt.
+     *
+     * The mapping is used for robots‑txt group selection.  When a key maps to
+     * multiple tokens it is returned as an array of strings; otherwise a single‑element array.
+     *
+     * @param string $key The internal identifier for a bot.
+     * @return array<string> The list of UA strings to check in robots.txt.
      */
     private function robots_tokens_for_key(string $key): array {
 …
             case 'perplexity_user':      return ['Perplexity-User']; // normalize Unicode hyphens
+            // Meta
+            case 'meta_externalagent':
+                return ['meta-externalagent'];
+            case 'meta_webindexer':
+                return ['Meta-WebIndexer'];
+            // Apple
+            case 'applebot':
+                return ['Applebot'];
+            case 'applebot_extended':
+                return ['Applebot-Extended'];
             // DuckDuckGo
             case 'duckduckgo_search':    return ['DuckDuckBot'];
 …
     /**
+     * Chooses the appropriate `User‑Agent` group in robots.txt for
+     * a given bot.
+     *
+     * Why:  A robot may have multiple `User‑Agent` lines (desktop/mobile).
+     *       The longest exact match wins; if none match we fallback to
+     *       the wildcard group (`*`).  This function returns the entire
+     *       group structure (agents + rules) for that bot.
+     *
+     * @param string $robots_txt Raw text of robots.txt.
+     * @param array  $tokens     List of tokens that identify the bot.
+     * @return array Group with `agents` and `rules` keys.
+     * Returns true for bots that only check robots.txt and never fetch pages.
+     *
+     * For these bots the per‑post “last‑seen” columns are marked “N/A” because it is
+     * impossible to record an actual page visit.
+     *
+     * @param string $key The internal bot identifier.
+     * @return bool True when the bot is robots‑only.
+     */
+    private function is_robots_only_agent( string $key ): bool {
+        return in_array(
+            $key,
+            [
+                'google_extended',
+                'applebot_extended',
+            ],
+            true
+        );
+    }
+    /**
+     * Chooses the most specific robots.txt group that matches the requested bot tokens.
+     *
+     * The grouping algorithm follows the order of rules in the file: a group that
+     * lists a specific User‑Agent token and the longest matching rule wins.
+     * If no group matches, the first wildcard `*` group is returned (if present).
+     *
+     * @param string $robots_txt Raw robots.txt contents.
+     * @param array<string> $tokens List of canonical UA strings for the bot.
+     * @return array{agents:list<string>,rules:list<mixed>} The selected group’s agents and rules.
      */
     private function select_robots_group(string $robots_txt, array $tokens) {
 …
     /**
+     * Checks whether a path starts with the rule prefix.
+     *
+     * Why:  `robots.txt` rules are prefix based; e.g. `Disallow: /wp-admin`
+     *       blocks everything under `/wp-admin`.  A simple `strncmp()`
+     *       suffices because we already normalised the rule to start with
+     *       a slash and have removed any trailing `$` terminator.
+     *
+     * @param string $path Path from the request.
+     * @param string $rule Rule from robots.txt.
+     * @return bool True if the path matches the rule.
+     * Tests whether the supplied robots rule path is a strict prefix of $path.
+     *
+     * The helper is used by {@see robots_txt_allows()} when applying the longest‑prefix
+     * rule logic.
+     *
+     * @param string $path The request path.
+     * @param string $rule The rule path defined in robots.txt.
+     * @return bool True when $rule is a prefix of $path.
      */
     private function path_prefix_match(string $path, string $rule): bool {
 …
     /**
+     * Normalises hyphen characters in a string; replaces various Unicode
+     * hyphens with the ASCII hyphen.
+     *
+     * Why:  Some bots (e.g., Perplexity) use non‑standard hyphens in
+     *       their UA strings.  Normalising them guarantees that
+     *       string‑based matching (e.g. `strcasecmp()`) works
+     *       regardless of the particular hyphen glyph.
+     * Normalises a URL path so that an empty string becomes “/”.
+     *
+     * An empty path can cause subtle bugs in robots‑txt evaluation and is treated
+     * as the root path by WordPress.
+     *
+     * @param string $p The original path.
+     * @return string Normalised path.
+     */
+    private function normalize_path(string $p): string {
+        return ($p === '') ? '/' : $p;
+    }
+    /**
+     * Replaces every form of Unicode hyphen (–, ‑, －, …) with a simple ASCII hyphen.
+     *
+     * The rule set is chosen to avoid false mismatches when a site’s robots.txt
+     * contains variant hyphens in its UA tokens.
+     *
      * @param string $s Input string.
      * @return string Normalised string.
      */
-    private function normalize_path(string $p): string {
-        return ($p === '') ? '/' : $p;
+    }
-    /**
-     * Normalises hyphen characters in a string; replaces various Unicode
-     * hyphens with the ASCII hyphen.
+     *
-     * Why:  Some bots (e.g., Perplexity) use non‑standard hyphens in
-     *       their UA strings.  Normalising them guarantees that
-     *       string‑based matching (e.g. `strcasecmp()`) works
-     *       regardless of the particular hyphen glyph.
+     *
-     * @param string $s Input string.
-     * @return string Normalised string.
-     */
     private function normalize_hyphens(string $s): string {
         return preg_replace('/[\x{2010}-\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}]/u','-',$s);
 …
     /**
+     * Parses a raw `robots.txt` string into an array of groups.
+     *
+     * Why:  The plugin re‑implements the minimal parsing logic needed
+     *       for our use‑case (User‑Agent, Allow, Disallow).  The function
+     *       ignores other directives (Sitemap, Crawl‑delay) and keeps
+     *       groups and rules in the order they appear, which is essential
+     *       for the longest‑path‑wins logic.
+     *
+     * @param string $txt Raw content of robots.txt.
+     * @return array Array of groups, each with `agents` and `rules`.
+     * Parses the raw robots.txt into an array of agent‑group objects.
+     *
+     * The parser emits structures of the form:
+     *   [{'agents'=>[...], 'rules'=>[['allow'|'disallow', '/path'], ...]}, ...]
+     *  * Non‑essential lines (`Sitemap:`, `Crawl‑delay:`, comments) are ignored.
+     *  * Blank lines reset state, but a group is only flushed when a directive is encountered.
+     *
+     * @param string $txt Raw robots.txt string.
+     * @return list<array{agents:list<string>,rules:list<mixed>}> List of groups, each with `agents` and `rules`.
      */
     private function parse_robots_groups(string $txt): array {
 …
     /**
      * Retrieves the raw `robots.txt` content for the current site.
+     *
      * Why:  We avoid an HTTP request to the public `robots.txt`.
      *       WordPress can generate it via `do_robots()` or the file can
      *       be read directly.  The function falls back to the filter
      *       `robots_txt` if no file or `do_robots()` output is
      *       available.
+     *
      * @return string|null Raw robots.txt body or null if unavailable.
+     * Retrieves the active robots.txt for the current site.
+     *
+     * The method respects the order imposed by the WordPress core:
+     *   1. Physical `robots.txt` file
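+     *   2. `wp_robots()` (WP 5.7+)
+     *      NOTE(review): core `wp_robots()` prints the robots meta tag, not robots.txt rules — confirm this step is intended.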
+     *   3. `do_robots` action
+     *   4. `robots_txt` filter
+     *
+     * @return string|null The robots.txt contents, or `null` if none could be generated.
      */
     private function get_local_robots_txt() {
+        // 1. Physical robots.txt file takes precedence
         $file = ABSPATH . 'robots.txt';
         if ( @is_readable( $file ) ) {
             $body = @file_get_contents( $file );
+            if ( is_string( $body ) && '' !== $body ) return $body;
+        }
+            if ( is_string( $body ) && '' !== trim( $body ) ) {
+                return $body;
+            }
+        }
+        // 2. Prefer wp_robots() when available (WP 5.7+)
+        if ( function_exists( 'wp_robots' ) ) {
+            ob_start();
+            wp_robots();
+            $out = ob_get_clean();
+            if ( is_string( $out ) && '' !== trim( $out ) ) {
+                return $out;
+            }
+        }
+        // 3. Fallback to core do_robots action
+        // do_robots is a WordPress core action used to generate robots.txt output.
         ob_start();
+        // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound
         do_action( 'do_robots' );
         $out = ob_get_clean();
+        if ( is_string( $out ) && '' !== trim( $out ) ) return $out;
+        if ( is_string( $out ) && '' !== trim( $out ) ) {
+            return $out;
+        }
+        // 4. Final fallback via robots_txt filter
+        // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound
         $output = apply_filters( 'robots_txt', '', get_option( 'blog_public' ) );
+        if ( is_string( $output ) && '' !== trim( $output ) ) return $output;
+        if ( is_string( $output ) && '' !== trim( $output ) ) {
+            return $output;
+        }
         return null;
 …
     /**
+     * Returns true if the robots.txt rules permit the bot to access the path.
+     *
+     * Why:  The function implements a simplified rule engine that:
+     *   1. Chooses the best User‑Agent group for the bot (exact match
+     *      or wildcard).
+     *   2. Normalises the path and checks it against each rule in the
+     *      selected group.
+     *   3. Uses longest‑path‑wins; disallow wins only if it
+     *      matches a longer path than any allow.  If no rule matches,
+     *      the URL is allowed by default.
+     *
+     * @param string $robots_txt Raw robots.txt text.
+     * @param string $agent_key  Internal bot key.
+     * @param string $path       Requested path.
+     * @return bool True if the path is allowed for the bot.
+     * Determines whether the given bot is allowed to fetch $path based on the active robots.txt.
+     *
+     * Implements the standard longest‑prefix‑wins rule for a single bot’s group.  The method
+     * returns `true` when the winning rule is `allow` or when the path matches no rule (default allow).
+     *
+     * @param string $robots_txt The retrieved robots.txt contents.
+     * @param string $agent_key The internal identifier for the bot.
+     * @param string $path The requested URL path (starting with “/”).
+     * @return bool True when crawling is permitted, false when disallowed.
      */
     private function robots_txt_allows(string $robots_txt, string $agent_key, string $path): bool {
 …
     /**
      * Public wrapper that provides a “state” + “reason” array describing
      * whether the bot is allowed to crawl the path, and populates
      * diagnostics output for the UI.
+     *
      * Why:  The admin pages display a status badge; this method produces
      *       the data the badge expects and also records diagnostic
      *       information (matched group & rule) that can be shown in
      *       an expandable details box.
+     *
      * @param string $agent_key Bot key.
      * @param string $path      Requested path.
      * @param array  &$diag     Output diagnostics (group, rule).
      * @return array (`state` => 'allowed'|'blocked'|'unknown',  `reason` => string).
+     * Public API that reports the allow/blocked status for a bot and a specific path.
+     *
+     * The method first checks whether the site globally discourages crawlers (`blog_public=0`).
+     * If not, it loads the robots.txt, selects the appropriate group, and evaluates
+     * the path.  A detailed diagnostic array (group name, rule used) is returned
+     * alongside the status.  When a global block applies, the status is `blocked`
+     * with a specific reason.
+     *
+     * @param string $agent_key The bot’s internal key (e.g. 'googlebot_desktop').
+     * @param string $path Path fragment (starting with “/”) to test.
+     * @param array|null $diag Optional reference parameter that receives diagnostics about
+     *                         the matching group and rule; can be omitted if not needed.
+     * @return array{state:string,reason:string} Possible `state` values: `allowed`, `blocked`, `unknown`.
      */
     public function robots_status_for_agent_path( $agent_key, $path, &$diag = null ) {
 …
     /**
+     * Renders a status badge and an expandable details section.
+     *
+     * Why:  The status badge (green check, red X, gray) gives a quick visual
+     *       cue.  The details expand on hover/click and show the exact
+     *       robots.txt group and rule that determined the decision.
+     *
+     * @param array $status Associative array returned by
+     *                      `robots_status_for_agent_path`.
+     * @param array $diag   Diagnostics array (group & rule).
+     * @return string HTML safe snippet for the badge + details.
+     * Renders a colour‑coded badge indicating the robots.txt status and a collapsible
+     * details section that shows which rule was matched.
+     *
+     * The badge colors are:
+     *   * Green with a checkmark for “allowed”
+     *   * Red with a cross for “blocked”
+     *   * Grey for “unknown”
+     *
+     * @param array{state:string,reason:string} $status Returned from {@see robots_status_for_agent_path()}.
+     * @param array{group:string,rule:string} $diag Diagnostics array for detail output.
+     * @return string Safe HTML for the badge + details element.
      */
     private function render_status_badge_expandable( $status, $diag ) {
 …
     /**
+     * Formats a timestamp in “human‑readable” form and a precise
+     * full‑timestamp (with micro‑seconds) for display.
+     *
+     * Why:  Users want to see “3 days ago (2025‑08‑12 15:04:23.123456)”.
+     *       This helper keeps the code in the main rendering loop
+     *       terse and centralises the formatting logic.
+     *
+     * @param float $ts Timestamp from the database.
+     * @return string HTML safe representation.
+     * Formats a UNIX timestamp (with microseconds) into a human‑readable “time‑ago” string
+     * plus the exact date/time in the site’s configured timezone.
+     *
+     * The output is safe for HTML rendering and is not localized beyond what
+     * WordPress’ `human_time_diff()` and `wp_date()` provide.
+     *
+     * @param float $ts Timestamp value returned by `microtime(true)`.
+     * @return string The formatted cell content; if $ts is falsy, “Not Yet” is returned.
      */
     private function format_last_seen_cell( $ts ) {
 …
     /**
+     * Formats the string shown in the admin bar for each bot.
+     *
+     * Why:  The toolbar entry should be compact (label + timestamp)
+     *       but still show the exact time.  This helper keeps the
+     *       formatting consistent between the toolbar and the
+     *       meta‑box.
+     *
+     * @param string $label Label for the bot.
+     * @param float  $ts    Timestamp (may be 0).
+     * @param string $suffix Optional small label (e.g., “today”).
+     * @return string Safe html string.
+     * Builds the display string used in admin‑bar nodes.
+     *
+     * It shows the label, the relative “time‑ago” string, and the absolute timestamp
+     * (with microseconds).  If no timestamp is available, “Not Yet” is used.
+     *
+     * @param string $label Human‑readable name of the agent.
+     * @param float $ts Timestamp value or 0 for “Not Yet”.
+     * @param string $suffix Optional suffix string to append after the label.
+     * @return string The formatted admin‑bar line.
      */
     private function format_admin_bar_line( $label, $ts, $suffix = '' ) {
 …
     /**
+     * Formats the “last page” cell that appears in the admin page.
+     *
+     * Why:  In the list view we only want the URL, not the post title.
+     *       The helper converts the stored data from `compute_agent_latest()`
+     *       into a link or an “–” if the data is missing.
+     *
+     * @param array $latest Associative array from `compute_agent_latest`.
+     * @return string HTML markup.
+     * Formats the “last page” cell of the admin dashboard table.
+     *
+     * Accepts either a post ID or a raw URL string.  The output is a clickable link
+     * that opens the page in a new tab.  Empty or missing values become a dash.
+     *
+     * @param array{ts:float,type:string,post_id:int,url:string,ua:string} $latest Data from {@see compute_agent_latest()}.
+     * @return string The safe HTML for the link or a dash.
      */
     private function format_context_cell_url_only( $latest ) {
 …
     /**
+     * Converts a UTC timestamp to the administrator’s timezone string.
+     *
+     * Why:  All timestamps are stored in UTC; showing them in the
+     *       local timezone (configured in Settings → General)
+     *       is far more user‑friendly.  The helper uses `wp_date()` if
+     *       available, else falls back to `date_i18n()`.
+     *
+     * @param int    $sec    Unix timestamp.
+     * @param string $format Optional format string (uses WordPress defaults if omitted).
+     * @return string Localised timestamp.
+     * Formats the given UNIX timestamp in the site's configured timezone using the given mask.
+     *
+     * Wraps `wp_date()` (available since WordPress 5.3) and falls back to
+     * `date_i18n()` on older cores.  The function takes a UNIX timestamp
+     * (seconds since the epoch) and a PHP date format string.
+     *
+     * @param int $sec Unix timestamp in seconds.
+     * @param string $format PHP date format mask.
+     * @return string Formatted date/time string.
      */
     private function format_site_tz( $sec, $format ) {
 …
     /**
+     * Computes the most recent known access time for a given bot.
+     *
+     * Why:  The admin page needs to show the “last seen” across the
+     *       entire site.  This method:
+     *   1. Fetches the cached “latest post” option and reads the
+     *      associated meta.  If that timestamp is newer, it is used.
+     *   2. Reads the cached “latest URL” option (created during recording)
+     *      which holds the exact URL and UA.  If that is newer than the
+     *      post timestamp, it wins.
+     *   3. Returns an associative array (`ts`, `type`, `post_id`, `url`, `ua`).
+     *
+     * @param string $key Bot key.
+     * @return array Latest visit info.
+     * Computes the most recent event for a given bot across the entire site.
+     *
+     * The algorithm inspects:
+     *   1. The post‑meta value of the last seen timestamp and associated UA for the last post that bot hit.
+     *   2. The `dzcr_latest_url_{$key}` option that records the most recently accessed URL.
+     *   3. Falls back to a default empty result if nothing is found.
+     *
+     * The function returns the candidate that has the greatest timestamp, together with
+     * the type (“post” or “url”) and details of the location.
+     *
+     * @param string $key Internal bot identifier.
+     * @return array{ts:float,type:string,post_id:int,url:string,ua:string} The best match.
      */
     private function compute_agent_latest( $key ) {
 …
     /**
+     * Adds “Settings” and “Documentation” links to the plugin’s row
+     * in the WordPress Plugins page.
+     *
+     * Why:  Site admins often look for quick access to the plugin’s
+     *       configuration.  Adding these links saves a few clicks.
+     *
+     * @param array $links Existing action links.
+     * @return array Updated array with new links.
+     * Adds “Settings” and “Documentation” links to the plugin’s row in the admin‑plugins page.
+     *
+     * The links open in the same window and in a new tab, respectively.
+     *
+     * @param array<string,string> $links Array of existing action links.
+     * @return array<string,string> The extended links array.
      */
     public function plugin_action_links( $links ) {
         $settings_url = admin_url( 'admin.php?page=cls-crawler-record' );
+        $settings_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG );
         $docs_url     = 'https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/';

crawler-record/tags/0.9.0/readme.txt

-                      r3400651
+                      r3423919
 Tags: googlebot, bingbot, gptbot, seo, robots
 Requires at least: 6.0
 Tested up to: 6.8
+Tested up to: 6.9
 Requires PHP: 7.4
 Stable tag: 0.8.0
 …
 == Changelog ==
+= 0.9.0 =
+* Now monitoring for Meta and Apple User Agents
+* More accurate site-wide UA reporting.
+* Ensured video tutorial appears on all admin screens.
+* Fixed small code errors.
 = 0.8.0 =

crawler-record/tags/0.9.0/uninstall.php

-                      r3400651
+                      r3423919
 // 2) Remove any known discrete options (if you introduced settings later).
 $discrete_options = array(
     'dzcr_settings',       // reserved for future settings array.
     'dzcr_agents_custom',  // reserved if you ever allow custom agent configs.
+$dzcr_discrete_options = array(
+    'dzcr_settings',
+    'dzcr_agents_custom',
 );
 foreach ( $discrete_options as $opt ) {
     delete_option( $opt );
+foreach ( $dzcr_discrete_options as $dzcr_opt ) {
+    delete_option( $dzcr_opt );
+}
 // 3) Optional: purge post meta set by the plugin (disabled by default).
 // Enable by defining DZCR_PURGE_POSTMETA true in wp-config.php OR using the filter below.
 $purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )
+$dzcr_purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )
     || apply_filters( 'dzcr_uninstall_purge_postmeta', false );
 if ( $purge_postmeta ) {
+if ( $dzcr_purge_postmeta ) {
     // Delete meta keys written per post:
     //  - _dzcr_last_seen_{agent}

crawler-record/trunk/crawler-record.php

-                      r3400643
+                      r3423919
  * Plugin URI:  https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/
  * Description: Are the most common search and chat (AI/LLM) bots able to access the pages on your website? Crawler Record can tell you the last time each of the most common search/chat bots visited -and which pages at which they looked.
  * Version:     0.8.0
+ * Version:     0.9.0
  * Requires at least: 6.0
  * Tested up to: 6.8
+ * Tested up to: 6.9
  * Requires PHP: 7.4
  * Author:      dizzysoft
 …
+}
+define( 'CRAWLER_RECORD_VERSION', '0.8.0' );
+define( 'CRAWLER_RECORD_VERSION', '0.9.0' );
+define( 'DZCR_ADMIN_SLUG', 'dzcr-crawler-record' );
 class Crawler_Record {
+    /**
+     * Holds a list of all bot groups and their UA patterns.
+     *
+     * Why:  The agent list is small, stable, and rarely changes; storing
+     *       it in code removes the need for a custom database table
+     *       and keeps migration simple.  The `apply_filters()` wrapper
+     *       lets developers add or replace bots without touching this file.
+     *
+     * @var array
+     */
+    // https://www.searchenginejournal.com/ai-crawler-user-agents-list/558130/
     private $dzcr_default_agent_groups = [
         'Google' => [
 …
         ],
+        'Meta' => [
+            'doc'   => 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/',
+            'items' => [
+                'meta_externalagent' => [
+                    'label'   => 'Meta-ExternalAgent',
+                    'pattern' => '#meta-externalagent/\d+(?:\.\d+)?#i',
+                ],
+                'meta_webindexer' => [
+                    'label'   => 'Meta-WebIndexer',
+                    'pattern' => '#meta-webindexer/\d+(?:\.\d+)?#i',
+                ],
+            ],
+        ],
+        'Apple' => [
+            'doc'   => 'https://support.apple.com/en-us/119829',
+            'items' => [
+                'applebot' => [
+                    'label'   => 'Applebot',
+                    'pattern' => '#Applebot/\d+(?:\.\d+)?#i',
+                ],
+                // Robots-only (does not crawl pages)
+                'applebot_extended' => [
+                    'label'   => 'Applebot-Extended (AI)',
+                    'pattern' => '#(?!.)#',
+                ],
+            ],
+        ],
         'DuckDuckGo' => [
             'doc'   => 'https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot',
 …
     ];
-    /**
-     * Cache of the resolved, filtered agent list.
+     *
-     * Why:  Resolving the agent list once per request saves expensive work
-     *       (filtering & validation).  Stored in memory for the duration of
-     *       the request; no persistence needed.
+     *
-     * @var array|null
-     */
     private $dzcr_agents_effective_cache = null;
-    /**
-     * Cache of the flat list of UA patterns keyed by the agent key.
+     *
-     * Why:  Many look-ups need a simple key > regex map; building it once avoids
-     *       iterating through the nested agent group structure each time.
+     *
-     * @var array|null
-     */
     private $dzcr_agents_flat_cache      = null;
-    /**
-     * Prefix used for per-post Meta keys that store the last timestamp a bot
-     * visited the post.
+     *
-     * Why:  Namespacing the meta key prevents collisions with other plugins.
-     *       The prefix is combined with the unique agent key.
+     *
-     * @var string
-     */
     private $dzcr_meta_prefix     = '_dzcr_last_seen_';
-    /**
-     * Prefix used for site-wide last-seen Meta keys (stored as options).
+     *
-     * Why:  Allows quick access to the most recent time *any* page was seen by
-     *       each bot without scanning the entire database.
+     *
-     * @var string
-     */
     private $dzcr_site_prefix     = '_dzcr_site_last_';
-    /**
-     * Prefix used for per-URL last-seen keys (stored as options).
+     *
-     * Why:  URLs are not tied to a post, so options keep arbitrary
-     *       key/value pairs without cluttering postmeta.
+     *
-     * @var string
-     */
     private $dzcr_url_prefix      = '_dzcr_url_last_';
-    /**
-     * Prefix used for the last-post-ID per bot (stored as options).
+     *
-     * Why:  This is a fast index: if the last post ID is known we can fetch
-     *       its meta immediately, avoiding a full table scan.
+     *
-     * @var string
-     */
     private $dzcr_lastpost_prefix = '_dzcr_last_post_';
-    /**
-     * Class constructor.
+     *
-     * Why:  Hooks the core functionality into WordPress:
-     *   - `template_redirect` records when a crawler requests a page.
-     *   - `add_meta_boxes` adds the per-post "Crawler Record" box.
-     *   - `admin_bar_menu` inserts a quick toolbar entry for logged-in users.
-     *   - `admin_menu` registers the settings page in the admin sidebar.
-     *   - `plugin_action_links_*` adds Settings/Documentation links on the
-     *     Plugins screen.
+     *
-     * This is the entry point of the plugin – everything else is called
-     * indirectly via these actions.
-     */
     public function __construct() {
         add_action( 'template_redirect', [ $this, 'maybe_record_last_seen' ] );
 …
     public function enqueue_admin_assets() {
         $screen = function_exists('get_current_screen') ? get_current_screen() : null;
         if ( ! $screen || $screen->id !== 'toplevel_page_crawler-record' ) {
+        if ( ! $screen || $screen->id !== 'toplevel_page_' . DZCR_ADMIN_SLUG ) {
             return; // only on the plugin admin page
+        }
 …
             'https://player.vimeo.com/api/player.js',
             [],
             $ver,
+            CRAWLER_RECORD_VERSION,
             true
         );
 …
+    }
+    /**
+     * Retrieves the effective agent groups after filters, validation,
+     * and custom ordering have been applied.
+     *
+     * Why:  The default list is processed once per request.  This method
+     *       returns a validated array that can be cached for all other
+     *       look‑ups (`get_agent_groups()`).  The validation step removes
+     *       malformed entries and guarantees the presence of a `doc`
+     *       URL and a `label` for each item.
+     *
+     * @return array The resolved, validated agent groups.
+    /**
+     * Lazily resolves the complete agent‑group tree, applying the `dzcr_agent_groups`
+     * and `dzcr_agents_order` filters, and caches the result for the lifetime of the request.
+     *
+     * The returned array maps group names to an array containing a `doc` URL and a list of
+     * individual bot definitions (`label` and `pattern`).  The data is filtered so that
+     * malformed entries (missing patterns or labels) are removed automatically.
+     *
+     * @return array<string,array<string,mixed>> The fully‑validated and ordered agent groups.
      */
     private function get_agent_groups() {
 …
     /**
+     * Returns a flattened map of bot keys to regex pattern strings.
+     *
+     * Why:  The recording logic (`maybe_record_last_seen`) needs a
+     *       simple `foreach ($agent_key => $pattern)` loop.  Flattening
+     *       removes the overhead of walking the nested group structure
+     *       for every request.
+     *
+     * @return array Key → regex string.
+     * Returns a flattened map of every bot key to its compiled regular‑expression pattern.
+     *
+     * The function iterates over the effective agent groups obtained via {@see get_agent_groups()}
+     * and builds a key‑=>pattern list.  The result is cached for the current request.
+     *
+     * @return array<string,string> Bot key → regex pattern.
      */
     private function get_agents_flat() {
 …
     /**
+     * Looks up the human‑readable label for a bot key.
+     *
+     * Why:  The UI shows the label, not the internal key.  The helper
+     *       centralises the lookup and provides a default in case the
+     *       key is unknown.
+     *
+     * @param string $key Internal bot identifier.
+     * @return string Human‑readable label.
+     * Looks up a human‑readable label for a bot key in the current agent‑group configuration.
+     *
+     * If the key cannot be found, its raw value is returned as a fallback.  This helper is
+     * used to keep UI code terse while still presenting friendly names to site editors.
+     *
+     * @param string $key The internal identifier for a bot (e.g. 'googlebot_desktop').
+     * @return string The user‑friendly label.
      */
     private function get_label_for_key( $key ) {
 …
     /**
+     * Builds the canonical URL of the current front‑end request.
+     *
+     * Why:  Recorded timestamps are stored per‑URL, so we need an exact,
+     *       normalised URL.  The function normalises the scheme
+     *       (`http/https`), host, path, and applies the
+     *       `dzcr_normalize_url` filter (useful for stripping tracking
+     *       parameters or converting to a canonical form).
+     *
+     * @return string Full URL (or empty string in admin/invalid state).
+     */
+    private function current_url() {
+        if ( is_admin() ) {
+     * Generates the canonical, normalized permalink for the queried singular post.
+     *
+     * The function guarantees the returned URL does **not** contain a query string,
+     * a fragment identifier, or an empty path.  The result is suitable for storage
+     * in post meta and for robots‑txt matching.
+     *
+     * @param int $post_id By reference, receives the resolved post ID or 0 if no singular.
+     * @return string The sanitized absolute URL, or an empty string when no query can be resolved.
+     */
+    private function current_url_for_post( &$post_id = 0 ) {
+        if ( is_admin() || ! is_singular() ) {
             return '';
+        }
+        if ( ! isset( $_SERVER['HTTP_HOST'], $_SERVER['REQUEST_URI'] ) ) {
+        $post_id = (int) get_queried_object_id();
+        if ( ! $post_id ) {
             return '';
+        }
+        $scheme = ( ! empty( $_SERVER['HTTPS'] ) && 'off' !== $_SERVER['HTTPS'] ) ? 'https' : 'http';
+        $host   = sanitize_text_field( wp_unslash( $_SERVER['HTTP_HOST'] ) );
+        $uri    = esc_url_raw( wp_unslash( $_SERVER['REQUEST_URI'] ) );
+        if ( '' === $host ) {
+        $url = get_permalink( $post_id );
+        if ( ! $url ) {
             return '';
+        }
+        $url = $scheme . '://' . $host . $uri;
+        $url = apply_filters( 'dzcr_normalize_url', $url );
+        if ( strlen( $url ) > 2048 ) {
+            $url = substr( $url, 0, 2048 );
+        }
+        return $url;
+    }
+    /**
+     * Records the last “seen” timestamp when a crawler requests a page.
+     *
+     * Why:  This method is the heart of the plugin.  It is called on
+     *       `template_redirect` and performs the following actions:
+     *
+     *   1. Skips previews, feeds, REST, admin, AJAX, and cron requests.
+     *   2. Only processes GET/HEAD requests, which are the standard
+     *      HTTP verbs used by crawlers.
+     *   3. Trims and sanitises the UA string (max 512 chars) and
+     *      normalises it to avoid runaway regexes.
+     *   4. Throttles writes using a 10‑minute window (configurable via
+     *      `dzcr_throttle_window`) to avoid spamming the DB when the same
+     *      bot visits the page repeatedly.
+     *   5. Stores three pieces of information:
+     *        - Per‑post meta (`_dzcr_last_seen_[bot]`).
+     *        - Per‑URL options (`_dzcr_url_last_[bot]_[hash]`) plus companion
+     *          `_url_` and `_ua_` options.
+     *        - Site‑wide option (`_dzcr_site_last_[bot]`).
+     *
+     * The method also updates the “last UA string” meta/option so the
+     * UI can show exactly what the bot sent.
+     *
+        // Normalize: strip query string and fragment
+        $parts = wp_parse_url( $url );
+        if ( empty( $parts['host'] ) || empty( $parts['path'] ) ) {
+            return '';
+        }
+        $scheme = $parts['scheme'] ?? 'https';
+        return $scheme . '://' . $parts['host'] . $parts['path'];
+    }
+    /**
+     * Core routine that records bot visit timestamps when the current request matches a known crawler.
+     *
+     * The method is executed early in the front‑end rendering cycle (`template_redirect`).
+     * 1. It exits immediately for admin screens, REST requests, feeds, `HEAD` requests, or
+     *    when the `User‑Agent` header is missing or too long.
+     * 2. The UA string is matched against the compiled regex list.  Only the first matching
+     *    rule is considered.
+     * 3. A per‑post meta key and a global option are updated if the throttle window
+     *    (10 minutes by default) allows it.  The method also records the exact UA string
+     *    that triggered the update so editors can inspect it later.
+     *
+     * The method takes no parameters; the post ID is resolved internally (see `current_url_for_post()`).
      * @return void
+     * @throws RuntimeException If a WordPress `set_transient()` call fails (unlikely, but reported).
      */
     public function maybe_record_last_seen() {
 …
         $now     = microtime( true );
+        $post_id = is_singular() ? (int) get_queried_object_id() : 0;
         $url     = $this->current_url();
         $urlhash = $url ? md5( $url ) : '';
+        $post_id = 0;
+        $url     = $this->current_url_for_post( $post_id );
         $throttle = (int) apply_filters( 'dzcr_throttle_window', 10 * MINUTE_IN_SECONDS );
 …
+            }
+            // Per URL
+            if ( $urlhash ) {
+                $opt_key     = $this->dzcr_url_prefix . $key . '_' . $urlhash;
+                $opt_url_key = $this->dzcr_url_prefix . $key . '_url_' . $urlhash;
+                $opt_ua_key  = $this->dzcr_url_prefix . $key . '_ua_'  . $urlhash; // store last UA for this URL+agent
+                $t_url       = 'dzcr_seen_url_' . $key . '_' . $urlhash;
+                if ( ! $throttle || ! get_transient( $t_url ) ) {
+                    $prev = (float) get_option( $opt_key, 0 );
+                    if ( $now > $prev ) {
+                        update_option( $opt_key, (string) $now, false );
+                        update_option( $opt_url_key, $url, false );
+                        update_option( $opt_ua_key,  $ua,  false );
+                    }
+                    if ( $throttle ) {
+                        set_transient( $t_url, 1, $throttle );
+                    }
+            // Site-wide latest real post/page
+            if ( $post_id && $url ) {
+                $latest_key = 'dzcr_latest_url_' . $key;
+                $prev       = get_option( $latest_key, [] );
+                $prev_ts    = isset( $prev['ts'] ) ? (float) $prev['ts'] : 0.0;
+                if ( $now > $prev_ts ) {
+                    update_option(
+                        $latest_key,
+                        [
+                            'ts'  => (string) $now,
+                            'url' => $url,
+                            'ua'  => $ua,
+                        ],
+                        false
+                    );
+                }
+            }
 …
     /**
+     * Safely matches a UA string against a stored pattern or literal value.
+     *
+     * Why:  Direct use of `preg_match()` on untrusted user‑agent strings
+     *       can trigger PHP warnings/errors (e.g. malformed regex).
+     *       This wrapper:
+     *   - Ensures the pattern begins with a `#` and ends with `#i`.
+     *   - Suppresses `preg_match()` errors with the `@` operator.
+     *   - Validates that the regex is syntactically correct.
+     *   - Falls back to a literal `hash_equals()` comparison when a plain string
+     *     is supplied.
+     *
+     * @param string $ua             The UA string from the request.
+     * @param string $pattern_or_exact  Either a regex string or a literal.
+     * @return bool True if the UA matches the pattern or string.
+     * Safely tests whether a given User‑Agent string matches a pattern.
+     *
+     * Handles both plain strings (exact comparison) and regular‑expressions by inspecting
+     * the first character.  The function avoids the costly `preg_match` errors through
+     * the `@` error‑control operator and checks the regex error code.
+     *
+     * @param string $ua The User‑Agent string to test.
+     * @param string $pattern_or_exact Either a regex (enclosed in `#...#i`) or a literal string.
+     * @return bool True if the UA satisfies the pattern; otherwise false.
      */
     private function ua_matches_safe( $ua, $pattern_or_exact ) {
 …
     /**
+     * Registers the “Crawler Record” meta‑box that appears on the post
+     * editing screen.
+     *
+     * Why:  Post editors need instant insight into which crawlers have
+     *       visited the post and their access status.  The meta‑box
+     *       appears in the “normal” context with high priority,
+     *       making it highly visible without adding extra menu items.
+     * Registers the “Crawler Record” meta‑box on the post editing screen.
+     *
+     * The meta‑box is added to the *Advanced* context so that it does not clutter the editor
+     * for non‑technical users.  It is only visible to capable editors (users with
+     * `edit_post` capability).
+     *
      */
     public function register_meta_box() {
 …
     /**
+     * Renders the per‑post “Crawler Record” meta‑box.
+     *
+     * Why:  The box shows a table of all bots, each with:
+     *   - A human‑readable label.
+     *   - The last time the bot accessed the post (`Last Seen`).
+     *   - A status badge that tells you whether the URL is allowed by
+     *     `robots.txt`.
+     *   - A collapsible section that reveals the exact UA string
+     *     the bot used when it visited.
+     *
+     * The function also prints a warning if the site is globally
+     * blocking all crawlers (WordPress “Discourage search‑engine” setting).
+     *
+     * @param WP_Post $post Current post object.
+     * Renders the contents of the “Crawler Record” meta‑box.
+     *
+     * Generates a table that lists every tracked agent, its last‑seen timestamp, and a
+     * robots‑txt allow/blocked status.  The UI includes a collapsible `details` element
+     * that shows the exact User‑Agent string used for the most recent hit.
+     *
+     * @param WP_Post $post The post object currently being edited.
      * @return void
      */
 …
                 'diag'   => $diag,
                 'ua'     => $last_ua,
                 'na'     => ( 'google_extended' === $key ), // treat Google-Extended as non-crawler
+                'na' => $this->is_robots_only_agent( $key ),
             ];
+        }
 …
     /**
+     * Adds a “Crawler Record” entry to the front‑end admin bar.
+     *
+     * Why:  Site admins often browse the front‑end while logged in.
+     *       The toolbar provides a one‑click shortcut to the
+     *       detailed bot‑status page and shows a quick list of
+     *       timestamps for each bot, whether looking at a single page,
+     *       a specific URL, or the entire site.
+     *
+     * @param WP_Admin_Bar $wp_admin_bar Admin‑bar object.
+     * Adds a “Crawler Record” node to the front‑end admin bar with context‑aware children.
+     *
+     * The node is only shown to logged‑in users viewing the front‑end (not the admin).  Depending
+     * on whether the current page is singular or generic, the children list will display
+     * per‑post or site‑wide last‑seen data for each crawler.
+     *
+     * @param WP_Admin_Bar $wp_admin_bar The admin bar instance.
      * @return void
      */
 …
+        }
         $admin_page_url = admin_url( 'admin.php?page=cls-crawler-record' );
+        $admin_page_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG );
         // Parent link
 …
                 foreach ( $wrap['items'] as $key => $_def ) {
                     if ( 'google_extended' === $key ) {
                         continue; // do not show Google-Extended in admin bar
+                    if ( $this->is_robots_only_agent( $key ) ) {
+                        continue;
+                    }
                     $label = $this->get_label_for_key( $key );
 …
     /**
+     * Registers the plugin’s top‑level admin page.
+     *
+     * Why:  The settings page is a convenient place to:
+     *   - See all bots and their global timestamps.
+     *   - Inspect the last page each bot has seen.
+     *   - View robots.txt diagnostics for each bot.
+     * Registers the top‑level “Crawler Record” admin submenu page.
+     *
+     * The submenu is created under the **Settings** root so that it is consistently
+     * reachable regardless of the user role or the presence of other plugins.
+     *
      * @return void
 …
             __( 'Crawler Record', 'crawler-record' ),
             'manage_options',
             'cls-crawler-record',
+            DZCR_ADMIN_SLUG,
             [ $this, 'render_admin_page' ],
             'dashicons-search',
 …
     /**
+     * Renders the entire admin page.
+     *
+     * Why:  The page shows a per‑bot table containing:
+     *   - The bot’s human label.
+     *   - The most recent global timestamp.
+     *   - The link to the most recent page the bot visited.
+     *   - A status badge indicating robots.txt permissibility.
+     *
+     * The table also has a useful “Important” notice if
+     * WordPress is blocking all crawlers.
+     * Renders the public dashboard for all tracked agents.
+     *
+     * The page lists each bot’s last‑seen timestamp, the last URL it visited, and the
+     * result of a robots‑txt evaluation for the site’s home path.  A small embedded video
+     * tutorial is shown in the right‑hand column.
+     *
      * @return void
 …
                     : $this->robots_status_for_agent_path( $key, $home_path, $diag );
                 $show_last_fields_as_na = ( 'google_extended' === $key );
+                $show_last_fields_as_na = $this->is_robots_only_agent( $key );
                 // Agent cell (expandable UA string unless NA)
 …
     /**
+     * Checks if the site is configured to “Discourage search engines”.
+     *
+     * Why:  If WordPress’s global setting is enabled, all crawlers are
+     *       blocked regardless of `robots.txt`.  The function lets us
+     *       quickly short‑circuit further checks.
+     *
+     * @return bool True if the site is discouraging crawlers.
+     * Determines if WordPress is currently discouraging search engines (`blog_public` = 0).
+     *
+     * The helper is used to short‑circuit logic that would otherwise record activity,
+     * and to display a warning banner on admin screens.
+     *
+     * @return bool True when site‑wide crawling is disabled.
      */
     private function is_site_discouraged() {
 …
     /**
+     * Returns a status array for the “blocked by WordPress” case.
+     *
+     * Why:  Used when `is_site_discouraged()` is true; the admin UI
+     *       needs a consistent structure (`state` & `reason`) for
+     *       rendering the status badge.
+     *
+     * @return array `state` (blocked) and a human‑readable reason.
+     * Builds a standard “blocked by WordPress setting” status array.
+     *
+     * Returned from {@see robots_status_for_agent_path()} when the site is
+     * configured to discourage search engines.
+     *
+     * @return array{state:string,reason:string} Blocking information.
      */
     private function forced_block_status() {
 …
     /**
+     * Maps an internal bot key to the list of UA strings that the bot
+     * uses (used to choose the “most representative” UA for robots.txt
+     * checks).
+     *
+     * Why:  Different bots have distinct patterns for desktop/mobile,
+     *       legacy, and AI‑bot forms.  This helper returns the set of
+     *       tokens that must be matched against the `robots.txt` filter
+     *       logic.
+     *
+     * @param string $key Internal bot key.
+     * @return array List of tokens.
+     * Maps an internal bot key to the canonical User‑Agent string(s) that can appear
+     * in the site’s robots.txt.
+     *
+     * The mapping is used for robots‑txt group selection.  When a key maps to
+     * multiple tokens it is returned as an array of strings; otherwise a single‑element array.
+     *
+     * @param string $key The internal identifier for a bot.
+     * @return array<string> The list of UA strings to check in robots.txt.
      */
     private function robots_tokens_for_key(string $key): array {
 …
             case 'perplexity_user':      return ['Perplexity-User']; // normalize Unicode hyphens
+            // Meta
+            case 'meta_externalagent':
+                return ['meta-externalagent'];
+            case 'meta_webindexer':
+                return ['Meta-WebIndexer'];
+            // Apple
+            case 'applebot':
+                return ['Applebot'];
+            case 'applebot_extended':
+                return ['Applebot-Extended'];
             // DuckDuckGo
             case 'duckduckgo_search':    return ['DuckDuckBot'];
 …
     /**
+     * Chooses the appropriate `User‑Agent` group in robots.txt for
+     * a given bot.
+     *
+     * Why:  A robot may have multiple `User‑Agent` lines (desktop/mobile).
+     *       The longest exact match wins; if none match we fall back to
+     *       the wildcard group (`*`).  This function returns the entire
+     *       group structure (agents + rules) for that bot.
+     *
+     * @param string $robots_txt Raw text of robots.txt.
+     * @param array  $tokens     List of tokens that identify the bot.
+     * @return array Group with `agents` and `rules` keys.
+     * Returns true for bots that only check robots.txt and never fetch pages.
+     *
+     * For these bots the per‑post “last‑seen” columns are marked “N/A” because it is
+     * impossible to record an actual page visit.
+     *
+     * @param string $key The internal bot identifier.
+     * @return bool True when the bot is robots‑only.
+     */
+    private function is_robots_only_agent( string $key ): bool {
+        return in_array(
+            $key,
+            [
+                'google_extended',
+                'applebot_extended',
+            ],
+            true
+        );
+    }
+    /**
+     * Chooses the most specific robots.txt group that matches the requested bot tokens.
+     *
+     * The grouping algorithm follows the order of rules in the file: a group that
+     * lists a specific User‑Agent token and the longest matching rule wins.
+     * If no group matches, the first wildcard `*` group is returned (if present).
+     *
+     * @param string $robots_txt Raw robots.txt contents.
+     * @param array<string> $tokens List of canonical UA strings for the bot.
+     * @return array{agents:list<string>,rules:list<mixed>} The selected group’s agents and rules.
      */
     private function select_robots_group(string $robots_txt, array $tokens) {
 …
     /**
+     * Checks whether a path starts with the rule prefix.
+     *
+     * Why:  `robots.txt` rules are prefix based; e.g. `Disallow: /wp-admin`
+     *       blocks everything under `/wp-admin`.  A simple `strncmp()`
+     *       suffices because we already normalised the rule to start with
+     *       a slash and have removed any trailing `$` terminator.
+     *
+     * @param string $path Path from the request.
+     * @param string $rule Rule from robots.txt.
+     * @return bool True if the path matches the rule.
+     * Tests whether the supplied robots rule path is a strict prefix of $path.
+     *
+     * The helper is used by {@see robots_txt_allows()} when applying the longest‑prefix
+     * rule logic.
+     *
+     * @param string $path The request path.
+     * @param string $rule The rule path defined in robots.txt.
+     * @return bool True when $rule is a prefix of $path.
      */
     private function path_prefix_match(string $path, string $rule): bool {
 …
     /**
+     * Normalises hyphen characters in a string; replaces various Unicode
+     * hyphens with the ASCII hyphen.
+     *
+     * Why:  Some bots (e.g., Perplexity) use non‑standard hyphens in
+     *       their UA strings.  Normalising them guarantees that
+     *       string‑based matching (e.g. `strcasecmp()`) works
+     *       regardless of the particular hyphen glyph.
+     * Normalises a URL path so that an empty string becomes “/”.
+     *
+     * An empty path can cause subtle bugs in robots‑txt evaluation and is treated
+     * as the root path by WordPress.
+     *
+     * @param string $p The original path.
+     * @return string Normalised path.
+     */
+    private function normalize_path(string $p): string {
+        return ($p === '') ? '/' : $p;
+    }
+    /**
+     * Replaces every form of Unicode hyphen (–, ‑, －, …) with a simple ASCII hyphen.
+     *
+     * The rule set is chosen to avoid false mismatches when a site’s robots.txt
+     * contains variant hyphens in its UA tokens.
+     *
      * @param string $s Input string.
      * @return string Normalised string.
      */
-    private function normalize_path(string $p): string {
-        return ($p === '') ? '/' : $p;
+    }
-    /**
-     * Normalises hyphen characters in a string; replaces various Unicode
-     * hyphens with the ASCII hyphen.
+     *
-     * Why:  Some bots (e.g., Perplexity) use non‑standard hyphens in
-     *       their UA strings.  Normalising them guarantees that
-     *       string‑based matching (e.g. `strcasecmp()`) works
-     *       regardless of the particular hyphen glyph.
+     *
-     * @param string $s Input string.
-     * @return string Normalised string.
-     */
     private function normalize_hyphens(string $s): string {
         return preg_replace('/[\x{2010}-\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}]/u','-',$s);
 …
     /**
+     * Parses a raw `robots.txt` string into an array of groups.
+     *
+     * Why:  The plugin re‑implements the minimal parsing logic needed
+     *       for our use‑case (User‑Agent, Allow, Disallow).  The function
+     *       ignores other directives (Sitemap, Crawl‑delay) and keeps
+     *       groups and rules in the order they appear, which is essential
+     *       for the longest‑path‑wins logic.
+     *
+     * @param string $txt Raw content of robots.txt.
+     * @return array Array of groups, each with `agents` and `rules`.
+     * Parses the raw robots.txt into an array of agent‑group objects.
+     *
+     * The parser emits structures of the form:
+     *   [{'agents'=>[...], 'rules'=>[['allow'|'disallow', '/path'], ...]}, ...]
+     *  * Non‑essential lines (`Sitemap:`, `Crawl‑delay:`, comments) are ignored.
+     *  * Blank lines reset state, but a group is only flushed when a directive is encountered.
+     *
+     * @param string $txt Raw robots.txt string.
+     * @return list<array{agents:list<string>,rules:list<mixed>}> List of groups, each with `agents` and `rules`.
      */
     private function parse_robots_groups(string $txt): array {
 …
     /**
      * Retrieves the raw `robots.txt` content for the current site.
+     *
      * Why:  We avoid an HTTP request to the public `robots.txt`.
      *       WordPress can generate it via `do_robots()` or the file can
      *       be read directly.  The function falls back to the filter
      *       `robots_txt` if no file or `do_robots()` output is
      *       available.
+     *
      * @return string|null Raw robots.txt body or null if unavailable.
+     * Retrieves the active robots.txt for the current site.
+     *
+     * The method respects the order imposed by the WordPress core:
+     *   1. Physical `robots.txt` file
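+     *   2. `wp_robots()` (WP 5.7+) — NOTE(review): core documents this function as printing the robots meta tag, not robots.txt content; confirm this step is intended.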
+     *   3. `do_robots` action
+     *   4. `robots_txt` filter
+     *
+     * @return string|null The robots.txt contents, or `null` if none could be generated.
      */
     private function get_local_robots_txt() {
+        // 1. Physical robots.txt file takes precedence
         $file = ABSPATH . 'robots.txt';
         if ( @is_readable( $file ) ) {
             $body = @file_get_contents( $file );
+            if ( is_string( $body ) && '' !== $body ) return $body;
+        }
+            if ( is_string( $body ) && '' !== trim( $body ) ) {
+                return $body;
+            }
+        }
+        // 2. Prefer wp_robots() when available (WP 5.7+)
+        if ( function_exists( 'wp_robots' ) ) {
+            ob_start();
+            wp_robots();
+            $out = ob_get_clean();
+            if ( is_string( $out ) && '' !== trim( $out ) ) {
+                return $out;
+            }
+        }
+        // 3. Fallback to core do_robots action
+        // do_robots is a WordPress core action used to generate robots.txt output.
         ob_start();
+        // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound
         do_action( 'do_robots' );
         $out = ob_get_clean();
+        if ( is_string( $out ) && '' !== trim( $out ) ) return $out;
+        if ( is_string( $out ) && '' !== trim( $out ) ) {
+            return $out;
+        }
+        // 4. Final fallback via robots_txt filter
+        // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound
         $output = apply_filters( 'robots_txt', '', get_option( 'blog_public' ) );
+        if ( is_string( $output ) && '' !== trim( $output ) ) return $output;
+        if ( is_string( $output ) && '' !== trim( $output ) ) {
+            return $output;
+        }
         return null;
 …
     /**
+     * Returns true if the robots.txt rules permit the bot to access the path.
+     *
+     * Why:  The function implements a simplified rule engine that:
+     *   1. Chooses the best User‑Agent group for the bot (exact match
+     *      or wildcard).
+     *   2. Normalises the path and checks it against each rule in the
+     *      selected group.
+     *   3. Uses longest‑path‑wins; disallow wins only if it
+     *      matches a longer path than any allow.  If no rule matches,
+     *      the URL is allowed by default.
+     *
+     * @param string $robots_txt Raw robots.txt text.
+     * @param string $agent_key  Internal bot key.
+     * @param string $path       Requested path.
+     * @return bool True if the path is allowed for the bot.
+     * Determines whether the given bot is allowed to fetch $path based on the active robots.txt.
+     *
+     * Implements the standard longest‑prefix‑wins rule for a single bot’s group.  The method
+     * returns `true` for `allow` or for paths that match no rule (default allow).
+     *
+     * @param string $robots_txt The retrieved robots.txt contents.
+     * @param string $agent_key The internal identifier for the bot.
+     * @param string $path The requested URL path (starting with “/”).
+     * @return bool True when crawling is permitted, false when disallowed.
      */
     private function robots_txt_allows(string $robots_txt, string $agent_key, string $path): bool {
 …
     /**
      * Public wrapper that provides a “state” + “reason” array describing
      * whether the bot is allowed to crawl the path, and populates
      * diagnostics output for the UI.
+     *
      * Why:  The admin pages display a status badge; this method produces
      *       the data the badge expects and also records diagnostic
      *       information (matched group & rule) that can be shown in
      *       an expandable details box.
+     *
      * @param string $agent_key Bot key.
      * @param string $path      Requested path.
      * @param array  &$diag     Output diagnostics (group, rule).
      * @return array (`state` => 'allowed'|'blocked'|'unknown',  `reason` => string).
+     * Public API that reports the allow/blocked status for a bot and a specific path.
+     *
+     * The method first checks whether the site globally discourages crawlers (`blog_public=0`).
+     * If not, it loads the robots.txt, selects the appropriate group, and evaluates
+     * the path.  A detailed diagnostic array (group name, rule used) is returned
+     * alongside the status.  When a global block applies, the status is `blocked`
+     * with a specific reason.
+     *
+     * @param string $agent_key The bot’s internal key (e.g. 'googlebot_desktop').
+     * @param string $path Path fragment (starting with “/”) to test.
+     * @param array|null $diag Optional reference parameter that receives diagnostics about
+     *                         the matching group and rule; can be omitted if not needed.
+     * @return array{state:string,reason:string} Possible `state` values: `allowed`, `blocked`, `unknown`.
      */
     public function robots_status_for_agent_path( $agent_key, $path, &$diag = null ) {
 …
     /**
+     * Renders a status badge and an expandable details section.
+     *
+     * Why:  The status badge (green check, red X, gray) gives a quick visual
+     *       cue.  The details expand on hover/click and show the exact
+     *       robots.txt group and rule that determined the decision.
+     *
+     * @param array $status Associative array returned by
+     *                      `robots_status_for_agent_path`.
+     * @param array $diag   Diagnostics array (group & rule).
+     * @return string HTML safe snippet for the badge + details.
+     * Renders a colour‑coded badge indicating the robots.txt status and a collapsible
+     * details section that shows which rule was matched.
+     *
+     * The badge colors are:
+     *   * Green with a checkmark for “allowed”
+     *   * Red with a cross for “blocked”
+     *   * Grey for “unknown”
+     *
+     * @param array{state:string,reason:string} $status Returned from {@see robots_status_for_agent_path()}.
+     * @param array{group:string,rule:string} $diag Diagnostics array for detail output.
+     * @return string Safe HTML for the badge + details element.
      */
     private function render_status_badge_expandable( $status, $diag ) {
 …
     /**
+     * Formats a timestamp in “human‑readable” form and a precise
+     * full‑timestamp (with micro‑seconds) for display.
+     *
+     * Why:  Users want to see “3 days ago (2025‑08‑12 15:04:23.123456)”.
+     *       This helper keeps the code in the main rendering loop
+     *       terse and centralises the formatting logic.
+     *
+     * @param float $ts Timestamp from the database.
+     * @return string HTML safe representation.
+     * Formats a UNIX timestamp (with microseconds) into a human‑readable “time‑ago” string
+     * plus the exact date/time in the site’s configured timezone.
+     *
+     * The output is safe for HTML rendering and is not localized beyond what
+     * WordPress’ `human_time_diff()` and `wp_date()` provide.
+     *
+     * @param float $ts Timestamp value returned by `microtime(true)`.
+     * @return string The formatted cell content; if $ts is falsy, “Not Yet” is returned.
      */
     private function format_last_seen_cell( $ts ) {
 …
     /**
+     * Formats the string shown in the admin bar for each bot.
+     *
+     * Why:  The toolbar entry should be compact (label + timestamp)
+     *       but still show the exact time.  This helper keeps the
+     *       formatting consistent between the toolbar and the
+     *       meta‑box.
+     *
+     * @param string $label Label for the bot.
+     * @param float  $ts    Timestamp (may be 0).
+     * @param string $suffix Optional small label (e.g., “today”).
+     * @return string Safe html string.
+     * Builds the display string used in admin‑bar nodes.
+     *
+     * It shows the label, the relative “time‑ago” string, and the absolute timestamp
+     * (with microseconds).  If no timestamp is available, “Not Yet” is used.
+     *
+     * @param string $label Human‑readable name of the agent.
+     * @param float $ts Timestamp value or 0 for “Not Yet”.
+     * @param string $suffix Optional suffix string to append after the label.
+     * @return string The formatted admin‑bar line.
      */
     private function format_admin_bar_line( $label, $ts, $suffix = '' ) {
 …
     /**
+     * Formats the “last page” cell that appears in the admin page.
+     *
+     * Why:  In the list view we only want the URL, not the post title.
+     *       The helper converts the stored data from `compute_agent_latest()`
+     *       into a link or an “–” if the data is missing.
+     *
+     * @param array $latest Associative array from `compute_agent_latest`.
+     * @return string HTML markup.
+     * Formats the “last page” cell of the admin dashboard table.
+     *
+     * Accepts either a post ID or a raw URL string.  The output is a clickable link
+     * that opens the page in a new tab.  Empty or missing values become a dash.
+     *
+     * @param array{ts:float,type:string,post_id:int,url:string,ua:string} $latest Data from {@see compute_agent_latest()}.
+     * @return string The safe HTML for the link or a dash.
      */
     private function format_context_cell_url_only( $latest ) {
 …
     /**
+     * Converts a UTC timestamp to the administrator’s timezone string.
+     *
+     * Why:  All timestamps are stored in UTC; showing them in the
+     *       local timezone (configured in Settings → General)
+     *       is far more user‑friendly.  The helper uses `wp_date()` if
+     *       available, else falls back to `date_i18n()`.
+     *
+     * @param int    $sec    Unix timestamp.
+     * @param string $format Optional format string (uses WordPress defaults if omitted).
+     * @return string Localised timestamp.
+     * Formats the given UNIX timestamp in the site’s configured timezone, using the given mask.
+     *
+     * Wraps `wp_date()` (WordPress 5.3+) and falls back to
+     * `date_i18n()` on older cores.  The function takes a UNIX timestamp
+     * (seconds since the epoch) and a PHP date format string.
+     *
+     * @param int $sec Unix timestamp in seconds.
+     * @param string $format PHP date format mask.
+     * @return string Formatted date/time string.
      */
     private function format_site_tz( $sec, $format ) {
 …
     /**
+     * Computes the most recent known access time for a given bot.
+     *
+     * Why:  The admin page needs to show the “last seen” across the
+     *       entire site.  This method:
+     *   1. Fetches the cached “latest post” option and reads the
+     *      associated meta.  If that timestamp is newer, it is used.
+     *   2. Reads the cached “latest URL” option (created during recording)
+     *      which holds the exact URL and UA.  If that is newer than the
+     *      post timestamp, it wins.
+     *   3. Returns an associative array (`ts`, `type`, `post_id`, `url`, `ua`).
+     *
+     * @param string $key Bot key.
+     * @return array Latest visit info.
+     * Computes the most recent event for a given bot across the entire site.
+     *
+     * The algorithm inspects:
+     *   1. The post‑meta value of the last seen timestamp and associated UA for the last post that bot hit.
+     *   2. The `dzcr_latest_url_{$key}` option that records the most recently accessed URL.
+     *   3. Falls back to a default empty result if nothing is found.
+     *
+     * The function returns the candidate that has the greatest timestamp, together with
+     * the type (“post” or “url”) and details of the location.
+     *
+     * @param string $key Internal bot identifier.
+     * @return array{ts:float,type:string,post_id:int,url:string,ua:string} The best match.
      */
     private function compute_agent_latest( $key ) {
 …
     /**
+     * Adds “Settings” and “Documentation” links to the plugin’s row
+     * in the WordPress Plugins page.
+     *
+     * Why:  Site admins often look for quick access to the plugin’s
+     *       configuration.  Adding these links saves a few clicks.
+     *
+     * @param array $links Existing action links.
+     * @return array Updated array with new links.
+     * Adds “Settings” and “Documentation” links to the plugin’s row in the admin‑plugins page.
+     *
+     * The links open in the same window and in a new tab, respectively.
+     *
+     * @param array<string,string> $links Array of existing action links.
+     * @return array<string,string> The extended links array.
      */
     public function plugin_action_links( $links ) {
         $settings_url = admin_url( 'admin.php?page=cls-crawler-record' );
+        $settings_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG );
         $docs_url     = 'https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/';

crawler-record/trunk/readme.txt

-                      r3400643
+                      r3423919
 Tags: googlebot, bingbot, gptbot, seo, robots
 Requires at least: 6.0
 Tested up to: 6.8
+Tested up to: 6.9
 Requires PHP: 7.4
 Stable tag: 0.8.0
 …
 == Changelog ==
+= 0.9.0 =
+* Now monitoring for Meta and Apple User Agents
+* More accurate site-wide UA reporting.
+* Ensured video tutorial appears on all admin screens.
+* Fixed small code errors.
 = 0.8.0 =

crawler-record/trunk/uninstall.php

-                      r3366584
+                      r3423919
 // 2) Remove any known discrete options (if you introduced settings later).
 $discrete_options = array(
     'dzcr_settings',       // reserved for future settings array.
     'dzcr_agents_custom',  // reserved if you ever allow custom agent configs.
+$dzcr_discrete_options = array(
+    'dzcr_settings',
+    'dzcr_agents_custom',
 );
 foreach ( $discrete_options as $opt ) {
     delete_option( $opt );
+foreach ( $dzcr_discrete_options as $dzcr_opt ) {
+    delete_option( $dzcr_opt );
+}
 // 3) Optional: purge post meta set by the plugin (disabled by default).
 // Enable by defining DZCR_PURGE_POSTMETA true in wp-config.php OR using the filter below.
 $purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )
+$dzcr_purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )
     || apply_filters( 'dzcr_uninstall_purge_postmeta', false );
 if ( $purge_postmeta ) {
+if ( $dzcr_purge_postmeta ) {
     // Delete meta keys written per post:
     //  - _dzcr_last_seen_{agent}

Note: See TracChangeset for help on using the changeset viewer.

Trac UI Preferences

Download in other formats: