Plugin Directory

Changeset 3423919


Ignore:
Timestamp:
12/19/2025 06:10:18 PM (3 weeks ago)
Author:
dizzysoft
Message:

Version 0.9.0

Location:
crawler-record
Files:
1 added
3 edited
5 copied

Legend:

Unmodified
Added
Removed
  • crawler-record/tags/0.9.0/crawler-record.php

    r3400651 r3423919  
    44 * Plugin URI:  https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/
    55 * Description: Are the most common search and chat (AI/LLM) bots able to access the pages on your website? Crawler Record can tell you the last time each of the most common search/chat bots visited -and which pages at which they looked.
    6  * Version:     0.8.0
     6 * Version:     0.9.0
    77 * Requires at least: 6.0
    8  * Tested up to: 6.8
     8 * Tested up to: 6.9
    99 * Requires PHP: 7.4
    1010 * Author:      dizzysoft
     
    2020}
    2121
    22 define( 'CRAWLER_RECORD_VERSION', '0.8.0' );
     22define( 'CRAWLER_RECORD_VERSION', '0.9.0' );
     23define( 'DZCR_ADMIN_SLUG', 'dzcr-crawler-record' );
    2324
    2425class Crawler_Record {
    2526
    26     /**
    27      * Holds a list of all bot groups and their UA patterns.
    28      *
    29      * Why:  The agent list is small, stable, and rarely changes; storing
    30      *       it in code removes the need for a custom database table
    31      *       and keeps migration simple.  The `apply_filters()` wrapper
    32      *       lets developers add or replace bots without touching this file.
    33      *
    34      * @var array
    35      */
     27    // https://www.searchenginejournal.com/ai-crawler-user-agents-list/558130/
    3628    private $dzcr_default_agent_groups = [
    3729        'Google' => [
     
    144136        ],
    145137
     138        'Meta' => [
     139            'doc'   => 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/',
     140            'items' => [
     141                'meta_externalagent' => [
     142                    'label'   => 'Meta-ExternalAgent',
     143                    'pattern' => '#meta-externalagent/\d+(?:\.\d+)?#i',
     144                ],
     145                'meta_webindexer' => [
     146                    'label'   => 'Meta-WebIndexer',
     147                    'pattern' => '#meta-webindexer/\d+(?:\.\d+)?#i',
     148                ],
     149            ],
     150        ],
     151
     152        'Apple' => [
     153            'doc'   => 'https://support.apple.com/en-us/119829',
     154            'items' => [
     155                'applebot' => [
     156                    'label'   => 'Applebot',
     157                    'pattern' => '#Applebot/\d+(?:\.\d+)?#i',
     158                ],
     159
     160                // Robots-only (does not crawl pages)
     161                'applebot_extended' => [
     162                    'label'   => 'Applebot-Extended (AI)',
     163                    'pattern' => '#(?!.)#',
     164                ],
     165            ],
     166        ],
     167
    146168        'DuckDuckGo' => [
    147169            'doc'   => 'https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot',
     
    169191    ];
    170192
    171 
    172     /**
    173      * Cache of the resolved, filtered agent list.
    174      *
    175      * Why:  Resolving the agent list once per request saves expensive work
    176      *       (filtering & validation).  Stored in memory for the duration of
    177      *       the request; no persistence needed.
    178      *
    179      * @var array|null
    180      */
    181193    private $dzcr_agents_effective_cache = null;
    182194
    183     /**
    184      * Cache of the flat list of UA patterns keyed by the agent key.
    185      *
    186      * Why:  Many look-ups need a simple key > regex map; building it once avoids
    187      *       iterating through the nested agent group structure each time.
    188      *
    189      * @var array|null
    190      */
    191195    private $dzcr_agents_flat_cache      = null;
    192196
    193     /**
    194      * Prefix used for per-post Meta keys that store the last timestamp a bot
    195      * visited the post.
    196      *
    197      * Why:  Namespacing the meta key prevents collisions with other plugins.
    198      *       The prefix is combined with the unique agent key.
    199      *
    200      * @var string
    201      */
    202197    private $dzcr_meta_prefix     = '_dzcr_last_seen_';
    203198
    204     /**
    205      * Prefix used for site-wide last-seen Meta keys (stored as options).
    206      *
    207      * Why:  Allows quick access to the most recent time *any* page was seen by
    208      *       each bot without scanning the entire database.
    209      *
    210      * @var string
    211      */
    212199    private $dzcr_site_prefix     = '_dzcr_site_last_';
    213200
    214     /**
    215      * Prefix used for per-URL last-seen keys (stored as options).
    216      *
    217      * Why:  URLs are not tied to a post, so options keep arbitrary
    218      *       key/value pairs without cluttering postmeta.
    219      *
    220      * @var string
    221      */
    222201    private $dzcr_url_prefix      = '_dzcr_url_last_';
    223202
    224     /**
    225      * Prefix used for the last-post-ID per bot (stored as options).
    226      *
    227      * Why:  This is a fast index: if the last post ID is known we can fetch
    228      *       its meta immediately, avoiding a full table scan.
    229      *
    230      * @var string
    231      */
    232203    private $dzcr_lastpost_prefix = '_dzcr_last_post_';
    233204
    234 
    235     /**
    236      * Class constructor.
    237      *
    238      * Why:  Hooks the core functionality into WordPress:
    239      *   - `template_redirect` records when a crawler requests a page.
    240      *   - `add_meta_boxes` adds the per-post "Crawler Record" box.
    241      *   - `admin_bar_menu` inserts a quick toolbar entry for logged-in users.
    242      *   - `admin_menu` registers the settings page in the admin sidebar.
    243      *   - `plugin_action_links_*` adds Settings/Documentation links on the
    244      *     Plugins screen.
    245      *
    246      * This is the entry point of the plugin – everything else is called
    247      * indirectly via these actions.
    248      */
    249205    public function __construct() {
    250206        add_action( 'template_redirect', [ $this, 'maybe_record_last_seen' ] );
     
    258214    public function enqueue_admin_assets() {
    259215        $screen = function_exists('get_current_screen') ? get_current_screen() : null;
    260         if ( ! $screen || $screen->id !== 'toplevel_page_crawler-record' ) {
     216        if ( ! $screen || $screen->id !== 'toplevel_page_' . DZCR_ADMIN_SLUG ) {
    261217            return; // only on the plugin admin page
    262218        }
     
    265221            'https://player.vimeo.com/api/player.js',
    266222            [],
    267             $ver,
     223            CRAWLER_RECORD_VERSION,
    268224            true
    269225        );
     
    271227    }
    272228
    273 
    274     /**
    275      * Retrieves the effective agent groups after filters, validation,
    276      * and custom ordering have been applied.
    277      *
    278      * Why:  The default list is processed once per request.  This method
    279      *       returns a validated array that can be cached for all other
    280      *       look‑ups (`get_agent_groups()`).  The validation step removes
    281      *       malformed entries and guarantees the presence of a `doc`
    282      *       URL and a `label` for each item.
    283      *
    284      * @return array The resolved, validated agent groups.
     229    /**
     230     * Lazily resolves the complete agent‑group tree, applying the `dzcr_agent_groups`
     231     * and `dzcr_agents_order` filters, and caches the result for the lifetime of the request.
     232     *
     233     * The returned array maps group names to an array containing a `doc` URL and a list of
     234     * individual bot definitions (`label` and `pattern`).  The data is filtered so that
     235     * malformed entries (missing patterns or labels) are removed automatically.
     236     *
     237     * @return array<string,array<string,mixed>> The fully‑validated and ordered agent groups.
    285238     */
    286239    private function get_agent_groups() {
     
    332285
    333286    /**
    334      * Returns a flattened map of bot keys to regex pattern strings.
    335      *
    336      * Why:  The recording logic (`maybe_record_last_seen`) needs a
    337      *       simple `foreach ($agent_key => $pattern)` loop.  Flattening
    338      *       removes the overhead of walking the nested group structure
    339      *       for every request.
    340      *
    341      * @return array Key → regex string.
     287     * Returns a flattened map of every bot key to its compiled regular‑expression pattern.
     288     *
     289     * The function iterates over the effective agent groups obtained via {@see get_agent_groups()}
      290     * and builds a key => pattern map.  The result is cached for the current request.
     291     *
     292     * @return array<string,string> Bot key → regex pattern.
    342293     */
    343294    private function get_agents_flat() {
     
    356307
    357308    /**
    358      * Looks up the human‑readable label for a bot key.
    359      *
    360      * Why:  The UI shows the label, not the internal key.  The helper
    361      *       centralises the lookup and provides a default in case the
    362      *       key is unknown.
    363      *
    364      * @param string $key Internal bot identifier.
    365      * @return string Human‑readable label.
     309     * Looks up a human‑readable label for a bot key in the current agent‑group configuration.
     310     *
     311     * If the key cannot be found, its raw value is returned as a fallback.  This helper is
     312     * used to keep UI code terse while still presenting friendly names to site editors.
     313     *
     314     * @param string $key The internal identifier for a bot (e.g. 'googlebot_desktop').
     315     * @return string The user‑friendly label.
    366316     */
    367317    private function get_label_for_key( $key ) {
     
    375325
    376326    /**
    377      * Builds the canonical URL of the current front‑end request.
    378      *
    379      * Why:  Recorded timestamps are stored per‑URL, so we need an exact,
    380      *       normalised URL.  The function normalises the scheme
    381      *       (`http/https`), host, path, and applies the
    382      *       `dzcr_normalize_url` filter (useful for stripping tracking
    383      *       parameters or converting to a canonical form).
    384      *
    385      * @return string Full URL (or empty string in admin/invalid state).
    386      */
    387     private function current_url() {
    388         if ( is_admin() ) {
     327     * Generates the canonical, normalized permalink for the queried singular post.
     328     *
     329     * The function guarantees the returned URL does **not** contain a query string,
     330     * a fragment identifier, or an empty path.  The result is suitable for storage
     331     * in post meta and for robots‑txt matching.
     332     *
      333     * @param int $post_id By reference; receives the resolved post ID, or 0 when the query is not for a singular post.
     334     * @return string The sanitized absolute URL, or an empty string when no query can be resolved.
     335     */
     336    private function current_url_for_post( &$post_id = 0 ) {
     337        if ( is_admin() || ! is_singular() ) {
    389338            return '';
    390339        }
    391         if ( ! isset( $_SERVER['HTTP_HOST'], $_SERVER['REQUEST_URI'] ) ) {
     340
     341        $post_id = (int) get_queried_object_id();
     342        if ( ! $post_id ) {
    392343            return '';
    393344        }
    394         $scheme = ( ! empty( $_SERVER['HTTPS'] ) && 'off' !== $_SERVER['HTTPS'] ) ? 'https' : 'http';
    395         $host   = sanitize_text_field( wp_unslash( $_SERVER['HTTP_HOST'] ) );
    396         $uri    = esc_url_raw( wp_unslash( $_SERVER['REQUEST_URI'] ) );
    397 
    398         if ( '' === $host ) {
     345
     346        $url = get_permalink( $post_id );
     347        if ( ! $url ) {
    399348            return '';
    400349        }
    401         $url = $scheme . '://' . $host . $uri;
    402         $url = apply_filters( 'dzcr_normalize_url', $url );
    403         if ( strlen( $url ) > 2048 ) {
    404             $url = substr( $url, 0, 2048 );
    405         }
    406         return $url;
    407     }
    408 
    409     /**
    410      * Records the last “seen” timestamp when a crawler requests a page.
    411      *
    412      * Why:  This method is the heart of the plugin.  It is called on
    413      *       `template_redirect` and performs the following actions:
    414      *
    415      *   1. Skips previews, feeds, REST, admin, AJAX, and cron requests.
    416      *   2. Only processes GET/HEAD requests, which are the standard
    417      *      HTTP verbs used by crawlers.
    418      *   3. Trims and sanitises the UA string (max 512 chars) and
    419      *      normalises it to avoid runaway regexes.
    420      *   4. Throttles writes using a 10‑minute window (configurable via
    421      *      `dzcr_throttle_window`) to avoid spamming the DB when the same
    422      *      bot visits the page repeatedly.
    423      *   5. Stores three pieces of information:
    424      *        - Per‑post meta (`_dzcr_last_seen_[bot]`).
    425      *        - Per‑URL options (`_dzcr_url_last_[bot]_[hash]`) plus companion
    426      *          `_url_` and `_ua_` options.
    427      *        - Site‑wide option (`_dzcr_site_last_[bot]`).
    428      *
    429      * The method also updates the “last UA string” meta/option so the
    430      * UI can show exactly what the bot sent.
    431      *
     350
     351        // Normalize: strip query string and fragment
     352        $parts = wp_parse_url( $url );
     353        if ( empty( $parts['host'] ) || empty( $parts['path'] ) ) {
     354            return '';
     355        }
     356
     357        $scheme = $parts['scheme'] ?? 'https';
     358        return $scheme . '://' . $parts['host'] . $parts['path'];
     359    }
     360
     361    /**
     362     * Core routine that records bot visit timestamps when the current request matches a known crawler.
     363     *
     364     * The method is executed early in the front‑end rendering cycle (`template_redirect`).
     365     * 1. It exits immediately for admin screens, REST requests, feeds, `HEAD` requests, or
     366     *    when the `User‑Agent` header is missing or too long.
     367     * 2. The UA string is matched against the compiled regex list.  Only the first matching
     368     *    rule is considered.
     369     * 3. A per‑post meta key and a global option are updated if the throttle window
     370     *    (10 minutes by default) allows it.  The method also records the exact UA string
     371     *    that triggered the update so editors can inspect it later.
     372     *
      373     * Takes no parameters; the post ID and URL are resolved internally.
     432374     * @return void
      375     * @throws RuntimeException If a `set_transient()` failure occurs (unlikely, but reported).
    433376     */
    434377    public function maybe_record_last_seen() {
     
    445388
    446389        $now     = microtime( true );
    447         $post_id = is_singular() ? (int) get_queried_object_id() : 0;
    448         $url     = $this->current_url();
    449         $urlhash = $url ? md5( $url ) : '';
     390
     391        $post_id = 0;
     392        $url     = $this->current_url_for_post( $post_id );
    450393
    451394        $throttle = (int) apply_filters( 'dzcr_throttle_window', 10 * MINUTE_IN_SECONDS );
     
    470413            }
    471414
    472             // Per URL
    473             if ( $urlhash ) {
    474                 $opt_key     = $this->dzcr_url_prefix . $key . '_' . $urlhash;
    475                 $opt_url_key = $this->dzcr_url_prefix . $key . '_url_' . $urlhash;
    476                 $opt_ua_key  = $this->dzcr_url_prefix . $key . '_ua_'  . $urlhash; // store last UA for this URL+agent
    477                 $t_url       = 'dzcr_seen_url_' . $key . '_' . $urlhash;
    478 
    479                 if ( ! $throttle || ! get_transient( $t_url ) ) {
    480                     $prev = (float) get_option( $opt_key, 0 );
    481                     if ( $now > $prev ) {
    482                         update_option( $opt_key, (string) $now, false );
    483                         update_option( $opt_url_key, $url, false );
    484                         update_option( $opt_ua_key,  $ua,  false );
    485                     }
    486                     if ( $throttle ) {
    487                         set_transient( $t_url, 1, $throttle );
    488                     }
     415            // Site-wide latest real post/page
     416            if ( $post_id && $url ) {
     417                $latest_key = 'dzcr_latest_url_' . $key;
     418                $prev       = get_option( $latest_key, [] );
     419                $prev_ts    = isset( $prev['ts'] ) ? (float) $prev['ts'] : 0.0;
     420
     421                if ( $now > $prev_ts ) {
     422                    update_option(
     423                        $latest_key,
     424                        [
     425                            'ts'  => (string) $now,
     426                            'url' => $url,
     427                            'ua'  => $ua,
     428                        ],
     429                        false
     430                    );
    489431                }
    490432            }
     
    501443
    502444    /**
    503      * Safely matches a UA string against a stored pattern or literal value.
    504      *
    505      * Why:  Direct use of `preg_match()` on untrusted user‑agent strings
    506      *       can trigger PHP warnings/errors (e.g. malformed regex).
    507      *       This wrapper:
    508      *   - Ensures the pattern begins with a `#` and ends with `#i`.
    509      *   - Suppresses `preg_match()` errors with the `@` operator.
    510      *   - Validates that the regex is syntactically correct.
    511      *   - Fallbacks to literal `hash_equals()` comparison when a plain string
    512      *     is supplied.
    513      *
    514      * @param string $ua             The UA string from the request.
    515      * @param string $pattern_or_exact  Either a regex string or a literal.
    516      * @return bool True if the UA matches the pattern or string.
     445     * Safely tests whether a given User‑Agent string matches a pattern.
     446     *
     447     * Handles both plain strings (exact comparison) and regular‑expressions by inspecting
     448     * the first character.  The function avoids the costly `preg_match` errors through
     449     * the `@` error‑control operator and checks the regex error code.
     450     *
     451     * @param string $ua The User‑Agent string to test.
     452     * @param string $pattern_or_exact Either a regex (enclosed in `#...#i`) or a literal string.
     453     * @return bool True if the UA satisfies the pattern; otherwise false.
    517454     */
    518455    private function ua_matches_safe( $ua, $pattern_or_exact ) {
     
    542479
    543480    /**
    544      * Registers the “Crawler Record” meta‑box that appears on the post
    545      * editing screen.
    546      *
    547      * Why:  Post editors need instant insight into which crawlers have
    548      *       visited the post and their access status.  The meta‑box
    549      *       appears in the normal “normal” context with high priority,
    550      *       making it highly visible without adding extra menu items.
     481     * Registers the “Crawler Record” meta‑box on the post editing screen.
     482     *
     483     * The meta‑box is added to the *Advanced* context so that it does not clutter the editor
     484     * for non‑technical users.  It is only visible to capable editors (users with
     485     * `edit_post` capability).
     486     *
    551487     */
    552488    public function register_meta_box() {
     
    562498
    563499    /**
    564      * Renders the per‑post “Crawler Record” meta‑box.
    565      *
    566      * Why:  The box shows a table of all bots, each with:
    567      *   - A human‑readable label.
    568      *   - The last time the bot accessed the post (`Last Seen`).
    569      *   - A status badge that tells you whether the URL is allowed by
    570      *     `robots.txt`.
    571      *   - A collapsible section that reveals the exact UA string
    572      *     the bot used when it visited.
    573      *
    574      * The function also prints a warning if the site is globally
    575      * blocking all crawlers (WordPress “Discourage search‑engine” setting).
    576      *
    577      * @param WP_Post $post Current post object.
     500     * Renders the contents of the “Crawler Record” meta‑box.
     501     *
     502     * Generates a table that lists every tracked agent, its last‑seen timestamp, and a
     503     * robots‑txt allow/blocked status.  The UI includes a collapsible `details` element
     504     * that shows the exact User‑Agent string used for the most recent hit.
     505     *
     506     * @param WP_Post $post The post object currently being edited.
    578507     * @return void
    579508     */
     
    618547                'diag'   => $diag,
    619548                'ua'     => $last_ua,
    620                 'na'     => ( 'google_extended' === $key ), // treat Google-Extended as non-crawler
     549                'na' => $this->is_robots_only_agent( $key ),
    621550            ];
    622551        }
     
    662591
    663592    /**
    664      * Adds a “Crawler Record” entry to the front‑end admin bar.
    665      *
    666      * Why:  Site admins often browse the front‑end while logged in.
    667      *       The toolbar provides a one‑click shortcut to the
    668      *       detailed bot‑status page and shows a quick list of
    669      *       timestamps for each bot, whether looking at a single page,
    670      *       a specific URL, or the entire site.
    671      *
    672      * @param WP_Admin_Bar $wp_admin_bar Admin‑bar object.
     593     * Adds a “Crawler Record” node to the front‑end admin bar with context‑aware children.
     594     *
     595     * The node is only shown to logged‑in users viewing the front‑end (not the admin).  Depending
     596     * on whether the current page is singular or generic, the children list will display
     597     * per‑post or site‑wide last‑seen data for each crawler.
     598     *
     599     * @param WP_Admin_Bar $wp_admin_bar The admin bar instance.
    673600     * @return void
    674601     */
     
    679606        }
    680607
    681         $admin_page_url = admin_url( 'admin.php?page=cls-crawler-record' );
     608        $admin_page_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG );
    682609
    683610        // Parent link
     
    700627
    701628                foreach ( $wrap['items'] as $key => $_def ) {
    702                     if ( 'google_extended' === $key ) {
    703                         continue; // do not show Google-Extended in admin bar
     629                    if ( $this->is_robots_only_agent( $key ) ) {
     630                        continue;
    704631                    }
     632
    705633                    $label = $this->get_label_for_key( $key );
    706634
     
    736664
    737665    /**
    738      * Registers the plugin’s top‑level admin page.
    739      *
    740      * Why:  The settings page is a convenient place to:
    741      *   - See all bots and their global timestamps.
    742      *   - Inspect the last page each bot has seen.
    743      *   - View robots.txt diagnostics for each bot.
      666     * Registers the top‑level “Crawler Record” admin menu page.
      667     *
      668     * The page is registered as its own top‑level menu entry so that it is consistently
      669     * reachable regardless of the presence of other plugins.
    744670     *
    745671     * @return void
     
    750676            __( 'Crawler Record', 'crawler-record' ),
    751677            'manage_options',
    752             'cls-crawler-record',
     678            DZCR_ADMIN_SLUG,
    753679            [ $this, 'render_admin_page' ],
    754680            'dashicons-search',
     
    758684
    759685    /**
    760      * Renders the entire admin page.
    761      *
    762      * Why:  The page shows a per‑bot table containing:
    763      *   - The bot’s human label.
    764      *   - The most recent global timestamp.
    765      *   - The link to the most recent page the bot visited.
    766      *   - A status badge indicating robots.txt permissibility.
    767      *
    768      * The table also has a useful “Important” notice if
    769      * WordPress is blocking all crawlers.
     686     * Renders the public dashboard for all tracked agents.
     687     *
     688     * The page lists each bot’s last‑seen timestamp, the last URL it visited, and the
     689     * result of a robots‑txt evaluation for the site’s home path.  A small embedded video
     690     * tutorial is shown in the right‑hand column.
    770691     *
    771692     * @return void
     
    847768                    : $this->robots_status_for_agent_path( $key, $home_path, $diag );
    848769
    849                 $show_last_fields_as_na = ( 'google_extended' === $key );
     770                $show_last_fields_as_na = $this->is_robots_only_agent( $key );
    850771
    851772                // Agent cell (expandable UA string unless NA)
     
    879800
    880801    /**
    881      * Checks if the site is configured to “Discourage search engines”.
    882      *
    883      * Why:  If WordPress’s global setting is enabled, all crawlers are
    884      *       blocked regardless of `robots.txt`.  The function lets us
    885      *       quickly short‑circuit further checks.
    886      *
    887      * @return bool True if the site is discouraging crawlers.
     802     * Determines if WordPress is currently discouraging search engines (`blog_public` = 0).
     803     *
     804     * The helper is used to short‑circuit logic that would otherwise record activity,
     805     * and to display a warning banner on admin screens.
     806     *
     807     * @return bool True when site‑wide crawling is disabled.
    888808     */
    889809    private function is_site_discouraged() {
     
    892812
    893813    /**
    894      * Returns a status array for the “blocked by WordPress” case.
    895      *
    896      * Why:  Used when `is_site_discouraged()` is true; the admin UI
    897      *       needs a consistent structure (`state` & `reason`) for
    898      *       rendering the status badge.
    899      *
    900      * @return array `state` (blocked) and a human‑readable reason.
     814     * Builds a standard “blocked by WordPress setting” status array.
     815     *
     816     * Returned from {@see robots_status_for_agent_path()} when the site is
     817     * configured to discourage search engines.
     818     *
     819     * @return array{state:string,reason:string} Blocking information.
    901820     */
    902821    private function forced_block_status() {
     
    908827
    909828    /**
    910      * Maps an internal bot key to the list of UA strings that the bot
    911      * uses (used to choose the “most representative” UA for robots.txt
    912      * checks).
    913      *
    914      * Why:  Different bots have distinct patterns for desktop/mobile,
    915      *       legacy, and AI‑bot forms.  This helper returns the set of
    916      *       tokens that must be matched against the `robots.txt` filter
    917      *       logic.
    918      *
    919      * @param string $key Internal bot key.
    920      * @return array List of tokens.
     829     * Maps an internal bot key to the canonical User‑Agent string(s) that can appear
     830     * in the site’s robots.txt.
     831     *
     832     * The mapping is used for robots‑txt group selection.  When a key maps to
     833     * multiple tokens it is returned as an array of strings; otherwise a single‑element array.
     834     *
     835     * @param string $key The internal identifier for a bot.
     836     * @return array<string> The list of UA strings to check in robots.txt.
    921837     */
    922838    private function robots_tokens_for_key(string $key): array {
     
    953869            case 'perplexity_user':      return ['Perplexity-User']; // normalize Unicode hyphens
    954870
     871            // Meta
     872            case 'meta_externalagent':
     873                return ['meta-externalagent'];
     874            case 'meta_webindexer':
     875                return ['Meta-WebIndexer'];
     876
     877            // Apple
     878            case 'applebot':
     879                return ['Applebot'];
     880            case 'applebot_extended':
     881                return ['Applebot-Extended'];
     882
     883
    955884            // DuckDuckGo
    956885            case 'duckduckgo_search':    return ['DuckDuckBot'];
     
    964893
    965894    /**
    966      * Chooses the appropriate `User‑Agent` group in robots.txt for
    967      * a given bot.
    968      *
    969      * Why:  A robot may have multiple `User‑Agent` lines (desktop/mobile).
    970      *       The longest exact match wins; if none match we fallback to
    971      *       the wildcard group (`*`).  This function returns the entire
    972      *       group structure (agents + rules) for that bot.
    973      *
    974      * @param string $robots_txt Raw text of robots.txt.
    975      * @param array  $tokens     List of tokens that identify the bot.
    976      * @return array Group with `agents` and `rules` keys.
     895     * Returns true for bots that only check robots.txt and never fetch pages.
     896     *
     897     * For these bots the per‑post “last‑seen” columns are marked “N/A” because it is
     898     * impossible to record an actual page visit.
     899     *
     900     * @param string $key The internal bot identifier.
     901     * @return bool True when the bot is robots‑only.
     902     */
     903    private function is_robots_only_agent( string $key ): bool {
     904        return in_array(
     905            $key,
     906            [
     907                'google_extended',
     908                'applebot_extended',
     909            ],
     910            true
     911        );
     912    }
     913
     914    /**
     915     * Chooses the most specific robots.txt group that matches the requested bot tokens.
     916     *
     917     * The grouping algorithm follows the order of rules in the file: a group that
     918     * lists a specific User‑Agent token and the longest matching rule wins.
     919     * If no group matches, the first wildcard `*` group is returned (if present).
     920     *
     921     * @param string $robots_txt Raw robots.txt contents.
     922     * @param array<string> $tokens List of canonical UA strings for the bot.
     923     * @return array{agents:list<string>,rules:list<mixed>} The selected group’s agents and rules.
    977924     */
    978925    private function select_robots_group(string $robots_txt, array $tokens) {
     
    1003950
    1004951    /**
    1005      * Checks whether a path starts with the rule prefix.
    1006      *
    1007      * Why:  `robots.txt` rules are prefix based; e.g. `Disallow: /wp-admin`
    1008      *       blocks everything under `/wp-admin`.  A simple `strncmp()`
    1009      *       suffices because we already normalised the rule to start with
    1010      *       a slash and have removed any trailing `$` terminator.
    1011      *
    1012      * @param string $path Path from the request.
    1013      * @param string $rule Rule from robots.txt.
    1014      * @return bool True if the path matches the rule.
     952     * Tests whether the supplied robots rule path is a strict prefix of $path.
     953     *
     954     * The helper is used by {@see robots_txt_allows()} when applying the longest‑prefix
     955     * rule logic.
     956     *
     957     * @param string $path The request path.
     958     * @param string $rule The rule path defined in robots.txt.
     959     * @return bool True when $rule is a prefix of $path.
    1015960     */
    1016961    private function path_prefix_match(string $path, string $rule): bool {
     
    1019964
    1020965    /**
    1021      * Normalises hyphen characters in a string; replaces various Unicode
    1022      * hyphens with the ASCII hyphen.
    1023      *
    1024      * Why:  Some bots (e.g., Perplexity) use non‑standard hyphens in
    1025      *       their UA strings.  Normalising them guarantees that
    1026      *       string‑based matching (e.g. `strcasecmp()`) works
    1027      *       regardless of the particular hyphen glyph.
     966     * Normalises a URL path so that an empty string becomes “/”.
     967     *
     968     * An empty path can cause subtle bugs in robots‑txt evaluation and is treated
     969     * as the root path by WordPress.
     970     *
     971     * @param string $p The original path.
     972     * @return string Normalised path.
     973     */
    private function normalize_path(string $p): string {
        // An empty request path is treated as the site root ("/"),
        // mirroring how WordPress resolves it.
        if ('' === $p) {
            return '/';
        }
        return $p;
    }
     977
     978    /**
     979     * Replaces every form of Unicode hyphen (–, ‑, -, …) with a simple ASCII hyphen.
     980     *
     981     * The rule set is chosen to avoid false mismatches when a site’s robots.txt
     982     * contains variant hyphens in its UA tokens.
    1028983     *
    1029984     * @param string $s Input string.
    1030985     * @return string Normalised string.
    1031986     */
    1032     private function normalize_path(string $p): string {
    1033         return ($p === '') ? '/' : $p;
    1034     }
    1035 
    1036     /**
    1037      * Normalises hyphen characters in a string; replaces various Unicode
    1038      * hyphens with the ASCII hyphen.
    1039      *
    1040      * Why:  Some bots (e.g., Perplexity) use non‑standard hyphens in
    1041      *       their UA strings.  Normalising them guarantees that
    1042      *       string‑based matching (e.g. `strcasecmp()`) works
    1043      *       regardless of the particular hyphen glyph.
    1044      *
    1045      * @param string $s Input string.
    1046      * @return string Normalised string.
    1047      */
    1048987    private function normalize_hyphens(string $s): string {
    1049988        return preg_replace('/[\x{2010}-\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}]/u','-',$s);
     
    1051990
    1052991    /**
    1053      * Parses a raw `robots.txt` string into an array of groups.
    1054      *
    1055      * Why:  The plugin re‑implements the minimal parsing logic needed
    1056      *       for our use‑case (User‑Agent, Allow, Disallow).  The function
    1057      *       ignores other directives (Sitemap, Crawl‑delay) and keeps
    1058      *       groups and rules in the order they appear, which is essential
    1059      *       for the longest‑path‑wins logic.
    1060      *
    1061      * @param string $txt Raw content of robots.txt.
    1062      * @return array Array of groups, each with `agents` and `rules`.
     992     * Parses the raw robots.txt into an array of agent‑group objects.
     993     *
     994     * The parser emits structures of the form:
     995     *   [{'agents'=>[...], 'rules'=>[['allow'|'disallow', '/path'], ...]}, ...]
     996     *  * Non‑essential lines (`Sitemap:`, `Crawl‑delay:`, comments) are ignored.
     997     *  * Blank lines reset state, but a group is only flushed when a directive is encountered.
     998     *
     999     * @param string $txt Raw robots.txt string.
      1000     * @return list<array{agents:list<string>,rules:list<mixed>}> List of groups, each with `agents` and `rules`.
    10631001     */
    10641002    private function parse_robots_groups(string $txt): array {
     
    11051043
    11061044    /**
    1107      * Retrieves the raw `robots.txt` content for the current site.
    1108      *
    1109      * Why:  We avoid an HTTP request to the public `robots.txt`.
    1110      *       WordPress can generate it via `do_robots()` or the file can
    1111      *       be read directly.  The function falls back to the filter
    1112      *       `robots_txt` if no file or `do_robots()` output is
    1113      *       available.
    1114      *
    1115      * @return string|null Raw robots.txt body or null if unavailable.
     1045     * Retrieves the active robots.txt for the current site.
     1046     *
     1047     * The method respects the order imposed by the WordPress core:
     1048     *   1. Physical `robots.txt` file
     1049     *   2. `wp_robots()` (WP 5.7+)
     1050     *   3. `do_robots` action
     1051     *   4. `robots_txt` filter
     1052     *
     1053     * @return string|null The robots.txt contents, or `null` if none could be generated.
    11161054     */
    11171055    private function get_local_robots_txt() {
     1056        // 1. Physical robots.txt file takes precedence
    11181057        $file = ABSPATH . 'robots.txt';
    11191058        if ( @is_readable( $file ) ) {
    11201059            $body = @file_get_contents( $file );
    1121             if ( is_string( $body ) && '' !== $body ) return $body;
    1122         }
     1060            if ( is_string( $body ) && '' !== trim( $body ) ) {
     1061                return $body;
     1062            }
     1063        }
     1064
     1065        // 2. Prefer wp_robots() when available (WP 5.7+)
     1066        if ( function_exists( 'wp_robots' ) ) {
     1067            ob_start();
     1068            wp_robots();
     1069            $out = ob_get_clean();
     1070            if ( is_string( $out ) && '' !== trim( $out ) ) {
     1071                return $out;
     1072            }
     1073        }
     1074
     1075        // 3. Fallback to core do_robots action
     1076        // do_robots is a WordPress core action used to generate robots.txt output.
    11231077        ob_start();
     1078        // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound
    11241079        do_action( 'do_robots' );
    11251080        $out = ob_get_clean();
    1126         if ( is_string( $out ) && '' !== trim( $out ) ) return $out;
    1127 
     1081        if ( is_string( $out ) && '' !== trim( $out ) ) {
     1082            return $out;
     1083        }
     1084
     1085        // 4. Final fallback via robots_txt filter
     1086        // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound
    11281087        $output = apply_filters( 'robots_txt', '', get_option( 'blog_public' ) );
    1129         if ( is_string( $output ) && '' !== trim( $output ) ) return $output;
     1088        if ( is_string( $output ) && '' !== trim( $output ) ) {
     1089            return $output;
     1090        }
    11301091
    11311092        return null;
     
    11331094
    11341095    /**
    1135      * Returns true if the robots.txt rules permit the bot to access the path.
    1136      *
    1137      * Why:  The function implements a simplified rule engine that:
    1138      *   1. Chooses the best User‑Agent group for the bot (exact match
    1139      *      or wildcard).
    1140      *   2. Normalises the path and checks it against each rule in the
    1141      *      selected group.
    1142      *   3. Uses longest‑path‑wins; disallow wins only if it
    1143      *      matches a longer path than any allow.  If no rule matches,
    1144      *      the URL is allowed by default.
    1145      *
    1146      * @param string $robots_txt Raw robots.txt text.
    1147      * @param string $agent_key  Internal bot key.
    1148      * @param string $path       Requested path.
    1149      * @return bool True if the path is allowed for the bot.
     1096     * Determines whether the given bot is allowed to fetch $path based on the active robots.txt.
     1097     *
     1098     * Implements the standard longest‑prefix‑wins rule for a single bot’s group.  The method
     1099     * returns `true` for ``allow`` or for paths that match no rule (default allow).
     1100     *
     1101     * @param string $robots_txt The retrieved robots.txt contents.
     1102     * @param string $agent_key The internal identifier for the bot.
     1103     * @param string $path The requested URL path (starting with “/”).
     1104     * @return bool True when crawling is permitted, false when disallowed.
    11501105     */
    11511106    private function robots_txt_allows(string $robots_txt, string $agent_key, string $path): bool {
     
    11751130
    11761131    /**
    1177      * Public wrapper that provides a “state” + “reason” array describing
    1178      * whether the bot is allowed to crawl the path, and populates
    1179      * diagnostics output for the UI.
    1180      *
    1181      * Why:  The admin pages display a status badge; this method produces
    1182      *       the data the badge expects and also records diagnostic
    1183      *       information (matched group & rule) that can be shown in
    1184      *       an expandable details box.
    1185      *
    1186      * @param string $agent_key Bot key.
    1187      * @param string $path      Requested path.
    1188      * @param array  &$diag     Output diagnostics (group, rule).
    1189      * @return array (`state` => 'allowed'|'blocked'|'unknown',  `reason` => string).
     1132     * Public API that reports the allow/blocked status for a bot and a specific path.
     1133     *
     1134     * The method first checks whether the site globally discourages crawlers (`blog_public=0`).
     1135     * If not, it loads the robots.txt, selects the appropriate group, and evaluates
     1136     * the path.  A detailed diagnostic array (group name, rule used) is returned
     1137     * alongside the status.  When a global block applies, the status is `blocked`
     1138     * with a specific reason.
     1139     *
     1140     * @param string $agent_key The bot’s internal key (e.g. 'googlebot_desktop').
     1141     * @param string $path Path fragment (starting with “/”) to test.
     1142     * @param array|null $diag Optional reference parameter that receives diagnostics about
     1143     *                         the matching group and rule; can be omitted if not needed.
      1144     * @return array{state:string,reason:string} Possible `state` values: `allowed`, `blocked`, `unknown`.
    11901145     */
    11911146    public function robots_status_for_agent_path( $agent_key, $path, &$diag = null ) {
     
    12101165
    12111166    /**
    1212      * Renders a status badge and an expandable details section.
    1213      *
    1214      * Why:  The status badge (green check, red X, gray) gives a quick visual
    1215      *       cue.  The details expand on hover/click and show the exact
    1216      *       robots.txt group and rule that determined the decision.
    1217      *
    1218      * @param array $status Associative array returned by
    1219      *                      `robots_status_for_agent_path`.
    1220      * @param array $diag   Diagnostics array (group & rule).
    1221      * @return string HTML safe snippet for the badge + details.
     1167     * Renders a colour‑coded badge indicating the robots.txt status and a collapsible
     1168     * details section that shows which rule was matched.
     1169     *
     1170     * The badge colors are:
     1171     *   * Green with a checkmark for “allowed”
     1172     *   * Red with a cross for “blocked”
     1173     *   * Grey for “unknown”
     1174     *
     1175     * @param array{state:string,reason:string} $status Returned from {@see robots_status_for_agent_path()}.
     1176     * @param array{group:string,rule:string} $diag Diagnostics array for detail output.
     1177     * @return string Safe HTML for the badge + details element.
    12221178     */
    12231179    private function render_status_badge_expandable( $status, $diag ) {
     
    12441200
    12451201    /**
    1246      * Formats a timestamp in “human‑readable” form and a precise
    1247      * full‑timestamp (with micro‑seconds) for display.
    1248      *
    1249      * Why:  Users want to see “3 days ago (2025‑08‑12 15:04:23.123456)”.
    1250      *       This helper keeps the code in the main rendering loop
    1251      *       terse and centralises the formatting logic.
    1252      *
    1253      * @param float $ts Timestamp from the database.
    1254      * @return string HTML safe representation.
     1202     * Formats a UNIX timestamp (with microseconds) into a human‑readable “time‑ago” string
     1203     * plus the exact date/time in the site’s configured timezone.
     1204     *
     1205     * The output is safe for HTML rendering and is not localized beyond what
     1206     * WordPress’ `human_time_diff()` and `wp_date()` provide.
     1207     *
     1208     * @param float $ts Timestamp value returned by `microtime(true)`.
     1209     * @return string The formatted cell content; if $ts is falsy, “Not Yet” is returned.
    12551210     */
    12561211    private function format_last_seen_cell( $ts ) {
     
    12701225
    12711226    /**
    1272      * Formats the string shown in the admin bar for each bot.
    1273      *
    1274      * Why:  The toolbar entry should be compact (label + timestamp)
    1275      *       but still show the exact time.  This helper keeps the
    1276      *       formatting consistent between the toolbar and the
    1277      *       meta‑box.
    1278      *
    1279      * @param string $label Label for the bot.
    1280      * @param float  $ts    Timestamp (may be 0).
    1281      * @param string $suffix Optional small label (e.g., “today”).
    1282      * @return string Safe html string.
     1227     * Builds the display string used in admin‑bar nodes.
     1228     *
     1229     * It shows the label, the relative “time‑ago” string, and the absolute timestamp
     1230     * (with microseconds).  If no timestamp is available, “Not Yet” is used.
     1231     *
     1232     * @param string $label Human‑readable name of the agent.
     1233     * @param float $ts Timestamp value or 0 for “Not Yet”.
     1234     * @param string $suffix Optional suffix string to append after the label.
     1235     * @return string The formatted admin‑bar line.
    12831236     */
    12841237    private function format_admin_bar_line( $label, $ts, $suffix = '' ) {
     
    12971250
    12981251    /**
    1299      * Formats the “last page” cell that appears in the admin page.
    1300      *
    1301      * Why:  In the list view we only want the URL, not the post title.
    1302      *       The helper converts the stored data from `compute_agent_latest()`
    1303      *       into a link or an “–” if the data is missing.
    1304      *
    1305      * @param array $latest Associative array from `compute_agent_latest`.
    1306      * @return string HTML markup.
     1252     * Formats the “last page” cell of the admin dashboard table.
     1253     *
     1254     * Accepts either a post ID or a raw URL string.  The output is a clickable link
     1255     * that opens the page in a new tab.  Empty or missing values become a dash.
     1256     *
     1257     * @param array{ts:float,type:string,post_id:int,url:string,ua:string} $latest Data from {@see compute_agent_latest()}.
     1258     * @return string The safe HTML for the link or a dash.
    13071259     */
    13081260    private function format_context_cell_url_only( $latest ) {
     
    13221274
    13231275    /**
    1324      * Converts a UTC timestamp to the administrator’s timezone string.
    1325      *
    1326      * Why:  All timestamps are stored in UTC; showing them in the
    1327      *       local timezone (configured in Settings → General)
    1328      *       is far more user‑friendly.  The helper uses `wp_date()` if
    1329      *       available, else falls back to `date_i18n()`.
    1330      *
    1331      * @param int    $sec    Unix timestamp.
    1332      * @param string $format Optional format string (uses WordPress defaults if omitted).
    1333      * @return string Localised timestamp.
      1276     * Formats the given UNIX timestamp in the site's configured timezone, using the given mask.
      1277     *
      1278     * Wrapper around `wp_date()` (WordPress 5.3+) that falls back to
      1279     * `date_i18n()` on older cores.  The function takes a UNIX timestamp
     1280     * (seconds since the epoch) and a PHP date format string.
     1281     *
     1282     * @param int $sec Unix timestamp in seconds.
     1283     * @param string $format PHP date format mask.
     1284     * @return string Formatted date/time string.
    13341285     */
    13351286    private function format_site_tz( $sec, $format ) {
     
    13411292
    13421293    /**
    1343      * Computes the most recent known access time for a given bot.
    1344      *
    1345      * Why:  The admin page needs to show the “last seen” across the
    1346      *       entire site.  This method:
    1347      *   1. Fetches the cached “latest post” option and reads the
    1348      *      associated meta.  If that timestamp is newer, it is used.
    1349      *   2. Reads the cached “latest URL” option (created during recording)
    1350      *      which holds the exact URL and UA.  If that is newer than the
    1351      *      post timestamp, it wins.
    1352      *   3. Returns an associative array (`ts`, `type`, `post_id`, `url`, `ua`).
    1353      *
    1354      * @param string $key Bot key.
    1355      * @return array Latest visit info.
     1294     * Computes the most recent event for a given bot across the entire site.
     1295     *
     1296     * The algorithm inspects:
     1297     *   1. The post‑meta value of the last seen timestamp and associated UA for the last post that bot hit.
     1298     *   2. The `dzcr_latest_url_{$key}` option that records the most recently accessed URL.
     1299     *   3. Falls back to a default empty result if nothing is found.
     1300     *
     1301     * The function returns the candidate that has the greatest timestamp, together with
     1302     * the type (“post” or “url”) and details of the location.
     1303     *
     1304     * @param string $key Internal bot identifier.
     1305     * @return array{ts:float,type:string,post_id:int,url:string,ua:string} The best match.
    13561306     */
    13571307    private function compute_agent_latest( $key ) {
     
    13931343
    13941344    /**
    1395      * Adds “Settings” and “Documentation” links to the plugin’s row
    1396      * in the WordPress Plugins page.
    1397      *
    1398      * Why:  Site admins often look for quick access to the plugin’s
    1399      *       configuration.  Adding these links saves a few clicks.
    1400      *
    1401      * @param array $links Existing action links.
    1402      * @return array Updated array with new links.
     1345     * Adds “Settings” and “Documentation” links to the plugin’s row in the admin‑plugins page.
     1346     *
     1347     * The links open in the same window and in a new tab, respectively.
     1348     *
     1349     * @param array<string,string> $links Array of existing action links.
     1350     * @return array<string,string> The extended links array.
    14031351     */
    14041352    public function plugin_action_links( $links ) {
    1405         $settings_url = admin_url( 'admin.php?page=cls-crawler-record' );
     1353        $settings_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG );
    14061354        $docs_url     = 'https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/';
    14071355
  • crawler-record/tags/0.9.0/readme.txt

    r3400651 r3423919  
    33Tags: googlebot, bingbot, gptbot, seo, robots
    44Requires at least: 6.0
    5 Tested up to: 6.8
     5Tested up to: 6.9
    66Requires PHP: 7.4
    77Stable tag: 0.9.0
     
    5858
    5959== Changelog ==
     60= 0.9.0 =
     61* Now monitoring for Meta and Apple User Agents
     62* More accurate site-wide UA reporting.
     63* Ensured video tutorial appears on all admin screens.
     64* Fixed small code errors.
     65
    6066
    6167= 0.8.0 =
  • crawler-record/tags/0.9.0/uninstall.php

    r3400651 r3423919  
    6363
    6464// 2) Remove any known discrete options (if you introduced settings later).
    65 $discrete_options = array(
    66     'dzcr_settings',       // reserved for future settings array.
    67     'dzcr_agents_custom',  // reserved if you ever allow custom agent configs.
     65$dzcr_discrete_options = array(
     66    'dzcr_settings',
     67    'dzcr_agents_custom',
    6868);
    6969
    70 foreach ( $discrete_options as $opt ) {
    71     delete_option( $opt );
     70foreach ( $dzcr_discrete_options as $dzcr_opt ) {
     71    delete_option( $dzcr_opt );
    7272}
    7373
    7474// 3) Optional: purge post meta set by the plugin (disabled by default).
    7575// Enable by defining DZCR_PURGE_POSTMETA true in wp-config.php OR using the filter below.
    76 $purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )
     76$dzcr_purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )
    7777    || apply_filters( 'dzcr_uninstall_purge_postmeta', false );
    7878
    79 if ( $purge_postmeta ) {
     79if ( $dzcr_purge_postmeta ) {
    8080    // Delete meta keys written per post:
    8181    //  - _dzcr_last_seen_{agent}
  • crawler-record/trunk/crawler-record.php

    r3400643 r3423919  
    44 * Plugin URI:  https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/
    55 * Description: Are the most common search and chat (AI/LLM) bots able to access the pages on your website? Crawler Record can tell you the last time each of the most common search/chat bots visited -and which pages at which they looked.
    6  * Version:     0.8.0
     6 * Version:     0.9.0
    77 * Requires at least: 6.0
    8  * Tested up to: 6.8
     8 * Tested up to: 6.9
    99 * Requires PHP: 7.4
    1010 * Author:      dizzysoft
     
    2020}
    2121
    22 define( 'CRAWLER_RECORD_VERSION', '0.8.0' );
     22define( 'CRAWLER_RECORD_VERSION', '0.9.0' );
     23define( 'DZCR_ADMIN_SLUG', 'dzcr-crawler-record' );
    2324
    2425class Crawler_Record {
    2526
    26     /**
    27      * Holds a list of all bot groups and their UA patterns.
    28      *
    29      * Why:  The agent list is small, stable, and rarely changes; storing
    30      *       it in code removes the need for a custom database table
    31      *       and keeps migration simple.  The `apply_filters()` wrapper
    32      *       lets developers add or replace bots without touching this file.
    33      *
    34      * @var array
    35      */
     27    // https://www.searchenginejournal.com/ai-crawler-user-agents-list/558130/
    3628    private $dzcr_default_agent_groups = [
    3729        'Google' => [
     
    144136        ],
    145137
     138        'Meta' => [
     139            'doc'   => 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/',
     140            'items' => [
     141                'meta_externalagent' => [
     142                    'label'   => 'Meta-ExternalAgent',
     143                    'pattern' => '#meta-externalagent/\d+(?:\.\d+)?#i',
     144                ],
     145                'meta_webindexer' => [
     146                    'label'   => 'Meta-WebIndexer',
     147                    'pattern' => '#meta-webindexer/\d+(?:\.\d+)?#i',
     148                ],
     149            ],
     150        ],
     151
     152        'Apple' => [
     153            'doc'   => 'https://support.apple.com/en-us/119829',
     154            'items' => [
     155                'applebot' => [
     156                    'label'   => 'Applebot',
     157                    'pattern' => '#Applebot/\d+(?:\.\d+)?#i',
     158                ],
     159
     160                // Robots-only (does not crawl pages)
     161                'applebot_extended' => [
     162                    'label'   => 'Applebot-Extended (AI)',
     163                    'pattern' => '#(?!.)#',
     164                ],
     165            ],
     166        ],
     167
    146168        'DuckDuckGo' => [
    147169            'doc'   => 'https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot',
     
    169191    ];
    170192
    171 
    172     /**
    173      * Cache of the resolved, filtered agent list.
    174      *
    175      * Why:  Resolving the agent list once per request saves expensive work
    176      *       (filtering & validation).  Stored in memory for the duration of
    177      *       the request; no persistence needed.
    178      *
    179      * @var array|null
    180      */
    181193    private $dzcr_agents_effective_cache = null;
    182194
    183     /**
    184      * Cache of the flat list of UA patterns keyed by the agent key.
    185      *
    186      * Why:  Many look-ups need a simple key > regex map; building it once avoids
    187      *       iterating through the nested agent group structure each time.
    188      *
    189      * @var array|null
    190      */
    191195    private $dzcr_agents_flat_cache      = null;
    192196
    193     /**
    194      * Prefix used for per-post Meta keys that store the last timestamp a bot
    195      * visited the post.
    196      *
    197      * Why:  Namespacing the meta key prevents collisions with other plugins.
    198      *       The prefix is combined with the unique agent key.
    199      *
    200      * @var string
    201      */
    202197    private $dzcr_meta_prefix     = '_dzcr_last_seen_';
    203198
    204     /**
    205      * Prefix used for site-wide last-seen Meta keys (stored as options).
    206      *
    207      * Why:  Allows quick access to the most recent time *any* page was seen by
    208      *       each bot without scanning the entire database.
    209      *
    210      * @var string
    211      */
    212199    private $dzcr_site_prefix     = '_dzcr_site_last_';
    213200
    214     /**
    215      * Prefix used for per-URL last-seen keys (stored as options).
    216      *
    217      * Why:  URLs are not tied to a post, so options keep arbitrary
    218      *       key/value pairs without cluttering postmeta.
    219      *
    220      * @var string
    221      */
    222201    private $dzcr_url_prefix      = '_dzcr_url_last_';
    223202
    224     /**
    225      * Prefix used for the last-post-ID per bot (stored as options).
    226      *
    227      * Why:  This is a fast index: if the last post ID is known we can fetch
    228      *       its meta immediately, avoiding a full table scan.
    229      *
    230      * @var string
    231      */
    232203    private $dzcr_lastpost_prefix = '_dzcr_last_post_';
    233204
    234 
    235     /**
    236      * Class constructor.
    237      *
    238      * Why:  Hooks the core functionality into WordPress:
    239      *   - `template_redirect` records when a crawler requests a page.
    240      *   - `add_meta_boxes` adds the per-post "Crawler Record" box.
    241      *   - `admin_bar_menu` inserts a quick toolbar entry for logged-in users.
    242      *   - `admin_menu` registers the settings page in the admin sidebar.
    243      *   - `plugin_action_links_*` adds Settings/Documentation links on the
    244      *     Plugins screen.
    245      *
    246      * This is the entry point of the plugin – everything else is called
    247      * indirectly via these actions.
    248      */
    249205    public function __construct() {
    250206        add_action( 'template_redirect', [ $this, 'maybe_record_last_seen' ] );
     
    258214    public function enqueue_admin_assets() {
    259215        $screen = function_exists('get_current_screen') ? get_current_screen() : null;
    260         if ( ! $screen || $screen->id !== 'toplevel_page_crawler-record' ) {
     216        if ( ! $screen || $screen->id !== 'toplevel_page_' . DZCR_ADMIN_SLUG ) {
    261217            return; // only on the plugin admin page
    262218        }
     
    265221            'https://player.vimeo.com/api/player.js',
    266222            [],
    267             $ver,
     223            CRAWLER_RECORD_VERSION,
    268224            true
    269225        );
     
    271227    }
    272228
    273 
    274     /**
    275      * Retrieves the effective agent groups after filters, validation,
    276      * and custom ordering have been applied.
    277      *
    278      * Why:  The default list is processed once per request.  This method
    279      *       returns a validated array that can be cached for all other
    280      *       look‑ups (`get_agent_groups()`).  The validation step removes
    281      *       malformed entries and guarantees the presence of a `doc`
    282      *       URL and a `label` for each item.
    283      *
    284      * @return array The resolved, validated agent groups.
     229    /**
     230     * Lazily resolves the complete agent‑group tree, applying the `dzcr_agent_groups`
     231     * and `dzcr_agents_order` filters, and caches the result for the lifetime of the request.
     232     *
     233     * The returned array maps group names to an array containing a `doc` URL and a list of
     234     * individual bot definitions (`label` and `pattern`).  The data is filtered so that
     235     * malformed entries (missing patterns or labels) are removed automatically.
     236     *
     237     * @return array<string,array<string,mixed>> The fully‑validated and ordered agent groups.
    285238     */
    286239    private function get_agent_groups() {
     
    332285
    333286    /**
    334      * Returns a flattened map of bot keys to regex pattern strings.
    335      *
    336      * Why:  The recording logic (`maybe_record_last_seen`) needs a
    337      *       simple `foreach ($agent_key => $pattern)` loop.  Flattening
    338      *       removes the overhead of walking the nested group structure
    339      *       for every request.
    340      *
    341      * @return array Key → regex string.
     287     * Returns a flattened map of every bot key to its compiled regular‑expression pattern.
     288     *
     289     * The function iterates over the effective agent groups obtained via {@see get_agent_groups()}
     290     * and builds a key => pattern map.  The result is cached for the current request.
     291     *
     292     * @return array<string,string> Bot key → regex pattern.
    342293     */
    343294    private function get_agents_flat() {
     
    356307
    357308    /**
    358      * Looks up the human‑readable label for a bot key.
    359      *
    360      * Why:  The UI shows the label, not the internal key.  The helper
    361      *       centralises the lookup and provides a default in case the
    362      *       key is unknown.
    363      *
    364      * @param string $key Internal bot identifier.
    365      * @return string Human‑readable label.
     309     * Looks up a human‑readable label for a bot key in the current agent‑group configuration.
     310     *
     311     * If the key cannot be found, its raw value is returned as a fallback.  This helper is
     312     * used to keep UI code terse while still presenting friendly names to site editors.
     313     *
     314     * @param string $key The internal identifier for a bot (e.g. 'googlebot_desktop').
     315     * @return string The user‑friendly label.
    366316     */
    367317    private function get_label_for_key( $key ) {
     
    375325
    376326    /**
    377      * Builds the canonical URL of the current front‑end request.
    378      *
    379      * Why:  Recorded timestamps are stored per‑URL, so we need an exact,
    380      *       normalised URL.  The function normalises the scheme
    381      *       (`http/https`), host, path, and applies the
    382      *       `dzcr_normalize_url` filter (useful for stripping tracking
    383      *       parameters or converting to a canonical form).
    384      *
    385      * @return string Full URL (or empty string in admin/invalid state).
    386      */
    387     private function current_url() {
    388         if ( is_admin() ) {
     327     * Generates the canonical, normalized permalink for the queried singular post.
     328     *
     329     * The function guarantees the returned URL contains no query string or
     330     * fragment identifier, and that its path is non-empty.  The result is suitable for storage
     331     * in post meta and for robots‑txt matching.
     332     *
     333     * @param int $post_id By reference, receives the resolved post ID, or 0 when the request is not singular.
     334     * @return string The sanitized absolute URL, or an empty string when no query can be resolved.
     335     */
     336    private function current_url_for_post( &$post_id = 0 ) {
     337        if ( is_admin() || ! is_singular() ) {
    389338            return '';
    390339        }
    391         if ( ! isset( $_SERVER['HTTP_HOST'], $_SERVER['REQUEST_URI'] ) ) {
     340
     341        $post_id = (int) get_queried_object_id();
     342        if ( ! $post_id ) {
    392343            return '';
    393344        }
    394         $scheme = ( ! empty( $_SERVER['HTTPS'] ) && 'off' !== $_SERVER['HTTPS'] ) ? 'https' : 'http';
    395         $host   = sanitize_text_field( wp_unslash( $_SERVER['HTTP_HOST'] ) );
    396         $uri    = esc_url_raw( wp_unslash( $_SERVER['REQUEST_URI'] ) );
    397 
    398         if ( '' === $host ) {
     345
     346        $url = get_permalink( $post_id );
     347        if ( ! $url ) {
    399348            return '';
    400349        }
    401         $url = $scheme . '://' . $host . $uri;
    402         $url = apply_filters( 'dzcr_normalize_url', $url );
    403         if ( strlen( $url ) > 2048 ) {
    404             $url = substr( $url, 0, 2048 );
    405         }
    406         return $url;
    407     }
    408 
    409     /**
    410      * Records the last “seen” timestamp when a crawler requests a page.
    411      *
    412      * Why:  This method is the heart of the plugin.  It is called on
    413      *       `template_redirect` and performs the following actions:
    414      *
    415      *   1. Skips previews, feeds, REST, admin, AJAX, and cron requests.
    416      *   2. Only processes GET/HEAD requests, which are the standard
    417      *      HTTP verbs used by crawlers.
    418      *   3. Trims and sanitises the UA string (max 512 chars) and
    419      *      normalises it to avoid runaway regexes.
    420      *   4. Throttles writes using a 10‑minute window (configurable via
    421      *      `dzcr_throttle_window`) to avoid spamming the DB when the same
    422      *      bot visits the page repeatedly.
    423      *   5. Stores three pieces of information:
    424      *        - Per‑post meta (`_dzcr_last_seen_[bot]`).
    425      *        - Per‑URL options (`_dzcr_url_last_[bot]_[hash]`) plus companion
    426      *          `_url_` and `_ua_` options.
    427      *        - Site‑wide option (`_dzcr_site_last_[bot]`).
    428      *
    429      * The method also updates the “last UA string” meta/option so the
    430      * UI can show exactly what the bot sent.
    431      *
     350
     351        // Normalize: strip query string and fragment
     352        $parts = wp_parse_url( $url );
     353        if ( empty( $parts['host'] ) || empty( $parts['path'] ) ) {
     354            return '';
     355        }
     356
     357        $scheme = $parts['scheme'] ?? 'https';
     358        return $scheme . '://' . $parts['host'] . $parts['path'];
     359    }
     360
     361    /**
     362     * Core routine that records bot visit timestamps when the current request matches a known crawler.
     363     *
     364     * The method is executed early in the front‑end rendering cycle (`template_redirect`).
     365     * 1. It exits immediately for admin screens, REST requests, feeds, `HEAD` requests, or
     366     *    when the `User‑Agent` header is missing or too long.
     367     * 2. The UA string is matched against the compiled regex list.  Only the first matching
     368     *    rule is considered.
     369     * 3. A per‑post meta key and a global option are updated if the throttle window
     370     *    (10 minutes by default) allows it.  The method also records the exact UA string
     371     *    that triggered the update so editors can inspect it later.
     372     *
     373     *
    432374     * @return void
    433376     */
    434377    public function maybe_record_last_seen() {
     
    445388
    446389        $now     = microtime( true );
    447         $post_id = is_singular() ? (int) get_queried_object_id() : 0;
    448         $url     = $this->current_url();
    449         $urlhash = $url ? md5( $url ) : '';
     390
     391        $post_id = 0;
     392        $url     = $this->current_url_for_post( $post_id );
    450393
    451394        $throttle = (int) apply_filters( 'dzcr_throttle_window', 10 * MINUTE_IN_SECONDS );
     
    470413            }
    471414
    472             // Per URL
    473             if ( $urlhash ) {
    474                 $opt_key     = $this->dzcr_url_prefix . $key . '_' . $urlhash;
    475                 $opt_url_key = $this->dzcr_url_prefix . $key . '_url_' . $urlhash;
    476                 $opt_ua_key  = $this->dzcr_url_prefix . $key . '_ua_'  . $urlhash; // store last UA for this URL+agent
    477                 $t_url       = 'dzcr_seen_url_' . $key . '_' . $urlhash;
    478 
    479                 if ( ! $throttle || ! get_transient( $t_url ) ) {
    480                     $prev = (float) get_option( $opt_key, 0 );
    481                     if ( $now > $prev ) {
    482                         update_option( $opt_key, (string) $now, false );
    483                         update_option( $opt_url_key, $url, false );
    484                         update_option( $opt_ua_key,  $ua,  false );
    485                     }
    486                     if ( $throttle ) {
    487                         set_transient( $t_url, 1, $throttle );
    488                     }
     415            // Site-wide latest real post/page
     416            if ( $post_id && $url ) {
     417                $latest_key = 'dzcr_latest_url_' . $key;
     418                $prev       = get_option( $latest_key, [] );
     419                $prev_ts    = isset( $prev['ts'] ) ? (float) $prev['ts'] : 0.0;
     420
     421                if ( $now > $prev_ts ) {
     422                    update_option(
     423                        $latest_key,
     424                        [
     425                            'ts'  => (string) $now,
     426                            'url' => $url,
     427                            'ua'  => $ua,
     428                        ],
     429                        false
     430                    );
    489431                }
    490432            }
     
    501443
    502444    /**
    503      * Safely matches a UA string against a stored pattern or literal value.
    504      *
    505      * Why:  Direct use of `preg_match()` on untrusted user‑agent strings
    506      *       can trigger PHP warnings/errors (e.g. malformed regex).
    507      *       This wrapper:
    508      *   - Ensures the pattern begins with a `#` and ends with `#i`.
    509      *   - Suppresses `preg_match()` errors with the `@` operator.
    510      *   - Validates that the regex is syntactically correct.
    511      *   - Fallbacks to literal `hash_equals()` comparison when a plain string
    512      *     is supplied.
    513      *
    514      * @param string $ua             The UA string from the request.
    515      * @param string $pattern_or_exact  Either a regex string or a literal.
    516      * @return bool True if the UA matches the pattern or string.
     445     * Safely tests whether a given User‑Agent string matches a pattern.
     446     *
     447     * Handles both plain strings (exact comparison) and regular‑expressions by inspecting
     448     * the first character.  The function avoids the costly `preg_match` errors through
     449     * the `@` error‑control operator and checks the regex error code.
     450     *
     451     * @param string $ua The User‑Agent string to test.
     452     * @param string $pattern_or_exact Either a regex (enclosed in `#...#i`) or a literal string.
     453     * @return bool True if the UA satisfies the pattern; otherwise false.
    517454     */
    518455    private function ua_matches_safe( $ua, $pattern_or_exact ) {
     
    542479
    543480    /**
    544      * Registers the “Crawler Record” meta‑box that appears on the post
    545      * editing screen.
    546      *
    547      * Why:  Post editors need instant insight into which crawlers have
    548      *       visited the post and their access status.  The meta‑box
    549      *       appears in the normal “normal” context with high priority,
    550      *       making it highly visible without adding extra menu items.
     481     * Registers the “Crawler Record” meta‑box on the post editing screen.
     482     *
     483     * The meta‑box is added to the *Advanced* context so that it does not clutter the editor
     484     * for non‑technical users.  It is only visible to capable editors (users with
     485     * `edit_post` capability).
     486     *
    551487     */
    552488    public function register_meta_box() {
     
    562498
    563499    /**
    564      * Renders the per‑post “Crawler Record” meta‑box.
    565      *
    566      * Why:  The box shows a table of all bots, each with:
    567      *   - A human‑readable label.
    568      *   - The last time the bot accessed the post (`Last Seen`).
    569      *   - A status badge that tells you whether the URL is allowed by
    570      *     `robots.txt`.
    571      *   - A collapsible section that reveals the exact UA string
    572      *     the bot used when it visited.
    573      *
    574      * The function also prints a warning if the site is globally
    575      * blocking all crawlers (WordPress “Discourage search‑engine” setting).
    576      *
    577      * @param WP_Post $post Current post object.
     500     * Renders the contents of the “Crawler Record” meta‑box.
     501     *
     502     * Generates a table that lists every tracked agent, its last‑seen timestamp, and a
     503     * robots‑txt allow/blocked status.  The UI includes a collapsible `details` element
     504     * that shows the exact User‑Agent string used for the most recent hit.
     505     *
     506     * @param WP_Post $post The post object currently being edited.
    578507     * @return void
    579508     */
     
    618547                'diag'   => $diag,
    619548                'ua'     => $last_ua,
    620                 'na'     => ( 'google_extended' === $key ), // treat Google-Extended as non-crawler
     549                'na' => $this->is_robots_only_agent( $key ),
    621550            ];
    622551        }
     
    662591
    663592    /**
    664      * Adds a “Crawler Record” entry to the front‑end admin bar.
    665      *
    666      * Why:  Site admins often browse the front‑end while logged in.
    667      *       The toolbar provides a one‑click shortcut to the
    668      *       detailed bot‑status page and shows a quick list of
    669      *       timestamps for each bot, whether looking at a single page,
    670      *       a specific URL, or the entire site.
    671      *
    672      * @param WP_Admin_Bar $wp_admin_bar Admin‑bar object.
     593     * Adds a “Crawler Record” node to the front‑end admin bar with context‑aware children.
     594     *
     595     * The node is only shown to logged‑in users viewing the front‑end (not the admin).  Depending
     596     * on whether the current page is singular or generic, the children list will display
     597     * per‑post or site‑wide last‑seen data for each crawler.
     598     *
     599     * @param WP_Admin_Bar $wp_admin_bar The admin bar instance.
    673600     * @return void
    674601     */
     
    679606        }
    680607
    681         $admin_page_url = admin_url( 'admin.php?page=cls-crawler-record' );
     608        $admin_page_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG );
    682609
    683610        // Parent link
     
    700627
    701628                foreach ( $wrap['items'] as $key => $_def ) {
    702                     if ( 'google_extended' === $key ) {
    703                         continue; // do not show Google-Extended in admin bar
     629                    if ( $this->is_robots_only_agent( $key ) ) {
     630                        continue;
    704631                    }
     632
    705633                    $label = $this->get_label_for_key( $key );
    706634
     
    736664
    737665    /**
    738      * Registers the plugin’s top‑level admin page.
    739      *
    740      * Why:  The settings page is a convenient place to:
    741      *   - See all bots and their global timestamps.
    742      *   - Inspect the last page each bot has seen.
    743      *   - View robots.txt diagnostics for each bot.
     666     * Registers the top-level “Crawler Record” admin menu page.
     667     *
     668     * The page is registered as its own top-level menu entry (with a Dashicon),
     669     * which matches the `toplevel_page_` screen ID checked when enqueueing assets.
    744670     *
    745671     * @return void
     
    750676            __( 'Crawler Record', 'crawler-record' ),
    751677            'manage_options',
    752             'cls-crawler-record',
     678            DZCR_ADMIN_SLUG,
    753679            [ $this, 'render_admin_page' ],
    754680            'dashicons-search',
     
    758684
    759685    /**
    760      * Renders the entire admin page.
    761      *
    762      * Why:  The page shows a per‑bot table containing:
    763      *   - The bot’s human label.
    764      *   - The most recent global timestamp.
    765      *   - The link to the most recent page the bot visited.
    766      *   - A status badge indicating robots.txt permissibility.
    767      *
    768      * The table also has a useful “Important” notice if
    769      * WordPress is blocking all crawlers.
     686     * Renders the public dashboard for all tracked agents.
     687     *
     688     * The page lists each bot’s last‑seen timestamp, the last URL it visited, and the
     689     * result of a robots‑txt evaluation for the site’s home path.  A small embedded video
     690     * tutorial is shown in the right‑hand column.
    770691     *
    771692     * @return void
     
    847768                    : $this->robots_status_for_agent_path( $key, $home_path, $diag );
    848769
    849                 $show_last_fields_as_na = ( 'google_extended' === $key );
     770                $show_last_fields_as_na = $this->is_robots_only_agent( $key );
    850771
    851772                // Agent cell (expandable UA string unless NA)
     
    879800
    880801    /**
    881      * Checks if the site is configured to “Discourage search engines”.
    882      *
    883      * Why:  If WordPress’s global setting is enabled, all crawlers are
    884      *       blocked regardless of `robots.txt`.  The function lets us
    885      *       quickly short‑circuit further checks.
    886      *
    887      * @return bool True if the site is discouraging crawlers.
     802     * Determines if WordPress is currently discouraging search engines (`blog_public` = 0).
     803     *
     804     * The helper is used to short‑circuit logic that would otherwise record activity,
     805     * and to display a warning banner on admin screens.
     806     *
     807     * @return bool True when site‑wide crawling is disabled.
    888808     */
    889809    private function is_site_discouraged() {
     
    892812
    893813    /**
    894      * Returns a status array for the “blocked by WordPress” case.
    895      *
    896      * Why:  Used when `is_site_discouraged()` is true; the admin UI
    897      *       needs a consistent structure (`state` & `reason`) for
    898      *       rendering the status badge.
    899      *
    900      * @return array `state` (blocked) and a human‑readable reason.
     814     * Builds a standard “blocked by WordPress setting” status array.
     815     *
     816     * Returned from {@see robots_status_for_agent_path()} when the site is
     817     * configured to discourage search engines.
     818     *
     819     * @return array{state:string,reason:string} Blocking information.
    901820     */
    902821    private function forced_block_status() {
     
    908827
    909828    /**
    910      * Maps an internal bot key to the list of UA strings that the bot
    911      * uses (used to choose the “most representative” UA for robots.txt
    912      * checks).
    913      *
    914      * Why:  Different bots have distinct patterns for desktop/mobile,
    915      *       legacy, and AI‑bot forms.  This helper returns the set of
    916      *       tokens that must be matched against the `robots.txt` filter
    917      *       logic.
    918      *
    919      * @param string $key Internal bot key.
    920      * @return array List of tokens.
     829     * Maps an internal bot key to the canonical User‑Agent string(s) that can appear
     830     * in the site’s robots.txt.
     831     *
     832     * The mapping is used for robots‑txt group selection.  When a key maps to
     833     * multiple tokens it is returned as an array of strings; otherwise a single‑element array.
     834     *
     835     * @param string $key The internal identifier for a bot.
     836     * @return array<string> The list of UA strings to check in robots.txt.
    921837     */
    922838    private function robots_tokens_for_key(string $key): array {
     
    953869            case 'perplexity_user':      return ['Perplexity-User']; // normalize Unicode hyphens
    954870
     871            // Meta
     872            case 'meta_externalagent':
     873                return ['meta-externalagent'];
     874            case 'meta_webindexer':
     875                return ['Meta-WebIndexer'];
     876
     877            // Apple
     878            case 'applebot':
     879                return ['Applebot'];
     880            case 'applebot_extended':
     881                return ['Applebot-Extended'];
     882
     883
    955884            // DuckDuckGo
    956885            case 'duckduckgo_search':    return ['DuckDuckBot'];
     
    964893
    965894    /**
    966      * Chooses the appropriate `User‑Agent` group in robots.txt for
    967      * a given bot.
    968      *
    969      * Why:  A robot may have multiple `User‑Agent` lines (desktop/mobile).
    970      *       The longest exact match wins; if none match we fallback to
    971      *       the wildcard group (`*`).  This function returns the entire
    972      *       group structure (agents + rules) for that bot.
    973      *
    974      * @param string $robots_txt Raw text of robots.txt.
    975      * @param array  $tokens     List of tokens that identify the bot.
    976      * @return array Group with `agents` and `rules` keys.
     895     * Returns true for bots that only check robots.txt and never fetch pages.
     896     *
     897     * For these bots the per‑post “last‑seen” columns are marked “N/A” because it is
     898     * impossible to record an actual page visit.
     899     *
     900     * @param string $key The internal bot identifier.
     901     * @return bool True when the bot is robots‑only.
     902     */
     903    private function is_robots_only_agent( string $key ): bool {
     904        return in_array(
     905            $key,
     906            [
     907                'google_extended',
     908                'applebot_extended',
     909            ],
     910            true
     911        );
     912    }
     913
     914    /**
     915     * Chooses the most specific robots.txt group that matches the requested bot tokens.
     916     *
     917     * The grouping algorithm follows the order of rules in the file: a group that
     918     * lists a specific User‑Agent token and the longest matching rule wins.
     919     * If no group matches, the first wildcard `*` group is returned (if present).
     920     *
     921     * @param string $robots_txt Raw robots.txt contents.
     922     * @param array<string> $tokens List of canonical UA strings for the bot.
     923     * @return array{agents:list<string>,rules:list<mixed>} The selected group’s agents and rules.
    977924     */
    978925    private function select_robots_group(string $robots_txt, array $tokens) {
     
    1003950
    1004951    /**
    1005      * Checks whether a path starts with the rule prefix.
    1006      *
    1007      * Why:  `robots.txt` rules are prefix based; e.g. `Disallow: /wp-admin`
    1008      *       blocks everything under `/wp-admin`.  A simple `strncmp()`
    1009      *       suffices because we already normalised the rule to start with
    1010      *       a slash and have removed any trailing `$` terminator.
    1011      *
    1012      * @param string $path Path from the request.
    1013      * @param string $rule Rule from robots.txt.
    1014      * @return bool True if the path matches the rule.
     952     * Tests whether the supplied robots rule path is a strict prefix of $path.
     953     *
     954     * The helper is used by {@see robots_txt_allows()} when applying the longest‑prefix
     955     * rule logic.
     956     *
     957     * @param string $path The request path.
     958     * @param string $rule The rule path defined in robots.txt.
     959     * @return bool True when $rule is a prefix of $path.
    1015960     */
    1016961    private function path_prefix_match(string $path, string $rule): bool {
     
    1019964
    1020965    /**
    1021      * Normalises hyphen characters in a string; replaces various Unicode
    1022      * hyphens with the ASCII hyphen.
    1023      *
    1024      * Why:  Some bots (e.g., Perplexity) use non‑standard hyphens in
    1025      *       their UA strings.  Normalising them guarantees that
    1026      *       string‑based matching (e.g. `strcasecmp()`) works
    1027      *       regardless of the particular hyphen glyph.
     966     * Normalises a URL path so that an empty string becomes “/”.
     967     *
     968     * An empty path can cause subtle bugs in robots‑txt evaluation and is treated
     969     * as the root path by WordPress.
     970     *
     971     * @param string $p The original path.
     972     * @return string Normalised path.
     973     */
     974    private function normalize_path(string $p): string {
     975        return ($p === '') ? '/' : $p;
     976    }
     977
     978    /**
     979     * Replaces every form of Unicode hyphen (–, ‑, -, …) with a simple ASCII hyphen.
     980     *
     981     * The rule set is chosen to avoid false mismatches when a site’s robots.txt
     982     * contains variant hyphens in its UA tokens.
    1028983     *
    1029984     * @param string $s Input string.
    1030985     * @return string Normalised string.
    1031986     */
    1032     private function normalize_path(string $p): string {
    1033         return ($p === '') ? '/' : $p;
    1034     }
    1035 
    1036     /**
    1037      * Normalises hyphen characters in a string; replaces various Unicode
    1038      * hyphens with the ASCII hyphen.
    1039      *
    1040      * Why:  Some bots (e.g., Perplexity) use non‑standard hyphens in
    1041      *       their UA strings.  Normalising them guarantees that
    1042      *       string‑based matching (e.g. `strcasecmp()`) works
    1043      *       regardless of the particular hyphen glyph.
    1044      *
    1045      * @param string $s Input string.
    1046      * @return string Normalised string.
    1047      */
    1048987    private function normalize_hyphens(string $s): string {
    1049988        return preg_replace('/[\x{2010}-\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}]/u','-',$s);
     
    1051990
    1052991    /**
    1053      * Parses a raw `robots.txt` string into an array of groups.
    1054      *
    1055      * Why:  The plugin re‑implements the minimal parsing logic needed
    1056      *       for our use‑case (User‑Agent, Allow, Disallow).  The function
    1057      *       ignores other directives (Sitemap, Crawl‑delay) and keeps
    1058      *       groups and rules in the order they appear, which is essential
    1059      *       for the longest‑path‑wins logic.
    1060      *
    1061      * @param string $txt Raw content of robots.txt.
    1062      * @return array Array of groups, each with `agents` and `rules`.
     992     * Parses the raw robots.txt into an array of agent‑group objects.
     993     *
     994     * The parser emits structures of the form:
     995     *   [{'agents'=>[...], 'rules'=>[['allow'|'disallow', '/path'], ...]}, ...]
     996     *  * Non‑essential lines (`Sitemap:`, `Crawl‑delay:`, comments) are ignored.
     997     *  * Blank lines reset state, but a group is only flushed when a directive is encountered.
     998     *
     999     * @param string $txt Raw robots.txt string.
     1000     * @return list<array{agents:list<string>,rules:list<mixed>}> List of groups, each with `agents` and `rules`.
    10631001     */
    10641002    private function parse_robots_groups(string $txt): array {
     
    11051043
    11061044    /**
    1107      * Retrieves the raw `robots.txt` content for the current site.
    1108      *
    1109      * Why:  We avoid an HTTP request to the public `robots.txt`.
    1110      *       WordPress can generate it via `do_robots()` or the file can
    1111      *       be read directly.  The function falls back to the filter
    1112      *       `robots_txt` if no file or `do_robots()` output is
    1113      *       available.
    1114      *
    1115      * @return string|null Raw robots.txt body or null if unavailable.
     1045     * Retrieves the active robots.txt for the current site.
     1046     *
     1047     * The method respects the order imposed by the WordPress core:
     1048     *   1. Physical `robots.txt` file
     1049     *   2. `wp_robots()` (WP 5.7+)
     1050     *   3. `do_robots` action
     1051     *   4. `robots_txt` filter
     1052     *
     1053     * @return string|null The robots.txt contents, or `null` if none could be generated.
    11161054     */
    11171055    private function get_local_robots_txt() {
     1056        // 1. Physical robots.txt file takes precedence
    11181057        $file = ABSPATH . 'robots.txt';
    11191058        if ( @is_readable( $file ) ) {
    11201059            $body = @file_get_contents( $file );
    1121             if ( is_string( $body ) && '' !== $body ) return $body;
    1122         }
     1060            if ( is_string( $body ) && '' !== trim( $body ) ) {
     1061                return $body;
     1062            }
     1063        }
     1064
     1065        // 2. Prefer wp_robots() when available (WP 5.7+)
     1066        if ( function_exists( 'wp_robots' ) ) {
     1067            ob_start();
     1068            wp_robots();
     1069            $out = ob_get_clean();
     1070            if ( is_string( $out ) && '' !== trim( $out ) ) {
     1071                return $out;
     1072            }
     1073        }
     1074
     1075        // 3. Fallback to core do_robots action
     1076        // do_robots is a WordPress core action used to generate robots.txt output.
    11231077        ob_start();
     1078        // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound
    11241079        do_action( 'do_robots' );
    11251080        $out = ob_get_clean();
    1126         if ( is_string( $out ) && '' !== trim( $out ) ) return $out;
    1127 
     1081        if ( is_string( $out ) && '' !== trim( $out ) ) {
     1082            return $out;
     1083        }
     1084
     1085        // 4. Final fallback via robots_txt filter
     1086        // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound
    11281087        $output = apply_filters( 'robots_txt', '', get_option( 'blog_public' ) );
    1129         if ( is_string( $output ) && '' !== trim( $output ) ) return $output;
     1088        if ( is_string( $output ) && '' !== trim( $output ) ) {
     1089            return $output;
     1090        }
    11301091
    11311092        return null;
     
    11331094
    11341095    /**
    1135      * Returns true if the robots.txt rules permit the bot to access the path.
    1136      *
    1137      * Why:  The function implements a simplified rule engine that:
    1138      *   1. Chooses the best User‑Agent group for the bot (exact match
    1139      *      or wildcard).
    1140      *   2. Normalises the path and checks it against each rule in the
    1141      *      selected group.
    1142      *   3. Uses longest‑path‑wins; disallow wins only if it
    1143      *      matches a longer path than any allow.  If no rule matches,
    1144      *      the URL is allowed by default.
    1145      *
    1146      * @param string $robots_txt Raw robots.txt text.
    1147      * @param string $agent_key  Internal bot key.
    1148      * @param string $path       Requested path.
    1149      * @return bool True if the path is allowed for the bot.
     1096     * Determines whether the given bot is allowed to fetch $path based on the active robots.txt.
     1097     *
     1098     * Implements the standard longest‑prefix‑wins rule for a single bot’s group.  The method
      1099     * returns `true` for `allow` or for paths that match no rule (default allow).
     1100     *
     1101     * @param string $robots_txt The retrieved robots.txt contents.
     1102     * @param string $agent_key The internal identifier for the bot.
     1103     * @param string $path The requested URL path (starting with “/”).
     1104     * @return bool True when crawling is permitted, false when disallowed.
    11501105     */
    11511106    private function robots_txt_allows(string $robots_txt, string $agent_key, string $path): bool {
     
    11751130
    11761131    /**
    1177      * Public wrapper that provides a “state” + “reason” array describing
    1178      * whether the bot is allowed to crawl the path, and populates
    1179      * diagnostics output for the UI.
    1180      *
    1181      * Why:  The admin pages display a status badge; this method produces
    1182      *       the data the badge expects and also records diagnostic
    1183      *       information (matched group & rule) that can be shown in
    1184      *       an expandable details box.
    1185      *
    1186      * @param string $agent_key Bot key.
    1187      * @param string $path      Requested path.
    1188      * @param array  &$diag     Output diagnostics (group, rule).
    1189      * @return array (`state` => 'allowed'|'blocked'|'unknown',  `reason` => string).
     1132     * Public API that reports the allow/blocked status for a bot and a specific path.
     1133     *
     1134     * The method first checks whether the site globally discourages crawlers (`blog_public=0`).
     1135     * If not, it loads the robots.txt, selects the appropriate group, and evaluates
     1136     * the path.  A detailed diagnostic array (group name, rule used) is returned
     1137     * alongside the status.  When a global block applies, the status is `blocked`
     1138     * with a specific reason.
     1139     *
     1140     * @param string $agent_key The bot’s internal key (e.g. 'googlebot_desktop').
     1141     * @param string $path Path fragment (starting with “/”) to test.
     1142     * @param array|null $diag Optional reference parameter that receives diagnostics about
     1143     *                         the matching group and rule; can be omitted if not needed.
      1144     * @return array{state:string,reason:string} Possible `state` values: `allowed`, `blocked`, `unknown`.
    11901145     */
    11911146    public function robots_status_for_agent_path( $agent_key, $path, &$diag = null ) {
     
    12101165
    12111166    /**
    1212      * Renders a status badge and an expandable details section.
    1213      *
    1214      * Why:  The status badge (green check, red X, gray) gives a quick visual
    1215      *       cue.  The details expand on hover/click and show the exact
    1216      *       robots.txt group and rule that determined the decision.
    1217      *
    1218      * @param array $status Associative array returned by
    1219      *                      `robots_status_for_agent_path`.
    1220      * @param array $diag   Diagnostics array (group & rule).
    1221      * @return string HTML safe snippet for the badge + details.
     1167     * Renders a colour‑coded badge indicating the robots.txt status and a collapsible
     1168     * details section that shows which rule was matched.
     1169     *
     1170     * The badge colors are:
     1171     *   * Green with a checkmark for “allowed”
     1172     *   * Red with a cross for “blocked”
     1173     *   * Grey for “unknown”
     1174     *
     1175     * @param array{state:string,reason:string} $status Returned from {@see robots_status_for_agent_path()}.
     1176     * @param array{group:string,rule:string} $diag Diagnostics array for detail output.
     1177     * @return string Safe HTML for the badge + details element.
    12221178     */
    12231179    private function render_status_badge_expandable( $status, $diag ) {
     
    12441200
    12451201    /**
    1246      * Formats a timestamp in “human‑readable” form and a precise
    1247      * full‑timestamp (with micro‑seconds) for display.
    1248      *
    1249      * Why:  Users want to see “3 days ago (2025‑08‑12 15:04:23.123456)”.
    1250      *       This helper keeps the code in the main rendering loop
    1251      *       terse and centralises the formatting logic.
    1252      *
    1253      * @param float $ts Timestamp from the database.
    1254      * @return string HTML safe representation.
     1202     * Formats a UNIX timestamp (with microseconds) into a human‑readable “time‑ago” string
     1203     * plus the exact date/time in the site’s configured timezone.
     1204     *
     1205     * The output is safe for HTML rendering and is not localized beyond what
     1206     * WordPress’ `human_time_diff()` and `wp_date()` provide.
     1207     *
     1208     * @param float $ts Timestamp value returned by `microtime(true)`.
     1209     * @return string The formatted cell content; if $ts is falsy, “Not Yet” is returned.
    12551210     */
    12561211    private function format_last_seen_cell( $ts ) {
     
    12701225
    12711226    /**
    1272      * Formats the string shown in the admin bar for each bot.
    1273      *
    1274      * Why:  The toolbar entry should be compact (label + timestamp)
    1275      *       but still show the exact time.  This helper keeps the
    1276      *       formatting consistent between the toolbar and the
    1277      *       meta‑box.
    1278      *
    1279      * @param string $label Label for the bot.
    1280      * @param float  $ts    Timestamp (may be 0).
    1281      * @param string $suffix Optional small label (e.g., “today”).
    1282      * @return string Safe html string.
     1227     * Builds the display string used in admin‑bar nodes.
     1228     *
     1229     * It shows the label, the relative “time‑ago” string, and the absolute timestamp
     1230     * (with microseconds).  If no timestamp is available, “Not Yet” is used.
     1231     *
     1232     * @param string $label Human‑readable name of the agent.
     1233     * @param float $ts Timestamp value or 0 for “Not Yet”.
     1234     * @param string $suffix Optional suffix string to append after the label.
     1235     * @return string The formatted admin‑bar line.
    12831236     */
    12841237    private function format_admin_bar_line( $label, $ts, $suffix = '' ) {
     
    12971250
    12981251    /**
    1299      * Formats the “last page” cell that appears in the admin page.
    1300      *
    1301      * Why:  In the list view we only want the URL, not the post title.
    1302      *       The helper converts the stored data from `compute_agent_latest()`
    1303      *       into a link or an “–” if the data is missing.
    1304      *
    1305      * @param array $latest Associative array from `compute_agent_latest`.
    1306      * @return string HTML markup.
     1252     * Formats the “last page” cell of the admin dashboard table.
     1253     *
      1254     * The $latest data may reference either a post ID or a raw URL.  The output is a clickable link
     1255     * that opens the page in a new tab.  Empty or missing values become a dash.
     1256     *
     1257     * @param array{ts:float,type:string,post_id:int,url:string,ua:string} $latest Data from {@see compute_agent_latest()}.
     1258     * @return string The safe HTML for the link or a dash.
    13071259     */
    13081260    private function format_context_cell_url_only( $latest ) {
     
    13221274
    13231275    /**
    1324      * Converts a UTC timestamp to the administrator’s timezone string.
    1325      *
    1326      * Why:  All timestamps are stored in UTC; showing them in the
    1327      *       local timezone (configured in Settings → General)
    1328      *       is far more user‑friendly.  The helper uses `wp_date()` if
    1329      *       available, else falls back to `date_i18n()`.
    1330      *
    1331      * @param int    $sec    Unix timestamp.
    1332      * @param string $format Optional format string (uses WordPress defaults if omitted).
    1333      * @return string Localised timestamp.
      1276     * Formats the given UNIX timestamp in the site’s configured timezone using the given mask.
     1277     *
      1278     * Wrapper around `wp_date()` (WordPress 5.3+) and falls back to
     1279     * `date_i18n()` on older cores.  The function takes a UNIX timestamp
     1280     * (seconds since the epoch) and a PHP date format string.
     1281     *
     1282     * @param int $sec Unix timestamp in seconds.
     1283     * @param string $format PHP date format mask.
     1284     * @return string Formatted date/time string.
    13341285     */
    13351286    private function format_site_tz( $sec, $format ) {
     
    13411292
    13421293    /**
    1343      * Computes the most recent known access time for a given bot.
    1344      *
    1345      * Why:  The admin page needs to show the “last seen” across the
    1346      *       entire site.  This method:
    1347      *   1. Fetches the cached “latest post” option and reads the
    1348      *      associated meta.  If that timestamp is newer, it is used.
    1349      *   2. Reads the cached “latest URL” option (created during recording)
    1350      *      which holds the exact URL and UA.  If that is newer than the
    1351      *      post timestamp, it wins.
    1352      *   3. Returns an associative array (`ts`, `type`, `post_id`, `url`, `ua`).
    1353      *
    1354      * @param string $key Bot key.
    1355      * @return array Latest visit info.
     1294     * Computes the most recent event for a given bot across the entire site.
     1295     *
     1296     * The algorithm inspects:
     1297     *   1. The post‑meta value of the last seen timestamp and associated UA for the last post that bot hit.
     1298     *   2. The `dzcr_latest_url_{$key}` option that records the most recently accessed URL.
     1299     *   3. Falls back to a default empty result if nothing is found.
     1300     *
     1301     * The function returns the candidate that has the greatest timestamp, together with
     1302     * the type (“post” or “url”) and details of the location.
     1303     *
     1304     * @param string $key Internal bot identifier.
     1305     * @return array{ts:float,type:string,post_id:int,url:string,ua:string} The best match.
    13561306     */
    13571307    private function compute_agent_latest( $key ) {
     
    13931343
    13941344    /**
    1395      * Adds “Settings” and “Documentation” links to the plugin’s row
    1396      * in the WordPress Plugins page.
    1397      *
    1398      * Why:  Site admins often look for quick access to the plugin’s
    1399      *       configuration.  Adding these links saves a few clicks.
    1400      *
    1401      * @param array $links Existing action links.
    1402      * @return array Updated array with new links.
     1345     * Adds “Settings” and “Documentation” links to the plugin’s row in the admin‑plugins page.
     1346     *
     1347     * The links open in the same window and in a new tab, respectively.
     1348     *
     1349     * @param array<string,string> $links Array of existing action links.
     1350     * @return array<string,string> The extended links array.
    14031351     */
    14041352    public function plugin_action_links( $links ) {
    1405         $settings_url = admin_url( 'admin.php?page=cls-crawler-record' );
     1353        $settings_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG );
    14061354        $docs_url     = 'https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/';
    14071355
  • crawler-record/trunk/readme.txt

    r3400643 r3423919  
    33Tags: googlebot, bingbot, gptbot, seo, robots
    44Requires at least: 6.0
    5 Tested up to: 6.8
     5Tested up to: 6.9
    66Requires PHP: 7.4
    77Stable tag: 0.8.0
     
    5858
    5959== Changelog ==
     60= 0.9.0 =
      61* Now monitoring for Meta and Apple user agents.
     62* More accurate site-wide UA reporting.
     63* Ensured video tutorial appears on all admin screens.
     64* Fixed small code errors.
     65
    6066
    6167= 0.8.0 =
  • crawler-record/trunk/uninstall.php

    r3366584 r3423919  
    6363
    6464// 2) Remove any known discrete options (if you introduced settings later).
    65 $discrete_options = array(
    66     'dzcr_settings',       // reserved for future settings array.
    67     'dzcr_agents_custom',  // reserved if you ever allow custom agent configs.
     65$dzcr_discrete_options = array(
     66    'dzcr_settings',
     67    'dzcr_agents_custom',
    6868);
    6969
    70 foreach ( $discrete_options as $opt ) {
    71     delete_option( $opt );
     70foreach ( $dzcr_discrete_options as $dzcr_opt ) {
     71    delete_option( $dzcr_opt );
    7272}
    7373
    7474// 3) Optional: purge post meta set by the plugin (disabled by default).
    7575// Enable by defining DZCR_PURGE_POSTMETA true in wp-config.php OR using the filter below.
    76 $purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )
     76$dzcr_purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )
    7777    || apply_filters( 'dzcr_uninstall_purge_postmeta', false );
    7878
    79 if ( $purge_postmeta ) {
     79if ( $dzcr_purge_postmeta ) {
    8080    // Delete meta keys written per post:
    8181    //  - _dzcr_last_seen_{agent}
Note: See TracChangeset for help on using the changeset viewer.