Changeset 3423919
- Timestamp:
- 12/19/2025 06:10:18 PM (3 weeks ago)
- Location:
- crawler-record
- Files:
-
- 1 added
- 3 edited
- 5 copied
-
tags/0.9.0 (added)
-
tags/0.9.0/LICENSE (copied) (copied from crawler-record/trunk/LICENSE)
-
tags/0.9.0/crawler-record.php (copied) (copied from crawler-record/trunk/crawler-record.php) (41 diffs)
-
tags/0.9.0/languages (copied) (copied from crawler-record/trunk/languages)
-
tags/0.9.0/readme.txt (copied) (copied from crawler-record/trunk/readme.txt) (2 diffs)
-
tags/0.9.0/uninstall.php (copied) (copied from crawler-record/trunk/uninstall.php) (1 diff)
-
trunk/crawler-record.php (modified) (41 diffs)
-
trunk/readme.txt (modified) (2 diffs)
-
trunk/uninstall.php (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
crawler-record/tags/0.9.0/crawler-record.php
r3400651 r3423919 4 4 * Plugin URI: https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/ 5 5 * Description: Are the most common search and chat (AI/LLM) bots able to access the pages on your website? Crawler Record can tell you the last time each of the most common search/chat bots visited -and which pages at which they looked. 6 * Version: 0. 8.06 * Version: 0.9.0 7 7 * Requires at least: 6.0 8 * Tested up to: 6. 88 * Tested up to: 6.9 9 9 * Requires PHP: 7.4 10 10 * Author: dizzysoft … … 20 20 } 21 21 22 define( 'CRAWLER_RECORD_VERSION', '0.8.0' ); 22 define( 'CRAWLER_RECORD_VERSION', '0.9.0' ); 23 define( 'DZCR_ADMIN_SLUG', 'dzcr-crawler-record' ); 23 24 24 25 class Crawler_Record { 25 26 26 /** 27 * Holds a list of all bot groups and their UA patterns. 28 * 29 * Why: The agent list is small, stable, and rarely changes; storing 30 * it in code removes the need for a custom database table 31 * and keeps migration simple. The `apply_filters()` wrapper 32 * lets developers add or replace bots without touching this file. 
33 * 34 * @var array 35 */ 27 // https://www.searchenginejournal.com/ai-crawler-user-agents-list/558130/ 36 28 private $dzcr_default_agent_groups = [ 37 29 'Google' => [ … … 144 136 ], 145 137 138 'Meta' => [ 139 'doc' => 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/', 140 'items' => [ 141 'meta_externalagent' => [ 142 'label' => 'Meta-ExternalAgent', 143 'pattern' => '#meta-externalagent/\d+(?:\.\d+)?#i', 144 ], 145 'meta_webindexer' => [ 146 'label' => 'Meta-WebIndexer', 147 'pattern' => '#meta-webindexer/\d+(?:\.\d+)?#i', 148 ], 149 ], 150 ], 151 152 'Apple' => [ 153 'doc' => 'https://support.apple.com/en-us/119829', 154 'items' => [ 155 'applebot' => [ 156 'label' => 'Applebot', 157 'pattern' => '#Applebot/\d+(?:\.\d+)?#i', 158 ], 159 160 // Robots-only (does not crawl pages) 161 'applebot_extended' => [ 162 'label' => 'Applebot-Extended (AI)', 163 'pattern' => '#(?!.)#', 164 ], 165 ], 166 ], 167 146 168 'DuckDuckGo' => [ 147 169 'doc' => 'https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot', … … 169 191 ]; 170 192 171 172 /**173 * Cache of the resolved, filtered agent list.174 *175 * Why: Resolving the agent list once per request saves expensive work176 * (filtering & validation). 
Stored in memory for the duration of177 * the request; no persistence needed.178 *179 * @var array|null180 */181 193 private $dzcr_agents_effective_cache = null; 182 194 183 /**184 * Cache of the flat list of UA patterns keyed by the agent key.185 *186 * Why: Many look-ups need a simple key > regex map; building it once avoids187 * iterating through the nested agent group structure each time.188 *189 * @var array|null190 */191 195 private $dzcr_agents_flat_cache = null; 192 196 193 /**194 * Prefix used for per-post Meta keys that store the last timestamp a bot195 * visited the post.196 *197 * Why: Namespacing the meta key prevents collisions with other plugins.198 * The prefix is combined with the unique agent key.199 *200 * @var string201 */202 197 private $dzcr_meta_prefix = '_dzcr_last_seen_'; 203 198 204 /**205 * Prefix used for site-wide last-seen Meta keys (stored as options).206 *207 * Why: Allows quick access to the most recent time *any* page was seen by208 * each bot without scanning the entire database.209 *210 * @var string211 */212 199 private $dzcr_site_prefix = '_dzcr_site_last_'; 213 200 214 /**215 * Prefix used for per-URL last-seen keys (stored as options).216 *217 * Why: URLs are not tied to a post, so options keep arbitrary218 * key/value pairs without cluttering postmeta.219 *220 * @var string221 */222 201 private $dzcr_url_prefix = '_dzcr_url_last_'; 223 202 224 /**225 * Prefix used for the last-post-ID per bot (stored as options).226 *227 * Why: This is a fast index: if the last post ID is known we can fetch228 * its meta immediately, avoiding a full table scan.229 *230 * @var string231 */232 203 private $dzcr_lastpost_prefix = '_dzcr_last_post_'; 233 204 234 235 /**236 * Class constructor.237 *238 * Why: Hooks the core functionality into WordPress:239 * - `template_redirect` records when a crawler requests a page.240 * - `add_meta_boxes` adds the per-post "Crawler Record" box.241 * - `admin_bar_menu` inserts a quick toolbar entry for 
logged-in users.242 * - `admin_menu` registers the settings page in the admin sidebar.243 * - `plugin_action_links_*` adds Settings/Documentation links on the244 * Plugins screen.245 *246 * This is the entry point of the plugin – everything else is called247 * indirectly via these actions.248 */249 205 public function __construct() { 250 206 add_action( 'template_redirect', [ $this, 'maybe_record_last_seen' ] ); … … 258 214 public function enqueue_admin_assets() { 259 215 $screen = function_exists('get_current_screen') ? get_current_screen() : null; 260 if ( ! $screen || $screen->id !== 'toplevel_page_ crawler-record') {216 if ( ! $screen || $screen->id !== 'toplevel_page_' . DZCR_ADMIN_SLUG ) { 261 217 return; // only on the plugin admin page 262 218 } … … 265 221 'https://player.vimeo.com/api/player.js', 266 222 [], 267 $ver,223 CRAWLER_RECORD_VERSION, 268 224 true 269 225 ); … … 271 227 } 272 228 273 274 /** 275 * Retrieves the effective agent groups after filters, validation, 276 * and custom ordering have been applied. 277 * 278 * Why: The default list is processed once per request. This method 279 * returns a validated array that can be cached for all other 280 * look‑ups (`get_agent_groups()`). The validation step removes 281 * malformed entries and guarantees the presence of a `doc` 282 * URL and a `label` for each item. 283 * 284 * @return array The resolved, validated agent groups. 229 /** 230 * Lazily resolves the complete agent‑group tree, applying the `dzcr_agent_groups` 231 * and `dzcr_agents_order` filters, and caches the result for the lifetime of the request. 232 * 233 * The returned array maps group names to an array containing a `doc` URL and a list of 234 * individual bot definitions (`label` and `pattern`). The data is filtered so that 235 * malformed entries (missing patterns or labels) are removed automatically. 236 * 237 * @return array<string,array<string,mixed>> The fully‑validated and ordered agent groups. 
285 238 */ 286 239 private function get_agent_groups() { … … 332 285 333 286 /** 334 * Returns a flattened map of bot keys to regex pattern strings. 335 * 336 * Why: The recording logic (`maybe_record_last_seen`) needs a 337 * simple `foreach ($agent_key => $pattern)` loop. Flattening 338 * removes the overhead of walking the nested group structure 339 * for every request. 340 * 341 * @return array Key → regex string. 287 * Returns a flattened map of every bot key to its compiled regular‑expression pattern. 288 * 289 * The function iterates over the effective agent groups obtained via {@see get_agent_groups()} 290 * and builds a key‑=>pattern list. The result is cached for the current request. 291 * 292 * @return array<string,string> Bot key → regex pattern. 342 293 */ 343 294 private function get_agents_flat() { … … 356 307 357 308 /** 358 * Looks up the human‑readable label for a bot key. 359 * 360 * Why: The UI shows the label, not the internal key. The helper 361 * centralises the lookup and provides a default in case the 362 * key is unknown. 363 * 364 * @param string $key Internal bot identifier. 365 * @return string Human‑readable label. 309 * Looks up a human‑readable label for a bot key in the current agent‑group configuration. 310 * 311 * If the key cannot be found, its raw value is returned as a fallback. This helper is 312 * used to keep UI code terse while still presenting friendly names to site editors. 313 * 314 * @param string $key The internal identifier for a bot (e.g. 'googlebot_desktop'). 315 * @return string The user‑friendly label. 366 316 */ 367 317 private function get_label_for_key( $key ) { … … 375 325 376 326 /** 377 * Builds the canonical URL of the current front‑end request. 378 * 379 * Why: Recorded timestamps are stored per‑URL, so we need an exact, 380 * normalised URL. 
The function normalises the scheme 381 * (`http/https`), host, path, and applies the 382 * `dzcr_normalize_url` filter (useful for stripping tracking 383 * parameters or converting to a canonical form). 384 * 385 * @return string Full URL (or empty string in admin/invalid state). 386 */ 387 private function current_url() { 388 if ( is_admin() ) { 327 * Generates the canonical, normalized permalink for the queried singular post. 328 * 329 * The function guarantees the returned URL does **not** contain a query string, 330 * a fragment identifier, or an empty path. The result is suitable for storage 331 * in post meta and for robots‑txt matching. 332 * 333 * @param int $post_id By reference, receives the resolved post ID or 0 if no singular. 334 * @return string The sanitized absolute URL, or an empty string when no query can be resolved. 335 */ 336 private function current_url_for_post( &$post_id = 0 ) { 337 if ( is_admin() || ! is_singular() ) { 389 338 return ''; 390 339 } 391 if ( ! isset( $_SERVER['HTTP_HOST'], $_SERVER['REQUEST_URI'] ) ) { 340 341 $post_id = (int) get_queried_object_id(); 342 if ( ! $post_id ) { 392 343 return ''; 393 344 } 394 $scheme = ( ! empty( $_SERVER['HTTPS'] ) && 'off' !== $_SERVER['HTTPS'] ) ? 'https' : 'http'; 395 $host = sanitize_text_field( wp_unslash( $_SERVER['HTTP_HOST'] ) ); 396 $uri = esc_url_raw( wp_unslash( $_SERVER['REQUEST_URI'] ) ); 397 398 if ( '' === $host ) { 345 346 $url = get_permalink( $post_id ); 347 if ( ! $url ) { 399 348 return ''; 400 349 } 401 $url = $scheme . '://' . $host . $uri; 402 $url = apply_filters( 'dzcr_normalize_url', $url ); 403 if ( strlen( $url ) > 2048 ) { 404 $url = substr( $url, 0, 2048 ); 405 } 406 return $url; 407 } 408 409 /** 410 * Records the last “seen” timestamp when a crawler requests a page. 411 * 412 * Why: This method is the heart of the plugin. It is called on 413 * `template_redirect` and performs the following actions: 414 * 415 * 1. 
Skips previews, feeds, REST, admin, AJAX, and cron requests. 416 * 2. Only processes GET/HEAD requests, which are the standard 417 * HTTP verbs used by crawlers. 418 * 3. Trims and sanitises the UA string (max 512 chars) and 419 * normalises it to avoid runaway regexes. 420 * 4. Throttles writes using a 10‑minute window (configurable via 421 * `dzcr_throttle_window`) to avoid spamming the DB when the same 422 * bot visits the page repeatedly. 423 * 5. Stores three pieces of information: 424 * - Per‑post meta (`_dzcr_last_seen_[bot]`). 425 * - Per‑URL options (`_dzcr_url_last_[bot]_[hash]`) plus companion 426 * `_url_` and `_ua_` options. 427 * - Site‑wide option (`_dzcr_site_last_[bot]`). 428 * 429 * The method also updates the “last UA string” meta/option so the 430 * UI can show exactly what the bot sent. 431 * 350 351 // Normalize: strip query string and fragment 352 $parts = wp_parse_url( $url ); 353 if ( empty( $parts['host'] ) || empty( $parts['path'] ) ) { 354 return ''; 355 } 356 357 $scheme = $parts['scheme'] ?? 'https'; 358 return $scheme . '://' . $parts['host'] . $parts['path']; 359 } 360 361 /** 362 * Core routine that records bot visit timestamps when the current request matches a known crawler. 363 * 364 * The method is executed early in the front‑end rendering cycle (`template_redirect`). 365 * 1. It exits immediately for admin screens, REST requests, feeds, `HEAD` requests, or 366 * when the `User‑Agent` header is missing or too long. 367 * 2. The UA string is matched against the compiled regex list. Only the first matching 368 * rule is considered. 369 * 3. A per‑post meta key and a global option are updated if the throttle window 370 * (10 minutes by default) allows it. The method also records the exact UA string 371 * that triggered the update so editors can inspect it later. 372 * 373 * @param int $post_id Optional post ID when the query resolves to a single post. Passed by value. 
432 374 * @return void 375 * @throws RuntimeException If the PHP `set_transient` failures occur (unlikely, but reported). 433 376 */ 434 377 public function maybe_record_last_seen() { … … 445 388 446 389 $now = microtime( true ); 447 $post_id = is_singular() ? (int) get_queried_object_id() : 0; 448 $ url = $this->current_url();449 $url hash = $url ? md5( $url ) : '';390 391 $post_id = 0; 392 $url = $this->current_url_for_post( $post_id ); 450 393 451 394 $throttle = (int) apply_filters( 'dzcr_throttle_window', 10 * MINUTE_IN_SECONDS ); … … 470 413 } 471 414 472 // Per URL 473 if ( $urlhash ) { 474 $opt_key = $this->dzcr_url_prefix . $key . '_' . $urlhash; 475 $opt_url_key = $this->dzcr_url_prefix . $key . '_url_' . $urlhash; 476 $opt_ua_key = $this->dzcr_url_prefix . $key . '_ua_' . $urlhash; // store last UA for this URL+agent 477 $t_url = 'dzcr_seen_url_' . $key . '_' . $urlhash; 478 479 if ( ! $throttle || ! get_transient( $t_url ) ) { 480 $prev = (float) get_option( $opt_key, 0 ); 481 if ( $now > $prev ) { 482 update_option( $opt_key, (string) $now, false ); 483 update_option( $opt_url_key, $url, false ); 484 update_option( $opt_ua_key, $ua, false ); 485 } 486 if ( $throttle ) { 487 set_transient( $t_url, 1, $throttle ); 488 } 415 // Site-wide latest real post/page 416 if ( $post_id && $url ) { 417 $latest_key = 'dzcr_latest_url_' . $key; 418 $prev = get_option( $latest_key, [] ); 419 $prev_ts = isset( $prev['ts'] ) ? (float) $prev['ts'] : 0.0; 420 421 if ( $now > $prev_ts ) { 422 update_option( 423 $latest_key, 424 [ 425 'ts' => (string) $now, 426 'url' => $url, 427 'ua' => $ua, 428 ], 429 false 430 ); 489 431 } 490 432 } … … 501 443 502 444 /** 503 * Safely matches a UA string against a stored pattern or literal value. 504 * 505 * Why: Direct use of `preg_match()` on untrusted user‑agent strings 506 * can trigger PHP warnings/errors (e.g. malformed regex). 507 * This wrapper: 508 * - Ensures the pattern begins with a `#` and ends with `#i`. 
509 * - Suppresses `preg_match()` errors with the `@` operator. 510 * - Validates that the regex is syntactically correct. 511 * - Fallbacks to literal `hash_equals()` comparison when a plain string 512 * is supplied. 513 * 514 * @param string $ua The UA string from the request. 515 * @param string $pattern_or_exact Either a regex string or a literal. 516 * @return bool True if the UA matches the pattern or string. 445 * Safely tests whether a given User‑Agent string matches a pattern. 446 * 447 * Handles both plain strings (exact comparison) and regular‑expressions by inspecting 448 * the first character. The function avoids the costly `preg_match` errors through 449 * the `@` error‑control operator and checks the regex error code. 450 * 451 * @param string $ua The User‑Agent string to test. 452 * @param string $pattern_or_exact Either a regex (enclosed in `#...#i`) or a literal string. 453 * @return bool True if the UA satisfies the pattern; otherwise false. 517 454 */ 518 455 private function ua_matches_safe( $ua, $pattern_or_exact ) { … … 542 479 543 480 /** 544 * Registers the “Crawler Record” meta‑box that appears on the post 545 * editing screen. 546 * 547 * Why: Post editors need instant insight into which crawlers have 548 * visited the post and their access status. The meta‑box 549 * appears in the normal “normal” context with high priority, 550 * making it highly visible without adding extra menu items. 481 * Registers the “Crawler Record” meta‑box on the post editing screen. 482 * 483 * The meta‑box is added to the *Advanced* context so that it does not clutter the editor 484 * for non‑technical users. It is only visible to capable editors (users with 485 * `edit_post` capability). 486 * 551 487 */ 552 488 public function register_meta_box() { … … 562 498 563 499 /** 564 * Renders the per‑post “Crawler Record” meta‑box. 565 * 566 * Why: The box shows a table of all bots, each with: 567 * - A human‑readable label. 
568 * - The last time the bot accessed the post (`Last Seen`). 569 * - A status badge that tells you whether the URL is allowed by 570 * `robots.txt`. 571 * - A collapsible section that reveals the exact UA string 572 * the bot used when it visited. 573 * 574 * The function also prints a warning if the site is globally 575 * blocking all crawlers (WordPress “Discourage search‑engine” setting). 576 * 577 * @param WP_Post $post Current post object. 500 * Renders the contents of the “Crawler Record” meta‑box. 501 * 502 * Generates a table that lists every tracked agent, its last‑seen timestamp, and a 503 * robots‑txt allow/blocked status. The UI includes a collapsible `details` element 504 * that shows the exact User‑Agent string used for the most recent hit. 505 * 506 * @param WP_Post $post The post object currently being edited. 578 507 * @return void 579 508 */ … … 618 547 'diag' => $diag, 619 548 'ua' => $last_ua, 620 'na' => ( 'google_extended' === $key ), // treat Google-Extended as non-crawler549 'na' => $this->is_robots_only_agent( $key ), 621 550 ]; 622 551 } … … 662 591 663 592 /** 664 * Adds a “Crawler Record” entry to the front‑end admin bar. 665 * 666 * Why: Site admins often browse the front‑end while logged in. 667 * The toolbar provides a one‑click shortcut to the 668 * detailed bot‑status page and shows a quick list of 669 * timestamps for each bot, whether looking at a single page, 670 * a specific URL, or the entire site. 671 * 672 * @param WP_Admin_Bar $wp_admin_bar Admin‑bar object. 593 * Adds a “Crawler Record” node to the front‑end admin bar with context‑aware children. 594 * 595 * The node is only shown to logged‑in users viewing the front‑end (not the admin). Depending 596 * on whether the current page is singular or generic, the children list will display 597 * per‑post or site‑wide last‑seen data for each crawler. 598 * 599 * @param WP_Admin_Bar $wp_admin_bar The admin bar instance. 
673 600 * @return void 674 601 */ … … 679 606 } 680 607 681 $admin_page_url = admin_url( 'admin.php?page= cls-crawler-record');608 $admin_page_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG ); 682 609 683 610 // Parent link … … 700 627 701 628 foreach ( $wrap['items'] as $key => $_def ) { 702 if ( 'google_extended' === $key) {703 continue; // do not show Google-Extended in admin bar629 if ( $this->is_robots_only_agent( $key ) ) { 630 continue; 704 631 } 632 705 633 $label = $this->get_label_for_key( $key ); 706 634 … … 736 664 737 665 /** 738 * Registers the plugin’s top‑level admin page. 739 * 740 * Why: The settings page is a convenient place to: 741 * - See all bots and their global timestamps. 742 * - Inspect the last page each bot has seen. 743 * - View robots.txt diagnostics for each bot. 666 * Registers the top‑level “Crawler Record” admin submenu page. 667 * 668 * The submenu is created under the **Settings** root so that it is consistently 669 * reachable regardless of the user role or the presence of other plugins. 744 670 * 745 671 * @return void … … 750 676 __( 'Crawler Record', 'crawler-record' ), 751 677 'manage_options', 752 'cls-crawler-record',678 DZCR_ADMIN_SLUG, 753 679 [ $this, 'render_admin_page' ], 754 680 'dashicons-search', … … 758 684 759 685 /** 760 * Renders the entire admin page. 761 * 762 * Why: The page shows a per‑bot table containing: 763 * - The bot’s human label. 764 * - The most recent global timestamp. 765 * - The link to the most recent page the bot visited. 766 * - A status badge indicating robots.txt permissibility. 767 * 768 * The table also has a useful “Important” notice if 769 * WordPress is blocking all crawlers. 686 * Renders the public dashboard for all tracked agents. 687 * 688 * The page lists each bot’s last‑seen timestamp, the last URL it visited, and the 689 * result of a robots‑txt evaluation for the site’s home path. A small embedded video 690 * tutorial is shown in the right‑hand column. 
770 691 * 771 692 * @return void … … 847 768 : $this->robots_status_for_agent_path( $key, $home_path, $diag ); 848 769 849 $show_last_fields_as_na = ( 'google_extended' ===$key );770 $show_last_fields_as_na = $this->is_robots_only_agent( $key ); 850 771 851 772 // Agent cell (expandable UA string unless NA) … … 879 800 880 801 /** 881 * Checks if the site is configured to “Discourage search engines”. 882 * 883 * Why: If WordPress’s global setting is enabled, all crawlers are 884 * blocked regardless of `robots.txt`. The function lets us 885 * quickly short‑circuit further checks. 886 * 887 * @return bool True if the site is discouraging crawlers. 802 * Determines if WordPress is currently discouraging search engines (`blog_public` = 0). 803 * 804 * The helper is used to short‑circuit logic that would otherwise record activity, 805 * and to display a warning banner on admin screens. 806 * 807 * @return bool True when site‑wide crawling is disabled. 888 808 */ 889 809 private function is_site_discouraged() { … … 892 812 893 813 /** 894 * Returns a status array for the “blocked by WordPress” case. 895 * 896 * Why: Used when `is_site_discouraged()` is true; the admin UI 897 * needs a consistent structure (`state` & `reason`) for 898 * rendering the status badge. 899 * 900 * @return array `state` (blocked) and a human‑readable reason. 814 * Builds a standard “blocked by WordPress setting” status array. 815 * 816 * Returned from {@see robots_status_for_agent_path()} when the site is 817 * configured to discourage search engines. 818 * 819 * @return array{state:string,reason:string} Blocking information. 901 820 */ 902 821 private function forced_block_status() { … … 908 827 909 828 /** 910 * Maps an internal bot key to the list of UA strings that the bot 911 * uses (used to choose the “most representative” UA for robots.txt 912 * checks). 913 * 914 * Why: Different bots have distinct patterns for desktop/mobile, 915 * legacy, and AI‑bot forms. 
This helper returns the set of 916 * tokens that must be matched against the `robots.txt` filter 917 * logic. 918 * 919 * @param string $key Internal bot key. 920 * @return array List of tokens. 829 * Maps an internal bot key to the canonical User‑Agent string(s) that can appear 830 * in the site’s robots.txt. 831 * 832 * The mapping is used for robots‑txt group selection. When a key maps to 833 * multiple tokens it is returned as an array of strings; otherwise a single‑element array. 834 * 835 * @param string $key The internal identifier for a bot. 836 * @return array<string> The list of UA strings to check in robots.txt. 921 837 */ 922 838 private function robots_tokens_for_key(string $key): array { … … 953 869 case 'perplexity_user': return ['Perplexity-User']; // normalize Unicode hyphens 954 870 871 // Meta 872 case 'meta_externalagent': 873 return ['meta-externalagent']; 874 case 'meta_webindexer': 875 return ['Meta-WebIndexer']; 876 877 // Apple 878 case 'applebot': 879 return ['Applebot']; 880 case 'applebot_extended': 881 return ['Applebot-Extended']; 882 883 955 884 // DuckDuckGo 956 885 case 'duckduckgo_search': return ['DuckDuckBot']; … … 964 893 965 894 /** 966 * Chooses the appropriate `User‑Agent` group in robots.txt for 967 * a given bot. 968 * 969 * Why: A robot may have multiple `User‑Agent` lines (desktop/mobile). 970 * The longest exact match wins; if none match we fallback to 971 * the wildcard group (`*`). This function returns the entire 972 * group structure (agents + rules) for that bot. 973 * 974 * @param string $robots_txt Raw text of robots.txt. 975 * @param array $tokens List of tokens that identify the bot. 976 * @return array Group with `agents` and `rules` keys. 895 * Returns true for bots that only check robots.txt and never fetch pages. 896 * 897 * For these bots the per‑post “last‑seen” columns are marked “N/A” because it is 898 * impossible to record an actual page visit. 
899 * 900 * @param string $key The internal bot identifier. 901 * @return bool True when the bot is robots‑only. 902 */ 903 private function is_robots_only_agent( string $key ): bool { 904 return in_array( 905 $key, 906 [ 907 'google_extended', 908 'applebot_extended', 909 ], 910 true 911 ); 912 } 913 914 /** 915 * Chooses the most specific robots.txt group that matches the requested bot tokens. 916 * 917 * The grouping algorithm follows the order of rules in the file: a group that 918 * lists a specific User‑Agent token and the longest matching rule wins. 919 * If no group matches, the first wildcard `*` group is returned (if present). 920 * 921 * @param string $robots_txt Raw robots.txt contents. 922 * @param array<string> $tokens List of canonical UA strings for the bot. 923 * @return array{agents:list<string>,rules:list<mixed>} The selected group’s agents and rules. 977 924 */ 978 925 private function select_robots_group(string $robots_txt, array $tokens) { … … 1003 950 1004 951 /** 1005 * Checks whether a path starts with the rule prefix. 1006 * 1007 * Why: `robots.txt` rules are prefix based; e.g. `Disallow: /wp-admin` 1008 * blocks everything under `/wp-admin`. A simple `strncmp()` 1009 * suffices because we already normalised the rule to start with 1010 * a slash and have removed any trailing `$` terminator. 1011 * 1012 * @param string $path Path from the request. 1013 * @param string $rule Rule from robots.txt. 1014 * @return bool True if the path matches the rule. 952 * Tests whether the supplied robots rule path is a strict prefix of $path. 953 * 954 * The helper is used by {@see robots_txt_allows()} when applying the longest‑prefix 955 * rule logic. 956 * 957 * @param string $path The request path. 958 * @param string $rule The rule path defined in robots.txt. 959 * @return bool True when $rule is a prefix of $path. 
1015 960 */ 1016 961 private function path_prefix_match(string $path, string $rule): bool { … … 1019 964 1020 965 /** 1021 * Normalises hyphen characters in a string; replaces various Unicode 1022 * hyphens with the ASCII hyphen. 1023 * 1024 * Why: Some bots (e.g., Perplexity) use non‑standard hyphens in 1025 * their UA strings. Normalising them guarantees that 1026 * string‑based matching (e.g. `strcasecmp()`) works 1027 * regardless of the particular hyphen glyph. 966 * Normalises a URL path so that an empty string becomes “/”. 967 * 968 * An empty path can cause subtle bugs in robots‑txt evaluation and is treated 969 * as the root path by WordPress. 970 * 971 * @param string $p The original path. 972 * @return string Normalised path. 973 */ 974 private function normalize_path(string $p): string { 975 return ($p === '') ? '/' : $p; 976 } 977 978 /** 979 * Replaces every form of Unicode hyphen (–, ‑, -, …) with a simple ASCII hyphen. 980 * 981 * The rule set is chosen to avoid false mismatches when a site’s robots.txt 982 * contains variant hyphens in its UA tokens. 1028 983 * 1029 984 * @param string $s Input string. 1030 985 * @return string Normalised string. 1031 986 */ 1032 private function normalize_path(string $p): string {1033 return ($p === '') ? '/' : $p;1034 }1035 1036 /**1037 * Normalises hyphen characters in a string; replaces various Unicode1038 * hyphens with the ASCII hyphen.1039 *1040 * Why: Some bots (e.g., Perplexity) use non‑standard hyphens in1041 * their UA strings. Normalising them guarantees that1042 * string‑based matching (e.g. `strcasecmp()`) works1043 * regardless of the particular hyphen glyph.1044 *1045 * @param string $s Input string.1046 * @return string Normalised string.1047 */1048 987 private function normalize_hyphens(string $s): string { 1049 988 return preg_replace('/[\x{2010}-\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}]/u','-',$s); … … 1051 990 1052 991 /** 1053 * Parses a raw `robots.txt` string into an array of groups. 
1054 * 1055 * Why: The plugin re‑implements the minimal parsing logic needed 1056 * for our use‑case (User‑Agent, Allow, Disallow). The function 1057 * ignores other directives (Sitemap, Crawl‑delay) and keeps 1058 * groups and rules in the order they appear, which is essential 1059 * for the longest‑path‑wins logic. 1060 * 1061 * @param string $txt Raw content of robots.txt. 1062 * @return array Array of groups, each with `agents` and `rules`. 992 * Parses the raw robots.txt into an array of agent‑group objects. 993 * 994 * The parser emits structures of the form: 995 * [{'agents'=>[...], 'rules'=>[['allow'|'disallow', '/path'], ...]}, ...] 996 * * Non‑essential lines (`Sitemap:`, `Crawl‑delay:`, comments) are ignored. 997 * * Blank lines reset state, but a group is only flushed when a director is encountered. 998 * 999 * @param string $txt Raw robots.txt string. 1000 * @return array<List<string>,list<mixed>> List of groups, each with `agents` and `rules`. 1063 1001 */ 1064 1002 private function parse_robots_groups(string $txt): array { … … 1105 1043 1106 1044 /** 1107 * Retrieves the raw `robots.txt` content for the current site.1108 * 1109 * Why: We avoid an HTTP request to the public `robots.txt`.1110 * WordPress can generate it via `do_robots()` or the file can1111 * be read directly. The function falls back to the filter1112 * `robots_txt` if no file or `do_robots()` output is1113 * available.1114 * 1115 * @return string|null Raw robots.txt body or null if unavailable.1045 * Retrieves the active robots.txt for the current site. 1046 * 1047 * The method respects the order imposed by the WordPress core: 1048 * 1. Physical `robots.txt` file 1049 * 2. `wp_robots()` (WP 5.7+) 1050 * 3. `do_robots` action 1051 * 4. `robots_txt` filter 1052 * 1053 * @return string|null The robots.txt contents, or `null` if none could be generated. 1116 1054 */ 1117 1055 private function get_local_robots_txt() { 1056 // 1. 
Physical robots.txt file takes precedence 1118 1057 $file = ABSPATH . 'robots.txt'; 1119 1058 if ( @is_readable( $file ) ) { 1120 1059 $body = @file_get_contents( $file ); 1121 if ( is_string( $body ) && '' !== $body ) return $body; 1122 } 1060 if ( is_string( $body ) && '' !== trim( $body ) ) { 1061 return $body; 1062 } 1063 } 1064 1065 // 2. Prefer wp_robots() when available (WP 5.7+) 1066 if ( function_exists( 'wp_robots' ) ) { 1067 ob_start(); 1068 wp_robots(); 1069 $out = ob_get_clean(); 1070 if ( is_string( $out ) && '' !== trim( $out ) ) { 1071 return $out; 1072 } 1073 } 1074 1075 // 3. Fallback to core do_robots action 1076 // do_robots is a WordPress core action used to generate robots.txt output. 1123 1077 ob_start(); 1078 // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound 1124 1079 do_action( 'do_robots' ); 1125 1080 $out = ob_get_clean(); 1126 if ( is_string( $out ) && '' !== trim( $out ) ) return $out; 1127 1081 if ( is_string( $out ) && '' !== trim( $out ) ) { 1082 return $out; 1083 } 1084 1085 // 4. Final fallback via robots_txt filter 1086 // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound 1128 1087 $output = apply_filters( 'robots_txt', '', get_option( 'blog_public' ) ); 1129 if ( is_string( $output ) && '' !== trim( $output ) ) return $output; 1088 if ( is_string( $output ) && '' !== trim( $output ) ) { 1089 return $output; 1090 } 1130 1091 1131 1092 return null; … … 1133 1094 1134 1095 /** 1135 * Returns true if the robots.txt rules permit the bot to access the path. 1136 * 1137 * Why: The function implements a simplified rule engine that: 1138 * 1. Chooses the best User‑Agent group for the bot (exact match 1139 * or wildcard). 1140 * 2. Normalises the path and checks it against each rule in the 1141 * selected group. 1142 * 3. Uses longest‑path‑wins; disallow wins only if it 1143 * matches a longer path than any allow. If no rule matches, 1144 * the URL is allowed by default. 
1145 * 1146 * @param string $robots_txt Raw robots.txt text. 1147 * @param string $agent_key Internal bot key. 1148 * @param string $path Requested path. 1149 * @return bool True if the path is allowed for the bot. 1096 * Determines whether the given bot is allowed to fetch $path based on the active robots.txt. 1097 * 1098 * Implements the standard longest‑prefix‑wins rule for a single bot’s group. The method 1099 * returns `true` for ``allow`` or for paths that match no rule (default allow). 1100 * 1101 * @param string $robots_txt The retrieved robots.txt contents. 1102 * @param string $agent_key The internal identifier for the bot. 1103 * @param string $path The requested URL path (starting with “/”). 1104 * @return bool True when crawling is permitted, false when disallowed. 1150 1105 */ 1151 1106 private function robots_txt_allows(string $robots_txt, string $agent_key, string $path): bool { … … 1175 1130 1176 1131 /** 1177 * Public wrapper that provides a “state” + “reason” array describing1178 * whether the bot is allowed to crawl the path, and populates1179 * diagnostics output for the UI.1180 * 1181 * Why: The admin pages display a status badge; this method produces1182 * the data the badge expects and also records diagnostic1183 * information (matched group & rule) that can be shown in1184 * an expandable details box.1185 * 1186 * @param string $ agent_key Bot key.1187 * @param string $path Requested path.1188 * @param array &$diag Output diagnostics (group, rule).1189 * @return array (`state` => 'allowed'|'blocked'|'unknown', `reason` => string).1132 * Public API that reports the allow/blocked status for a bot and a specific path. 1133 * 1134 * The method first checks whether the site globally discourages crawlers (`blog_public=0`). 1135 * If not, it loads the robots.txt, selects the appropriate group, and evaluates 1136 * the path. A detailed diagnostic array (group name, rule used) is returned 1137 * alongside the status. 
When a global block applies, the status is `blocked` 1138 * with a specific reason. 1139 * 1140 * @param string $agent_key The bot’s internal key (e.g. 'googlebot_desktop'). 1141 * @param string $path Path fragment (starting with “/”) to test. 1142 * @param array|null $diag Optional reference parameter that receives diagnostics about 1143 * the matching group and rule; can be omitted if not needed. 1144 * @return array{state:string,reason:string}<p>Possible `state` values: `allowed`, `blocked`, `unknown`.</p> 1190 1145 */ 1191 1146 public function robots_status_for_agent_path( $agent_key, $path, &$diag = null ) { … … 1210 1165 1211 1166 /** 1212 * Renders a status badge and an expandable details section. 1213 * 1214 * Why: The status badge (green check, red X, gray) gives a quick visual 1215 * cue. The details expand on hover/click and show the exact 1216 * robots.txt group and rule that determined the decision. 1217 * 1218 * @param array $status Associative array returned by 1219 * `robots_status_for_agent_path`. 1220 * @param array $diag Diagnostics array (group & rule). 1221 * @return string HTML safe snippet for the badge + details. 1167 * Renders a colour‑coded badge indicating the robots.txt status and a collapsible 1168 * details section that shows which rule was matched. 1169 * 1170 * The badge colors are: 1171 * * Green with a checkmark for “allowed” 1172 * * Red with a cross for “blocked” 1173 * * Grey for “unknown” 1174 * 1175 * @param array{state:string,reason:string} $status Returned from {@see robots_status_for_agent_path()}. 1176 * @param array{group:string,rule:string} $diag Diagnostics array for detail output. 1177 * @return string Safe HTML for the badge + details element. 1222 1178 */ 1223 1179 private function render_status_badge_expandable( $status, $diag ) { … … 1244 1200 1245 1201 /** 1246 * Formats a timestamp in “human‑readable” form and a precise 1247 * full‑timestamp (with micro‑seconds) for display. 
1248 * 1249 * Why: Users want to see “3 days ago (2025‑08‑12 15:04:23.123456)”. 1250 * This helper keeps the code in the main rendering loop 1251 * terse and centralises the formatting logic. 1252 * 1253 * @param float $ts Timestamp from the database. 1254 * @return string HTML safe representation. 1202 * Formats a UNIX timestamp (with microseconds) into a human‑readable “time‑ago” string 1203 * plus the exact date/time in the site’s configured timezone. 1204 * 1205 * The output is safe for HTML rendering and is not localized beyond what 1206 * WordPress’ `human_time_diff()` and `wp_date()` provide. 1207 * 1208 * @param float $ts Timestamp value returned by `microtime(true)`. 1209 * @return string The formatted cell content; if $ts is falsy, “Not Yet” is returned. 1255 1210 */ 1256 1211 private function format_last_seen_cell( $ts ) { … … 1270 1225 1271 1226 /** 1272 * Formats the string shown in the admin bar for each bot. 1273 * 1274 * Why: The toolbar entry should be compact (label + timestamp) 1275 * but still show the exact time. This helper keeps the 1276 * formatting consistent between the toolbar and the 1277 * meta‑box. 1278 * 1279 * @param string $label Label for the bot. 1280 * @param float $ts Timestamp (may be 0). 1281 * @param string $suffix Optional small label (e.g., “today”). 1282 * @return string Safe html string. 1227 * Builds the display string used in admin‑bar nodes. 1228 * 1229 * It shows the label, the relative “time‑ago” string, and the absolute timestamp 1230 * (with microseconds). If no timestamp is available, “Not Yet” is used. 1231 * 1232 * @param string $label Human‑readable name of the agent. 1233 * @param float $ts Timestamp value or 0 for “Not Yet”. 1234 * @param string $suffix Optional suffix string to append after the label. 1235 * @return string The formatted admin‑bar line. 
1283 1236 */ 1284 1237 private function format_admin_bar_line( $label, $ts, $suffix = '' ) { … … 1297 1250 1298 1251 /** 1299 * Formats the “last page” cell that appears in the admin page. 1300 * 1301 * Why: In the list view we only want the URL, not the post title. 1302 * The helper converts the stored data from `compute_agent_latest()` 1303 * into a link or an “–” if the data is missing. 1304 * 1305 * @param array $latest Associative array from `compute_agent_latest`. 1306 * @return string HTML markup. 1252 * Formats the “last page” cell of the admin dashboard table. 1253 * 1254 * Accepts either a post ID or a raw URL string. The output is a clickable link 1255 * that opens the page in a new tab. Empty or missing values become a dash. 1256 * 1257 * @param array{ts:float,type:string,post_id:int,url:string,ua:string} $latest Data from {@see compute_agent_latest()}. 1258 * @return string The safe HTML for the link or a dash. 1307 1259 */ 1308 1260 private function format_context_cell_url_only( $latest ) { … … 1322 1274 1323 1275 /** 1324 * Converts a UTC timestamp to the administrator’s timezone string. 1325 * 1326 * Why: All timestamps are stored in UTC; showing them in the 1327 * local timezone (configured in Settings → General) 1328 * is far more user‑friendly. The helper uses `wp_date()` if 1329 * available, else falls back to `date_i18n()`. 1330 * 1331 * @param int $sec Unix timestamp. 1332 * @param string $format Optional format string (uses WordPress defaults if omitted). 1333 * @return string Localised timestamp. 1276 * Returns the current site time in the configured timezone, formatted with the given mask. 1277 * 1278 * Wrapper around `wp_date()` (WordPress 5.5+) and falls back to 1279 * `date_i18n()` on older cores. The function takes a UNIX timestamp 1280 * (seconds since the epoch) and a PHP date format string. 1281 * 1282 * @param int $sec Unix timestamp in seconds. 1283 * @param string $format PHP date format mask. 
1284 * @return string Formatted date/time string. 1334 1285 */ 1335 1286 private function format_site_tz( $sec, $format ) { … … 1341 1292 1342 1293 /** 1343 * Computes the most recent known access time for a given bot. 1344 * 1345 * Why: The admin page needs to show the “last seen” across the 1346 * entire site. This method: 1347 * 1. Fetches the cached “latest post” option and reads the 1348 * associated meta. If that timestamp is newer, it is used. 1349 * 2. Reads the cached “latest URL” option (created during recording) 1350 * which holds the exact URL and UA. If that is newer than the 1351 * post timestamp, it wins. 1352 * 3. Returns an associative array (`ts`, `type`, `post_id`, `url`, `ua`). 1353 * 1354 * @param string $key Bot key. 1355 * @return array Latest visit info. 1294 * Computes the most recent event for a given bot across the entire site. 1295 * 1296 * The algorithm inspects: 1297 * 1. The post‑meta value of the last seen timestamp and associated UA for the last post that bot hit. 1298 * 2. The `dzcr_latest_url_{$key}` option that records the most recently accessed URL. 1299 * 3. Falls back to a default empty result if nothing is found. 1300 * 1301 * The function returns the candidate that has the greatest timestamp, together with 1302 * the type (“post” or “url”) and details of the location. 1303 * 1304 * @param string $key Internal bot identifier. 1305 * @return array{ts:float,type:string,post_id:int,url:string,ua:string} The best match. 1356 1306 */ 1357 1307 private function compute_agent_latest( $key ) { … … 1393 1343 1394 1344 /** 1395 * Adds “Settings” and “Documentation” links to the plugin’s row 1396 * in the WordPress Plugins page. 1397 * 1398 * Why: Site admins often look for quick access to the plugin’s 1399 * configuration. Adding these links saves a few clicks. 1400 * 1401 * @param array $links Existing action links. 1402 * @return array Updated array with new links. 
1345 * Adds “Settings” and “Documentation” links to the plugin’s row in the admin‑plugins page. 1346 * 1347 * The links open in the same window and in a new tab, respectively. 1348 * 1349 * @param array<string,string> $links Array of existing action links. 1350 * @return array<string,string> The extended links array. 1403 1351 */ 1404 1352 public function plugin_action_links( $links ) { 1405 $settings_url = admin_url( 'admin.php?page= cls-crawler-record');1353 $settings_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG ); 1406 1354 $docs_url = 'https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/'; 1407 1355 -
crawler-record/tags/0.9.0/readme.txt
r3400651 r3423919 3 3 Tags: googlebot, bingbot, gptbot, seo, robots 4 4 Requires at least: 6.0 5 Tested up to: 6. 85 Tested up to: 6.9 6 6 Requires PHP: 7.4 7 7 Stable tag: 0.8.0 … … 58 58 59 59 == Changelog == 60 = 0.9.0 = 61 * Now monitoring for Meta and Apple User Agents 62 * More accurate site-wide UA reporting. 63 * Ensured video tutorial appears on all admin screens. 64 * Fixed small code errors. 65 60 66 61 67 = 0.8.0 = -
crawler-record/tags/0.9.0/uninstall.php
r3400651 r3423919 63 63 64 64 // 2) Remove any known discrete options (if you introduced settings later). 65 $d iscrete_options = array(66 'dzcr_settings', // reserved for future settings array.67 'dzcr_agents_custom', // reserved if you ever allow custom agent configs.65 $dzcr_discrete_options = array( 66 'dzcr_settings', 67 'dzcr_agents_custom', 68 68 ); 69 69 70 foreach ( $d iscrete_options as $opt ) {71 delete_option( $ opt );70 foreach ( $dzcr_discrete_options as $dzcr_opt ) { 71 delete_option( $dzcr_opt ); 72 72 } 73 73 74 74 // 3) Optional: purge post meta set by the plugin (disabled by default). 75 75 // Enable by defining DZCR_PURGE_POSTMETA true in wp-config.php OR using the filter below. 76 $ purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )76 $dzcr_purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA ) 77 77 || apply_filters( 'dzcr_uninstall_purge_postmeta', false ); 78 78 79 if ( $ purge_postmeta ) {79 if ( $dzcr_purge_postmeta ) { 80 80 // Delete meta keys written per post: 81 81 // - _dzcr_last_seen_{agent} -
crawler-record/trunk/crawler-record.php
r3400643 r3423919 4 4 * Plugin URI: https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/ 5 5 * Description: Are the most common search and chat (AI/LLM) bots able to access the pages on your website? Crawler Record can tell you the last time each of the most common search/chat bots visited -and which pages at which they looked. 6 * Version: 0. 8.06 * Version: 0.9.0 7 7 * Requires at least: 6.0 8 * Tested up to: 6. 88 * Tested up to: 6.9 9 9 * Requires PHP: 7.4 10 10 * Author: dizzysoft … … 20 20 } 21 21 22 define( 'CRAWLER_RECORD_VERSION', '0.8.0' ); 22 define( 'CRAWLER_RECORD_VERSION', '0.9.0' ); 23 define( 'DZCR_ADMIN_SLUG', 'dzcr-crawler-record' ); 23 24 24 25 class Crawler_Record { 25 26 26 /** 27 * Holds a list of all bot groups and their UA patterns. 28 * 29 * Why: The agent list is small, stable, and rarely changes; storing 30 * it in code removes the need for a custom database table 31 * and keeps migration simple. The `apply_filters()` wrapper 32 * lets developers add or replace bots without touching this file. 
33 * 34 * @var array 35 */ 27 // https://www.searchenginejournal.com/ai-crawler-user-agents-list/558130/ 36 28 private $dzcr_default_agent_groups = [ 37 29 'Google' => [ … … 144 136 ], 145 137 138 'Meta' => [ 139 'doc' => 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/', 140 'items' => [ 141 'meta_externalagent' => [ 142 'label' => 'Meta-ExternalAgent', 143 'pattern' => '#meta-externalagent/\d+(?:\.\d+)?#i', 144 ], 145 'meta_webindexer' => [ 146 'label' => 'Meta-WebIndexer', 147 'pattern' => '#meta-webindexer/\d+(?:\.\d+)?#i', 148 ], 149 ], 150 ], 151 152 'Apple' => [ 153 'doc' => 'https://support.apple.com/en-us/119829', 154 'items' => [ 155 'applebot' => [ 156 'label' => 'Applebot', 157 'pattern' => '#Applebot/\d+(?:\.\d+)?#i', 158 ], 159 160 // Robots-only (does not crawl pages) 161 'applebot_extended' => [ 162 'label' => 'Applebot-Extended (AI)', 163 'pattern' => '#(?!.)#', 164 ], 165 ], 166 ], 167 146 168 'DuckDuckGo' => [ 147 169 'doc' => 'https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot', … … 169 191 ]; 170 192 171 172 /**173 * Cache of the resolved, filtered agent list.174 *175 * Why: Resolving the agent list once per request saves expensive work176 * (filtering & validation). 
Stored in memory for the duration of177 * the request; no persistence needed.178 *179 * @var array|null180 */181 193 private $dzcr_agents_effective_cache = null; 182 194 183 /**184 * Cache of the flat list of UA patterns keyed by the agent key.185 *186 * Why: Many look-ups need a simple key > regex map; building it once avoids187 * iterating through the nested agent group structure each time.188 *189 * @var array|null190 */191 195 private $dzcr_agents_flat_cache = null; 192 196 193 /**194 * Prefix used for per-post Meta keys that store the last timestamp a bot195 * visited the post.196 *197 * Why: Namespacing the meta key prevents collisions with other plugins.198 * The prefix is combined with the unique agent key.199 *200 * @var string201 */202 197 private $dzcr_meta_prefix = '_dzcr_last_seen_'; 203 198 204 /**205 * Prefix used for site-wide last-seen Meta keys (stored as options).206 *207 * Why: Allows quick access to the most recent time *any* page was seen by208 * each bot without scanning the entire database.209 *210 * @var string211 */212 199 private $dzcr_site_prefix = '_dzcr_site_last_'; 213 200 214 /**215 * Prefix used for per-URL last-seen keys (stored as options).216 *217 * Why: URLs are not tied to a post, so options keep arbitrary218 * key/value pairs without cluttering postmeta.219 *220 * @var string221 */222 201 private $dzcr_url_prefix = '_dzcr_url_last_'; 223 202 224 /**225 * Prefix used for the last-post-ID per bot (stored as options).226 *227 * Why: This is a fast index: if the last post ID is known we can fetch228 * its meta immediately, avoiding a full table scan.229 *230 * @var string231 */232 203 private $dzcr_lastpost_prefix = '_dzcr_last_post_'; 233 204 234 235 /**236 * Class constructor.237 *238 * Why: Hooks the core functionality into WordPress:239 * - `template_redirect` records when a crawler requests a page.240 * - `add_meta_boxes` adds the per-post "Crawler Record" box.241 * - `admin_bar_menu` inserts a quick toolbar entry for 
logged-in users.242 * - `admin_menu` registers the settings page in the admin sidebar.243 * - `plugin_action_links_*` adds Settings/Documentation links on the244 * Plugins screen.245 *246 * This is the entry point of the plugin – everything else is called247 * indirectly via these actions.248 */249 205 public function __construct() { 250 206 add_action( 'template_redirect', [ $this, 'maybe_record_last_seen' ] ); … … 258 214 public function enqueue_admin_assets() { 259 215 $screen = function_exists('get_current_screen') ? get_current_screen() : null; 260 if ( ! $screen || $screen->id !== 'toplevel_page_ crawler-record') {216 if ( ! $screen || $screen->id !== 'toplevel_page_' . DZCR_ADMIN_SLUG ) { 261 217 return; // only on the plugin admin page 262 218 } … … 265 221 'https://player.vimeo.com/api/player.js', 266 222 [], 267 $ver,223 CRAWLER_RECORD_VERSION, 268 224 true 269 225 ); … … 271 227 } 272 228 273 274 /** 275 * Retrieves the effective agent groups after filters, validation, 276 * and custom ordering have been applied. 277 * 278 * Why: The default list is processed once per request. This method 279 * returns a validated array that can be cached for all other 280 * look‑ups (`get_agent_groups()`). The validation step removes 281 * malformed entries and guarantees the presence of a `doc` 282 * URL and a `label` for each item. 283 * 284 * @return array The resolved, validated agent groups. 229 /** 230 * Lazily resolves the complete agent‑group tree, applying the `dzcr_agent_groups` 231 * and `dzcr_agents_order` filters, and caches the result for the lifetime of the request. 232 * 233 * The returned array maps group names to an array containing a `doc` URL and a list of 234 * individual bot definitions (`label` and `pattern`). The data is filtered so that 235 * malformed entries (missing patterns or labels) are removed automatically. 236 * 237 * @return array<string,array<string,mixed>> The fully‑validated and ordered agent groups. 
285 238 */ 286 239 private function get_agent_groups() { … … 332 285 333 286 /** 334 * Returns a flattened map of bot keys to regex pattern strings. 335 * 336 * Why: The recording logic (`maybe_record_last_seen`) needs a 337 * simple `foreach ($agent_key => $pattern)` loop. Flattening 338 * removes the overhead of walking the nested group structure 339 * for every request. 340 * 341 * @return array Key → regex string. 287 * Returns a flattened map of every bot key to its compiled regular‑expression pattern. 288 * 289 * The function iterates over the effective agent groups obtained via {@see get_agent_groups()} 290 * and builds a key‑=>pattern list. The result is cached for the current request. 291 * 292 * @return array<string,string> Bot key → regex pattern. 342 293 */ 343 294 private function get_agents_flat() { … … 356 307 357 308 /** 358 * Looks up the human‑readable label for a bot key. 359 * 360 * Why: The UI shows the label, not the internal key. The helper 361 * centralises the lookup and provides a default in case the 362 * key is unknown. 363 * 364 * @param string $key Internal bot identifier. 365 * @return string Human‑readable label. 309 * Looks up a human‑readable label for a bot key in the current agent‑group configuration. 310 * 311 * If the key cannot be found, its raw value is returned as a fallback. This helper is 312 * used to keep UI code terse while still presenting friendly names to site editors. 313 * 314 * @param string $key The internal identifier for a bot (e.g. 'googlebot_desktop'). 315 * @return string The user‑friendly label. 366 316 */ 367 317 private function get_label_for_key( $key ) { … … 375 325 376 326 /** 377 * Builds the canonical URL of the current front‑end request. 378 * 379 * Why: Recorded timestamps are stored per‑URL, so we need an exact, 380 * normalised URL. 
The function normalises the scheme 381 * (`http/https`), host, path, and applies the 382 * `dzcr_normalize_url` filter (useful for stripping tracking 383 * parameters or converting to a canonical form). 384 * 385 * @return string Full URL (or empty string in admin/invalid state). 386 */ 387 private function current_url() { 388 if ( is_admin() ) { 327 * Generates the canonical, normalized permalink for the queried singular post. 328 * 329 * The function guarantees the returned URL does **not** contain a query string, 330 * a fragment identifier, or an empty path. The result is suitable for storage 331 * in post meta and for robots‑txt matching. 332 * 333 * @param int $post_id By reference, receives the resolved post ID or 0 if no singular. 334 * @return string The sanitized absolute URL, or an empty string when no query can be resolved. 335 */ 336 private function current_url_for_post( &$post_id = 0 ) { 337 if ( is_admin() || ! is_singular() ) { 389 338 return ''; 390 339 } 391 if ( ! isset( $_SERVER['HTTP_HOST'], $_SERVER['REQUEST_URI'] ) ) { 340 341 $post_id = (int) get_queried_object_id(); 342 if ( ! $post_id ) { 392 343 return ''; 393 344 } 394 $scheme = ( ! empty( $_SERVER['HTTPS'] ) && 'off' !== $_SERVER['HTTPS'] ) ? 'https' : 'http'; 395 $host = sanitize_text_field( wp_unslash( $_SERVER['HTTP_HOST'] ) ); 396 $uri = esc_url_raw( wp_unslash( $_SERVER['REQUEST_URI'] ) ); 397 398 if ( '' === $host ) { 345 346 $url = get_permalink( $post_id ); 347 if ( ! $url ) { 399 348 return ''; 400 349 } 401 $url = $scheme . '://' . $host . $uri; 402 $url = apply_filters( 'dzcr_normalize_url', $url ); 403 if ( strlen( $url ) > 2048 ) { 404 $url = substr( $url, 0, 2048 ); 405 } 406 return $url; 407 } 408 409 /** 410 * Records the last “seen” timestamp when a crawler requests a page. 411 * 412 * Why: This method is the heart of the plugin. It is called on 413 * `template_redirect` and performs the following actions: 414 * 415 * 1. 
Skips previews, feeds, REST, admin, AJAX, and cron requests. 416 * 2. Only processes GET/HEAD requests, which are the standard 417 * HTTP verbs used by crawlers. 418 * 3. Trims and sanitises the UA string (max 512 chars) and 419 * normalises it to avoid runaway regexes. 420 * 4. Throttles writes using a 10‑minute window (configurable via 421 * `dzcr_throttle_window`) to avoid spamming the DB when the same 422 * bot visits the page repeatedly. 423 * 5. Stores three pieces of information: 424 * - Per‑post meta (`_dzcr_last_seen_[bot]`). 425 * - Per‑URL options (`_dzcr_url_last_[bot]_[hash]`) plus companion 426 * `_url_` and `_ua_` options. 427 * - Site‑wide option (`_dzcr_site_last_[bot]`). 428 * 429 * The method also updates the “last UA string” meta/option so the 430 * UI can show exactly what the bot sent. 431 * 350 351 // Normalize: strip query string and fragment 352 $parts = wp_parse_url( $url ); 353 if ( empty( $parts['host'] ) || empty( $parts['path'] ) ) { 354 return ''; 355 } 356 357 $scheme = $parts['scheme'] ?? 'https'; 358 return $scheme . '://' . $parts['host'] . $parts['path']; 359 } 360 361 /** 362 * Core routine that records bot visit timestamps when the current request matches a known crawler. 363 * 364 * The method is executed early in the front‑end rendering cycle (`template_redirect`). 365 * 1. It exits immediately for admin screens, REST requests, feeds, `HEAD` requests, or 366 * when the `User‑Agent` header is missing or too long. 367 * 2. The UA string is matched against the compiled regex list. Only the first matching 368 * rule is considered. 369 * 3. A per‑post meta key and a global option are updated if the throttle window 370 * (10 minutes by default) allows it. The method also records the exact UA string 371 * that triggered the update so editors can inspect it later. 372 * 373 * @param int $post_id Optional post ID when the query resolves to a single post. Passed by value. 
432 374 * @return void 375 * @throws RuntimeException If the PHP `set_transient` failures occur (unlikely, but reported). 433 376 */ 434 377 public function maybe_record_last_seen() { … … 445 388 446 389 $now = microtime( true ); 447 $post_id = is_singular() ? (int) get_queried_object_id() : 0; 448 $ url = $this->current_url();449 $url hash = $url ? md5( $url ) : '';390 391 $post_id = 0; 392 $url = $this->current_url_for_post( $post_id ); 450 393 451 394 $throttle = (int) apply_filters( 'dzcr_throttle_window', 10 * MINUTE_IN_SECONDS ); … … 470 413 } 471 414 472 // Per URL 473 if ( $urlhash ) { 474 $opt_key = $this->dzcr_url_prefix . $key . '_' . $urlhash; 475 $opt_url_key = $this->dzcr_url_prefix . $key . '_url_' . $urlhash; 476 $opt_ua_key = $this->dzcr_url_prefix . $key . '_ua_' . $urlhash; // store last UA for this URL+agent 477 $t_url = 'dzcr_seen_url_' . $key . '_' . $urlhash; 478 479 if ( ! $throttle || ! get_transient( $t_url ) ) { 480 $prev = (float) get_option( $opt_key, 0 ); 481 if ( $now > $prev ) { 482 update_option( $opt_key, (string) $now, false ); 483 update_option( $opt_url_key, $url, false ); 484 update_option( $opt_ua_key, $ua, false ); 485 } 486 if ( $throttle ) { 487 set_transient( $t_url, 1, $throttle ); 488 } 415 // Site-wide latest real post/page 416 if ( $post_id && $url ) { 417 $latest_key = 'dzcr_latest_url_' . $key; 418 $prev = get_option( $latest_key, [] ); 419 $prev_ts = isset( $prev['ts'] ) ? (float) $prev['ts'] : 0.0; 420 421 if ( $now > $prev_ts ) { 422 update_option( 423 $latest_key, 424 [ 425 'ts' => (string) $now, 426 'url' => $url, 427 'ua' => $ua, 428 ], 429 false 430 ); 489 431 } 490 432 } … … 501 443 502 444 /** 503 * Safely matches a UA string against a stored pattern or literal value. 504 * 505 * Why: Direct use of `preg_match()` on untrusted user‑agent strings 506 * can trigger PHP warnings/errors (e.g. malformed regex). 507 * This wrapper: 508 * - Ensures the pattern begins with a `#` and ends with `#i`. 
509 * - Suppresses `preg_match()` errors with the `@` operator. 510 * - Validates that the regex is syntactically correct. 511 * - Fallbacks to literal `hash_equals()` comparison when a plain string 512 * is supplied. 513 * 514 * @param string $ua The UA string from the request. 515 * @param string $pattern_or_exact Either a regex string or a literal. 516 * @return bool True if the UA matches the pattern or string. 445 * Safely tests whether a given User‑Agent string matches a pattern. 446 * 447 * Handles both plain strings (exact comparison) and regular‑expressions by inspecting 448 * the first character. The function avoids the costly `preg_match` errors through 449 * the `@` error‑control operator and checks the regex error code. 450 * 451 * @param string $ua The User‑Agent string to test. 452 * @param string $pattern_or_exact Either a regex (enclosed in `#...#i`) or a literal string. 453 * @return bool True if the UA satisfies the pattern; otherwise false. 517 454 */ 518 455 private function ua_matches_safe( $ua, $pattern_or_exact ) { … … 542 479 543 480 /** 544 * Registers the “Crawler Record” meta‑box that appears on the post 545 * editing screen. 546 * 547 * Why: Post editors need instant insight into which crawlers have 548 * visited the post and their access status. The meta‑box 549 * appears in the normal “normal” context with high priority, 550 * making it highly visible without adding extra menu items. 481 * Registers the “Crawler Record” meta‑box on the post editing screen. 482 * 483 * The meta‑box is added to the *Advanced* context so that it does not clutter the editor 484 * for non‑technical users. It is only visible to capable editors (users with 485 * `edit_post` capability). 486 * 551 487 */ 552 488 public function register_meta_box() { … … 562 498 563 499 /** 564 * Renders the per‑post “Crawler Record” meta‑box. 565 * 566 * Why: The box shows a table of all bots, each with: 567 * - A human‑readable label. 
568 * - The last time the bot accessed the post (`Last Seen`). 569 * - A status badge that tells you whether the URL is allowed by 570 * `robots.txt`. 571 * - A collapsible section that reveals the exact UA string 572 * the bot used when it visited. 573 * 574 * The function also prints a warning if the site is globally 575 * blocking all crawlers (WordPress “Discourage search‑engine” setting). 576 * 577 * @param WP_Post $post Current post object. 500 * Renders the contents of the “Crawler Record” meta‑box. 501 * 502 * Generates a table that lists every tracked agent, its last‑seen timestamp, and a 503 * robots‑txt allow/blocked status. The UI includes a collapsible `details` element 504 * that shows the exact User‑Agent string used for the most recent hit. 505 * 506 * @param WP_Post $post The post object currently being edited. 578 507 * @return void 579 508 */ … … 618 547 'diag' => $diag, 619 548 'ua' => $last_ua, 620 'na' => ( 'google_extended' === $key ), // treat Google-Extended as non-crawler549 'na' => $this->is_robots_only_agent( $key ), 621 550 ]; 622 551 } … … 662 591 663 592 /** 664 * Adds a “Crawler Record” entry to the front‑end admin bar. 665 * 666 * Why: Site admins often browse the front‑end while logged in. 667 * The toolbar provides a one‑click shortcut to the 668 * detailed bot‑status page and shows a quick list of 669 * timestamps for each bot, whether looking at a single page, 670 * a specific URL, or the entire site. 671 * 672 * @param WP_Admin_Bar $wp_admin_bar Admin‑bar object. 593 * Adds a “Crawler Record” node to the front‑end admin bar with context‑aware children. 594 * 595 * The node is only shown to logged‑in users viewing the front‑end (not the admin). Depending 596 * on whether the current page is singular or generic, the children list will display 597 * per‑post or site‑wide last‑seen data for each crawler. 598 * 599 * @param WP_Admin_Bar $wp_admin_bar The admin bar instance. 
673 600 * @return void 674 601 */ … … 679 606 } 680 607 681 $admin_page_url = admin_url( 'admin.php?page= cls-crawler-record');608 $admin_page_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG ); 682 609 683 610 // Parent link … … 700 627 701 628 foreach ( $wrap['items'] as $key => $_def ) { 702 if ( 'google_extended' === $key) {703 continue; // do not show Google-Extended in admin bar629 if ( $this->is_robots_only_agent( $key ) ) { 630 continue; 704 631 } 632 705 633 $label = $this->get_label_for_key( $key ); 706 634 … … 736 664 737 665 /** 738 * Registers the plugin’s top‑level admin page. 739 * 740 * Why: The settings page is a convenient place to: 741 * - See all bots and their global timestamps. 742 * - Inspect the last page each bot has seen. 743 * - View robots.txt diagnostics for each bot. 666 * Registers the top‑level “Crawler Record” admin submenu page. 667 * 668 * The submenu is created under the **Settings** root so that it is consistently 669 * reachable regardless of the user role or the presence of other plugins. 744 670 * 745 671 * @return void … … 750 676 __( 'Crawler Record', 'crawler-record' ), 751 677 'manage_options', 752 'cls-crawler-record',678 DZCR_ADMIN_SLUG, 753 679 [ $this, 'render_admin_page' ], 754 680 'dashicons-search', … … 758 684 759 685 /** 760 * Renders the entire admin page. 761 * 762 * Why: The page shows a per‑bot table containing: 763 * - The bot’s human label. 764 * - The most recent global timestamp. 765 * - The link to the most recent page the bot visited. 766 * - A status badge indicating robots.txt permissibility. 767 * 768 * The table also has a useful “Important” notice if 769 * WordPress is blocking all crawlers. 686 * Renders the public dashboard for all tracked agents. 687 * 688 * The page lists each bot’s last‑seen timestamp, the last URL it visited, and the 689 * result of a robots‑txt evaluation for the site’s home path. A small embedded video 690 * tutorial is shown in the right‑hand column. 
770 691 * 771 692 * @return void … … 847 768 : $this->robots_status_for_agent_path( $key, $home_path, $diag ); 848 769 849 $show_last_fields_as_na = ( 'google_extended' ===$key );770 $show_last_fields_as_na = $this->is_robots_only_agent( $key ); 850 771 851 772 // Agent cell (expandable UA string unless NA) … … 879 800 880 801 /** 881 * Checks if the site is configured to “Discourage search engines”. 882 * 883 * Why: If WordPress’s global setting is enabled, all crawlers are 884 * blocked regardless of `robots.txt`. The function lets us 885 * quickly short‑circuit further checks. 886 * 887 * @return bool True if the site is discouraging crawlers. 802 * Determines if WordPress is currently discouraging search engines (`blog_public` = 0). 803 * 804 * The helper is used to short‑circuit logic that would otherwise record activity, 805 * and to display a warning banner on admin screens. 806 * 807 * @return bool True when site‑wide crawling is disabled. 888 808 */ 889 809 private function is_site_discouraged() { … … 892 812 893 813 /** 894 * Returns a status array for the “blocked by WordPress” case. 895 * 896 * Why: Used when `is_site_discouraged()` is true; the admin UI 897 * needs a consistent structure (`state` & `reason`) for 898 * rendering the status badge. 899 * 900 * @return array `state` (blocked) and a human‑readable reason. 814 * Builds a standard “blocked by WordPress setting” status array. 815 * 816 * Returned from {@see robots_status_for_agent_path()} when the site is 817 * configured to discourage search engines. 818 * 819 * @return array{state:string,reason:string} Blocking information. 901 820 */ 902 821 private function forced_block_status() { … … 908 827 909 828 /** 910 * Maps an internal bot key to the list of UA strings that the bot 911 * uses (used to choose the “most representative” UA for robots.txt 912 * checks). 913 * 914 * Why: Different bots have distinct patterns for desktop/mobile, 915 * legacy, and AI‑bot forms. 
This helper returns the set of 916 * tokens that must be matched against the `robots.txt` filter 917 * logic. 918 * 919 * @param string $key Internal bot key. 920 * @return array List of tokens. 829 * Maps an internal bot key to the canonical User‑Agent string(s) that can appear 830 * in the site’s robots.txt. 831 * 832 * The mapping is used for robots‑txt group selection. When a key maps to 833 * multiple tokens it is returned as an array of strings; otherwise a single‑element array. 834 * 835 * @param string $key The internal identifier for a bot. 836 * @return array<string> The list of UA strings to check in robots.txt. 921 837 */ 922 838 private function robots_tokens_for_key(string $key): array { … … 953 869 case 'perplexity_user': return ['Perplexity-User']; // normalize Unicode hyphens 954 870 871 // Meta 872 case 'meta_externalagent': 873 return ['meta-externalagent']; 874 case 'meta_webindexer': 875 return ['Meta-WebIndexer']; 876 877 // Apple 878 case 'applebot': 879 return ['Applebot']; 880 case 'applebot_extended': 881 return ['Applebot-Extended']; 882 883 955 884 // DuckDuckGo 956 885 case 'duckduckgo_search': return ['DuckDuckBot']; … … 964 893 965 894 /** 966 * Chooses the appropriate `User‑Agent` group in robots.txt for 967 * a given bot. 968 * 969 * Why: A robot may have multiple `User‑Agent` lines (desktop/mobile). 970 * The longest exact match wins; if none match we fallback to 971 * the wildcard group (`*`). This function returns the entire 972 * group structure (agents + rules) for that bot. 973 * 974 * @param string $robots_txt Raw text of robots.txt. 975 * @param array $tokens List of tokens that identify the bot. 976 * @return array Group with `agents` and `rules` keys. 895 * Returns true for bots that only check robots.txt and never fetch pages. 896 * 897 * For these bots the per‑post “last‑seen” columns are marked “N/A” because it is 898 * impossible to record an actual page visit. 
899 * 900 * @param string $key The internal bot identifier. 901 * @return bool True when the bot is robots‑only. 902 */ 903 private function is_robots_only_agent( string $key ): bool { 904 return in_array( 905 $key, 906 [ 907 'google_extended', 908 'applebot_extended', 909 ], 910 true 911 ); 912 } 913 914 /** 915 * Chooses the most specific robots.txt group that matches the requested bot tokens. 916 * 917 * The grouping algorithm follows the order of rules in the file: a group that 918 * lists a specific User‑Agent token and the longest matching rule wins. 919 * If no group matches, the first wildcard `*` group is returned (if present). 920 * 921 * @param string $robots_txt Raw robots.txt contents. 922 * @param array<string> $tokens List of canonical UA strings for the bot. 923 * @return array{agents:list<string>,rules:list<mixed>} The selected group’s agents and rules. 977 924 */ 978 925 private function select_robots_group(string $robots_txt, array $tokens) { … … 1003 950 1004 951 /** 1005 * Checks whether a path starts with the rule prefix. 1006 * 1007 * Why: `robots.txt` rules are prefix based; e.g. `Disallow: /wp-admin` 1008 * blocks everything under `/wp-admin`. A simple `strncmp()` 1009 * suffices because we already normalised the rule to start with 1010 * a slash and have removed any trailing `$` terminator. 1011 * 1012 * @param string $path Path from the request. 1013 * @param string $rule Rule from robots.txt. 1014 * @return bool True if the path matches the rule. 952 * Tests whether the supplied robots rule path is a strict prefix of $path. 953 * 954 * The helper is used by {@see robots_txt_allows()} when applying the longest‑prefix 955 * rule logic. 956 * 957 * @param string $path The request path. 958 * @param string $rule The rule path defined in robots.txt. 959 * @return bool True when $rule is a prefix of $path. 
1015 960 */ 1016 961 private function path_prefix_match(string $path, string $rule): bool { … … 1019 964 1020 965 /** 1021 * Normalises hyphen characters in a string; replaces various Unicode 1022 * hyphens with the ASCII hyphen. 1023 * 1024 * Why: Some bots (e.g., Perplexity) use non‑standard hyphens in 1025 * their UA strings. Normalising them guarantees that 1026 * string‑based matching (e.g. `strcasecmp()`) works 1027 * regardless of the particular hyphen glyph. 966 * Normalises a URL path so that an empty string becomes “/”. 967 * 968 * An empty path can cause subtle bugs in robots‑txt evaluation and is treated 969 * as the root path by WordPress. 970 * 971 * @param string $p The original path. 972 * @return string Normalised path. 973 */ 974 private function normalize_path(string $p): string { 975 return ($p === '') ? '/' : $p; 976 } 977 978 /** 979 * Replaces every form of Unicode hyphen (–, ‑, -, …) with a simple ASCII hyphen. 980 * 981 * The rule set is chosen to avoid false mismatches when a site’s robots.txt 982 * contains variant hyphens in its UA tokens. 1028 983 * 1029 984 * @param string $s Input string. 1030 985 * @return string Normalised string. 1031 986 */ 1032 private function normalize_path(string $p): string {1033 return ($p === '') ? '/' : $p;1034 }1035 1036 /**1037 * Normalises hyphen characters in a string; replaces various Unicode1038 * hyphens with the ASCII hyphen.1039 *1040 * Why: Some bots (e.g., Perplexity) use non‑standard hyphens in1041 * their UA strings. Normalising them guarantees that1042 * string‑based matching (e.g. `strcasecmp()`) works1043 * regardless of the particular hyphen glyph.1044 *1045 * @param string $s Input string.1046 * @return string Normalised string.1047 */1048 987 private function normalize_hyphens(string $s): string { 1049 988 return preg_replace('/[\x{2010}-\x{2015}\x{2212}\x{FE58}\x{FE63}\x{FF0D}]/u','-',$s); … … 1051 990 1052 991 /** 1053 * Parses a raw `robots.txt` string into an array of groups. 
1054 * 1055 * Why: The plugin re‑implements the minimal parsing logic needed 1056 * for our use‑case (User‑Agent, Allow, Disallow). The function 1057 * ignores other directives (Sitemap, Crawl‑delay) and keeps 1058 * groups and rules in the order they appear, which is essential 1059 * for the longest‑path‑wins logic. 1060 * 1061 * @param string $txt Raw content of robots.txt. 1062 * @return array Array of groups, each with `agents` and `rules`. 992 * Parses the raw robots.txt into an array of agent‑group objects. 993 * 994 * The parser emits structures of the form: 995 * [{'agents'=>[...], 'rules'=>[['allow'|'disallow', '/path'], ...]}, ...] 996 * * Non‑essential lines (`Sitemap:`, `Crawl‑delay:`, comments) are ignored. 997 * * Blank lines reset state, but a group is only flushed when a directive is encountered. 998 * 999 * @param string $txt Raw robots.txt string. 1000 * @return array<List<string>,list<mixed>> List of groups, each with `agents` and `rules`. 1063 1001 */ 1064 1002 private function parse_robots_groups(string $txt): array { … … 1105 1043 1106 1044 /** 1107 * Retrieves the raw `robots.txt` content for the current site.1108 * 1109 * Why: We avoid an HTTP request to the public `robots.txt`.1110 * WordPress can generate it via `do_robots()` or the file can1111 * be read directly. The function falls back to the filter1112 * `robots_txt` if no file or `do_robots()` output is1113 * available.1114 * 1115 * @return string|null Raw robots.txt body or null if unavailable.1045 * Retrieves the active robots.txt for the current site. 1046 * 1047 * The method respects the order imposed by the WordPress core: 1048 * 1. Physical `robots.txt` file 1049 * 2. `wp_robots()` (WP 5.7+) 1050 * 3. `do_robots` action 1051 * 4. `robots_txt` filter 1052 * 1053 * @return string|null The robots.txt contents, or `null` if none could be generated. 1116 1054 */ 1117 1055 private function get_local_robots_txt() { 1056 // 1.
Physical robots.txt file takes precedence 1118 1057 $file = ABSPATH . 'robots.txt'; 1119 1058 if ( @is_readable( $file ) ) { 1120 1059 $body = @file_get_contents( $file ); 1121 if ( is_string( $body ) && '' !== $body ) return $body; 1122 } 1060 if ( is_string( $body ) && '' !== trim( $body ) ) { 1061 return $body; 1062 } 1063 } 1064 1065 // 2. Prefer wp_robots() when available (WP 5.7+) 1066 if ( function_exists( 'wp_robots' ) ) { 1067 ob_start(); 1068 wp_robots(); 1069 $out = ob_get_clean(); 1070 if ( is_string( $out ) && '' !== trim( $out ) ) { 1071 return $out; 1072 } 1073 } 1074 1075 // 3. Fallback to core do_robots action 1076 // do_robots is a WordPress core action used to generate robots.txt output. 1123 1077 ob_start(); 1078 // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound 1124 1079 do_action( 'do_robots' ); 1125 1080 $out = ob_get_clean(); 1126 if ( is_string( $out ) && '' !== trim( $out ) ) return $out; 1127 1081 if ( is_string( $out ) && '' !== trim( $out ) ) { 1082 return $out; 1083 } 1084 1085 // 4. Final fallback via robots_txt filter 1086 // phpcs:ignore WordPress.NamingConventions.PrefixAllGlobals.NonPrefixedHooknameFound 1128 1087 $output = apply_filters( 'robots_txt', '', get_option( 'blog_public' ) ); 1129 if ( is_string( $output ) && '' !== trim( $output ) ) return $output; 1088 if ( is_string( $output ) && '' !== trim( $output ) ) { 1089 return $output; 1090 } 1130 1091 1131 1092 return null; … … 1133 1094 1134 1095 /** 1135 * Returns true if the robots.txt rules permit the bot to access the path. 1136 * 1137 * Why: The function implements a simplified rule engine that: 1138 * 1. Chooses the best User‑Agent group for the bot (exact match 1139 * or wildcard). 1140 * 2. Normalises the path and checks it against each rule in the 1141 * selected group. 1142 * 3. Uses longest‑path‑wins; disallow wins only if it 1143 * matches a longer path than any allow. If no rule matches, 1144 * the URL is allowed by default. 
1145 * 1146 * @param string $robots_txt Raw robots.txt text. 1147 * @param string $agent_key Internal bot key. 1148 * @param string $path Requested path. 1149 * @return bool True if the path is allowed for the bot. 1096 * Determines whether the given bot is allowed to fetch $path based on the active robots.txt. 1097 * 1098 * Implements the standard longest‑prefix‑wins rule for a single bot’s group. The method 1099 * returns `true` for ``allow`` or for paths that match no rule (default allow). 1100 * 1101 * @param string $robots_txt The retrieved robots.txt contents. 1102 * @param string $agent_key The internal identifier for the bot. 1103 * @param string $path The requested URL path (starting with “/”). 1104 * @return bool True when crawling is permitted, false when disallowed. 1150 1105 */ 1151 1106 private function robots_txt_allows(string $robots_txt, string $agent_key, string $path): bool { … … 1175 1130 1176 1131 /** 1177 * Public wrapper that provides a “state” + “reason” array describing1178 * whether the bot is allowed to crawl the path, and populates1179 * diagnostics output for the UI.1180 * 1181 * Why: The admin pages display a status badge; this method produces1182 * the data the badge expects and also records diagnostic1183 * information (matched group & rule) that can be shown in1184 * an expandable details box.1185 * 1186 * @param string $ agent_key Bot key.1187 * @param string $path Requested path.1188 * @param array &$diag Output diagnostics (group, rule).1189 * @return array (`state` => 'allowed'|'blocked'|'unknown', `reason` => string).1132 * Public API that reports the allow/blocked status for a bot and a specific path. 1133 * 1134 * The method first checks whether the site globally discourages crawlers (`blog_public=0`). 1135 * If not, it loads the robots.txt, selects the appropriate group, and evaluates 1136 * the path. A detailed diagnostic array (group name, rule used) is returned 1137 * alongside the status. 
When a global block applies, the status is `blocked` 1138 * with a specific reason. 1139 * 1140 * @param string $agent_key The bot’s internal key (e.g. 'googlebot_desktop'). 1141 * @param string $path Path fragment (starting with “/”) to test. 1142 * @param array|null $diag Optional reference parameter that receives diagnostics about 1143 * the matching group and rule; can be omitted if not needed. 1144 * @return array{state:string,reason:string}<p>Possible `state` values: `allowed`, `blocked`, `unknown`.</p> 1190 1145 */ 1191 1146 public function robots_status_for_agent_path( $agent_key, $path, &$diag = null ) { … … 1210 1165 1211 1166 /** 1212 * Renders a status badge and an expandable details section. 1213 * 1214 * Why: The status badge (green check, red X, gray) gives a quick visual 1215 * cue. The details expand on hover/click and show the exact 1216 * robots.txt group and rule that determined the decision. 1217 * 1218 * @param array $status Associative array returned by 1219 * `robots_status_for_agent_path`. 1220 * @param array $diag Diagnostics array (group & rule). 1221 * @return string HTML safe snippet for the badge + details. 1167 * Renders a colour‑coded badge indicating the robots.txt status and a collapsible 1168 * details section that shows which rule was matched. 1169 * 1170 * The badge colors are: 1171 * * Green with a checkmark for “allowed” 1172 * * Red with a cross for “blocked” 1173 * * Grey for “unknown” 1174 * 1175 * @param array{state:string,reason:string} $status Returned from {@see robots_status_for_agent_path()}. 1176 * @param array{group:string,rule:string} $diag Diagnostics array for detail output. 1177 * @return string Safe HTML for the badge + details element. 1222 1178 */ 1223 1179 private function render_status_badge_expandable( $status, $diag ) { … … 1244 1200 1245 1201 /** 1246 * Formats a timestamp in “human‑readable” form and a precise 1247 * full‑timestamp (with micro‑seconds) for display. 
1248 * 1249 * Why: Users want to see “3 days ago (2025‑08‑12 15:04:23.123456)”. 1250 * This helper keeps the code in the main rendering loop 1251 * terse and centralises the formatting logic. 1252 * 1253 * @param float $ts Timestamp from the database. 1254 * @return string HTML safe representation. 1202 * Formats a UNIX timestamp (with microseconds) into a human‑readable “time‑ago” string 1203 * plus the exact date/time in the site’s configured timezone. 1204 * 1205 * The output is safe for HTML rendering and is not localized beyond what 1206 * WordPress’ `human_time_diff()` and `wp_date()` provide. 1207 * 1208 * @param float $ts Timestamp value returned by `microtime(true)`. 1209 * @return string The formatted cell content; if $ts is falsy, “Not Yet” is returned. 1255 1210 */ 1256 1211 private function format_last_seen_cell( $ts ) { … … 1270 1225 1271 1226 /** 1272 * Formats the string shown in the admin bar for each bot. 1273 * 1274 * Why: The toolbar entry should be compact (label + timestamp) 1275 * but still show the exact time. This helper keeps the 1276 * formatting consistent between the toolbar and the 1277 * meta‑box. 1278 * 1279 * @param string $label Label for the bot. 1280 * @param float $ts Timestamp (may be 0). 1281 * @param string $suffix Optional small label (e.g., “today”). 1282 * @return string Safe html string. 1227 * Builds the display string used in admin‑bar nodes. 1228 * 1229 * It shows the label, the relative “time‑ago” string, and the absolute timestamp 1230 * (with microseconds). If no timestamp is available, “Not Yet” is used. 1231 * 1232 * @param string $label Human‑readable name of the agent. 1233 * @param float $ts Timestamp value or 0 for “Not Yet”. 1234 * @param string $suffix Optional suffix string to append after the label. 1235 * @return string The formatted admin‑bar line. 
1283 1236 */ 1284 1237 private function format_admin_bar_line( $label, $ts, $suffix = '' ) { … … 1297 1250 1298 1251 /** 1299 * Formats the “last page” cell that appears in the admin page. 1300 * 1301 * Why: In the list view we only want the URL, not the post title. 1302 * The helper converts the stored data from `compute_agent_latest()` 1303 * into a link or an “–” if the data is missing. 1304 * 1305 * @param array $latest Associative array from `compute_agent_latest`. 1306 * @return string HTML markup. 1252 * Formats the “last page” cell of the admin dashboard table. 1253 * 1254 * Accepts either a post ID or a raw URL string. The output is a clickable link 1255 * that opens the page in a new tab. Empty or missing values become a dash. 1256 * 1257 * @param array{ts:float,type:string,post_id:int,url:string,ua:string} $latest Data from {@see compute_agent_latest()}. 1258 * @return string The safe HTML for the link or a dash. 1307 1259 */ 1308 1260 private function format_context_cell_url_only( $latest ) { … … 1322 1274 1323 1275 /** 1324 * Converts a UTC timestamp to the administrator’s timezone string. 1325 * 1326 * Why: All timestamps are stored in UTC; showing them in the 1327 * local timezone (configured in Settings → General) 1328 * is far more user‑friendly. The helper uses `wp_date()` if 1329 * available, else falls back to `date_i18n()`. 1330 * 1331 * @param int $sec Unix timestamp. 1332 * @param string $format Optional format string (uses WordPress defaults if omitted). 1333 * @return string Localised timestamp. 1276 * Returns the current site time in the configured timezone, formatted with the given mask. 1277 * 1278 * Wrapper around `wp_date()` (WordPress 5.5+) and falls back to 1279 * `date_i18n()` on older cores. The function takes a UNIX timestamp 1280 * (seconds since the epoch) and a PHP date format string. 1281 * 1282 * @param int $sec Unix timestamp in seconds. 1283 * @param string $format PHP date format mask. 
1284 * @return string Formatted date/time string. 1334 1285 */ 1335 1286 private function format_site_tz( $sec, $format ) { … … 1341 1292 1342 1293 /** 1343 * Computes the most recent known access time for a given bot. 1344 * 1345 * Why: The admin page needs to show the “last seen” across the 1346 * entire site. This method: 1347 * 1. Fetches the cached “latest post” option and reads the 1348 * associated meta. If that timestamp is newer, it is used. 1349 * 2. Reads the cached “latest URL” option (created during recording) 1350 * which holds the exact URL and UA. If that is newer than the 1351 * post timestamp, it wins. 1352 * 3. Returns an associative array (`ts`, `type`, `post_id`, `url`, `ua`). 1353 * 1354 * @param string $key Bot key. 1355 * @return array Latest visit info. 1294 * Computes the most recent event for a given bot across the entire site. 1295 * 1296 * The algorithm inspects: 1297 * 1. The post‑meta value of the last seen timestamp and associated UA for the last post that bot hit. 1298 * 2. The `dzcr_latest_url_{$key}` option that records the most recently accessed URL. 1299 * 3. Falls back to a default empty result if nothing is found. 1300 * 1301 * The function returns the candidate that has the greatest timestamp, together with 1302 * the type (“post” or “url”) and details of the location. 1303 * 1304 * @param string $key Internal bot identifier. 1305 * @return array{ts:float,type:string,post_id:int,url:string,ua:string} The best match. 1356 1306 */ 1357 1307 private function compute_agent_latest( $key ) { … … 1393 1343 1394 1344 /** 1395 * Adds “Settings” and “Documentation” links to the plugin’s row 1396 * in the WordPress Plugins page. 1397 * 1398 * Why: Site admins often look for quick access to the plugin’s 1399 * configuration. Adding these links saves a few clicks. 1400 * 1401 * @param array $links Existing action links. 1402 * @return array Updated array with new links. 
1345 * Adds “Settings” and “Documentation” links to the plugin’s row in the admin‑plugins page. 1346 * 1347 * The links open in the same window and in a new tab, respectively. 1348 * 1349 * @param array<string,string> $links Array of existing action links. 1350 * @return array<string,string> The extended links array. 1403 1351 */ 1404 1352 public function plugin_action_links( $links ) { 1405 $settings_url = admin_url( 'admin.php?page= cls-crawler-record');1353 $settings_url = admin_url( 'admin.php?page=' . DZCR_ADMIN_SLUG ); 1406 1354 $docs_url = 'https://www.dizzysoft.com/crawler-record-plugin-for-wordpress/'; 1407 1355 -
crawler-record/trunk/readme.txt
r3400643 r3423919 3 3 Tags: googlebot, bingbot, gptbot, seo, robots 4 4 Requires at least: 6.0 5 Tested up to: 6. 85 Tested up to: 6.9 6 6 Requires PHP: 7.4 7 7 Stable tag: 0.8.0 … … 58 58 59 59 == Changelog == 60 = 0.9.0 = 61 * Now monitoring for Meta and Apple User Agents 62 * More accurate site-wide UA reporting. 63 * Ensured video tutorial appears on all admin screens. 64 * Fixed small code errors. 65 60 66 61 67 = 0.8.0 = -
crawler-record/trunk/uninstall.php
r3366584 r3423919 63 63 64 64 // 2) Remove any known discrete options (if you introduced settings later). 65 $d iscrete_options = array(66 'dzcr_settings', // reserved for future settings array.67 'dzcr_agents_custom', // reserved if you ever allow custom agent configs.65 $dzcr_discrete_options = array( 66 'dzcr_settings', 67 'dzcr_agents_custom', 68 68 ); 69 69 70 foreach ( $d iscrete_options as $opt ) {71 delete_option( $ opt );70 foreach ( $dzcr_discrete_options as $dzcr_opt ) { 71 delete_option( $dzcr_opt ); 72 72 } 73 73 74 74 // 3) Optional: purge post meta set by the plugin (disabled by default). 75 75 // Enable by defining DZCR_PURGE_POSTMETA true in wp-config.php OR using the filter below. 76 $ purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA )76 $dzcr_purge_postmeta = ( defined( 'DZCR_PURGE_POSTMETA' ) && DZCR_PURGE_POSTMETA ) 77 77 || apply_filters( 'dzcr_uninstall_purge_postmeta', false ); 78 78 79 if ( $ purge_postmeta ) {79 if ( $dzcr_purge_postmeta ) { 80 80 // Delete meta keys written per post: 81 81 // - _dzcr_last_seen_{agent}
Note: See TracChangeset
for help on using the changeset viewer.