summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/jetpack/modules/sitemaps/sitemaps.php')
-rw-r--r--plugins/jetpack/modules/sitemaps/sitemaps.php578
1 files changed, 578 insertions, 0 deletions
diff --git a/plugins/jetpack/modules/sitemaps/sitemaps.php b/plugins/jetpack/modules/sitemaps/sitemaps.php
new file mode 100644
index 00000000..7078fb8a
--- /dev/null
+++ b/plugins/jetpack/modules/sitemaps/sitemaps.php
@@ -0,0 +1,578 @@
+<?php
+/**
+ * Generate sitemap files in base XML as well as some namespace extensions.
+ *
+ * This module generates two different base sitemaps.
+ *
+ * 1. sitemap.xml
+ * The basic sitemap is updated regularly by wp-cron. It is stored in the
+ * database and retrieved when requested. This sitemap aims to include canonical
+ * URLs for all published content and abide by the sitemap spec. This is the root
+ * of a tree of sitemap and sitemap index xml files, depending on the number of URLs.
+ *
+ * By default the sitemap contains published posts of type 'post' and 'page', as
+ * well as the home url. To include other post types use the 'jetpack_sitemap_post_types'
+ * filter.
+ *
+ * @link http://sitemaps.org/protocol.php Base sitemaps protocol.
+ * @link https://support.google.com/webmasters/answer/178636 Image sitemap extension.
+ * @link https://developers.google.com/webmasters/videosearch/sitemaps Video sitemap extension.
+ *
+ * 2. news-sitemap.xml
+ * The news sitemap is generated on the fly when requested. It does not aim for
+ * completeness, instead including at most 1000 of the most recent published posts
+ * from the previous 2 days, per the news-sitemap spec.
+ *
+ * @link http://www.google.com/support/webmasters/bin/answer.py?answer=74288 News sitemap extension.
+ *
+ * @package Jetpack
+ * @since 3.9.0
+ * @since 4.8.0 Remove 1000 post limit.
+ * @author Automattic
+ */
+
+/* Include all of the sitemap subclasses. */
+require_once dirname( __FILE__ ) . '/sitemap-constants.php';
+require_once dirname( __FILE__ ) . '/sitemap-buffer.php';
+require_once dirname( __FILE__ ) . '/sitemap-stylist.php';
+require_once dirname( __FILE__ ) . '/sitemap-librarian.php';
+require_once dirname( __FILE__ ) . '/sitemap-finder.php';
+require_once dirname( __FILE__ ) . '/sitemap-builder.php';
+
+if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
+ require_once dirname( __FILE__ ) . '/sitemap-logger.php';
+}
+
+/**
+ * Governs the generation, storage, and serving of sitemaps.
+ *
+ * @since 4.8.0
+ */
+class Jetpack_Sitemap_Manager {
+
+ /**
+ * Librarian object for storing and retrieving sitemap data.
+ *
+ * @see Jetpack_Sitemap_Librarian
+ * @since 4.8.0
+ * @var Jetpack_Sitemap_Librarian $librarian Librarian object for storing and retrieving sitemap data.
+ */
+ private $librarian;
+
+ /**
+ * Logger object for reporting debug messages.
+ *
+ * @see Jetpack_Sitemap_Logger
+ * @since 4.8.0
+ * @var Jetpack_Sitemap_Logger $logger Logger object for reporting debug messages.
+ */
+ private $logger;
+
+ /**
+ * Finder object for handling sitemap URIs.
+ *
+ * @see Jetpack_Sitemap_Finder
+ * @since 4.8.0
+ * @var Jetpack_Sitemap_Finder $finder Finder object for handling with sitemap URIs.
+ */
+ private $finder;
+
+ /**
+ * Construct a new Jetpack_Sitemap_Manager.
+ *
+ * @access public
+ * @since 4.8.0
+ */
+ public function __construct() {
+ $this->librarian = new Jetpack_Sitemap_Librarian();
+ $this->finder = new Jetpack_Sitemap_Finder();
+
+ if ( defined( 'WP_DEBUG' ) && ( true === WP_DEBUG ) ) {
+ $this->logger = new Jetpack_Sitemap_Logger();
+ }
+
+ // Add callback for sitemap URL handler.
+ add_action(
+ 'init',
+ array( $this, 'callback_action_catch_sitemap_urls' ),
+ defined( 'IS_WPCOM' ) && IS_WPCOM ? 100 : 10
+ );
+
+ // Add generator to wp_cron task list.
+ $this->schedule_sitemap_generation();
+
+ // Add sitemap to robots.txt.
+ add_action(
+ 'do_robotstxt',
+ array( $this, 'callback_action_do_robotstxt' ),
+ 20
+ );
+
+ // The news sitemap is cached; here we add a callback to
+ // flush the cached news sitemap when a post is published.
+ add_action(
+ 'publish_post',
+ array( $this, 'callback_action_flush_news_sitemap_cache' ),
+ 10
+ );
+
+ // In case we need to purge all sitemaps, we do this.
+ add_action(
+ 'jetpack_sitemaps_purge_data',
+ array( $this, 'callback_action_purge_data' )
+ );
+
+ /*
+ * Module parameters are stored as options in the database.
+ * This allows us to avoid having to process all of init
+ * before serving the sitemap data. The following actions
+ * process and store these filters.
+ */
+
+ // Process filters and store location string for sitemap.
+ add_action(
+ 'init',
+ array( $this, 'callback_action_filter_sitemap_location' ),
+ 999
+ );
+ }
+
+ /**
+ * Echo a raw string of given content-type.
+ *
+ * @access private
+ * @since 4.8.0
+ *
+ * @param string $the_content_type The content type to be served.
+ * @param string $the_content The string to be echoed.
+ */
+ private function serve_raw_and_die( $the_content_type, $the_content ) {
+ header( 'Content-Type: ' . $the_content_type . '; charset=UTF-8' );
+
+ global $wp_query;
+ $wp_query->is_feed = true;
+ set_query_var( 'feed', 'sitemap' );
+
+ if ( '' === $the_content ) {
+ $error = __( 'No sitemap found. Please try again later.', 'jetpack' );
+ if ( current_user_can( 'manage_options' ) ) {
+ $next = human_time_diff( wp_next_scheduled( 'jp_sitemap_cron_hook' ) );
+ /* translators: %s is a human_time_diff until next sitemap generation. */
+ $error = sprintf( __( 'No sitemap found. The system will try to build it again in %s.', 'jetpack' ), $next );
+ }
+
+ wp_die(
+ esc_html( $error ),
+ esc_html__( 'Sitemaps', 'jetpack' ),
+ array(
+ 'response' => 404,
+ )
+ );
+ }
+
+ echo $the_content;
+
+ die();
+ }
+
+ /**
+ * Callback to intercept sitemap url requests and serve sitemap files.
+ *
+ * @access public
+ * @since 4.8.0
+ */
+ public function callback_action_catch_sitemap_urls() {
+ // Regular expressions for sitemap URL routing.
+ $regex = array(
+ 'master' => '/^sitemap\.xml$/',
+ 'sitemap' => '/^sitemap-[1-9][0-9]*\.xml$/',
+ 'index' => '/^sitemap-index-[1-9][0-9]*\.xml$/',
+ 'sitemap-style' => '/^sitemap\.xsl$/',
+ 'index-style' => '/^sitemap-index\.xsl$/',
+ 'image' => '/^image-sitemap-[1-9][0-9]*\.xml$/',
+ 'image-index' => '/^image-sitemap-index-[1-9][0-9]*\.xml$/',
+ 'image-style' => '/^image-sitemap\.xsl$/',
+ 'video' => '/^video-sitemap-[1-9][0-9]*\.xml$/',
+ 'video-index' => '/^video-sitemap-index-[1-9][0-9]*\.xml$/',
+ 'video-style' => '/^video-sitemap\.xsl$/',
+ 'news' => '/^news-sitemap\.xml$/',
+ 'news-style' => '/^news-sitemap\.xsl$/',
+ );
+
+ // The raw path(+query) of the requested URI.
+ if ( isset( $_SERVER['REQUEST_URI'] ) ) { // WPCS: Input var okay.
+ $raw_uri = sanitize_text_field(
+ wp_unslash( $_SERVER['REQUEST_URI'] ) // WPCS: Input var okay.
+ );
+ } else {
+ $raw_uri = '';
+ }
+
+ $request = $this->finder->recognize_sitemap_uri( $raw_uri );
+
+ if ( isset( $request['sitemap_name'] ) ) {
+
+ /**
+ * Filter the content type used to serve the sitemap XML files.
+ *
+ * @module sitemaps
+ *
+ * @since 3.9.0
+ *
+ * @param string $xml_content_type By default, it's 'text/xml'.
+ */
+ $xml_content_type = apply_filters( 'jetpack_sitemap_content_type', 'text/xml' );
+
+ // Catch master sitemap xml.
+ if ( preg_match( $regex['master'], $request['sitemap_name'] ) ) {
+ $sitemap_content = $this->librarian->get_sitemap_text(
+ jp_sitemap_filename( JP_MASTER_SITEMAP_TYPE, 0 ),
+ JP_MASTER_SITEMAP_TYPE
+ );
+
+ // if there is no master sitemap yet, let's just return an empty sitemap with a short TTL instead of a 404
+ if ( empty( $sitemap_content ) ) {
+ $builder = new Jetpack_Sitemap_Builder();
+ $sitemap_content = $builder->empty_sitemap_xml();
+ }
+
+ $this->serve_raw_and_die(
+ $xml_content_type,
+ $sitemap_content
+ );
+ }
+
+ // Catch sitemap xml.
+ if ( preg_match( $regex['sitemap'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ $xml_content_type,
+ $this->librarian->get_sitemap_text(
+ $request['sitemap_name'],
+ JP_PAGE_SITEMAP_TYPE
+ )
+ );
+ }
+
+ // Catch sitemap index xml.
+ if ( preg_match( $regex['index'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ $xml_content_type,
+ $this->librarian->get_sitemap_text(
+ $request['sitemap_name'],
+ JP_PAGE_SITEMAP_INDEX_TYPE
+ )
+ );
+ }
+
+ // Catch sitemap xsl.
+ if ( preg_match( $regex['sitemap-style'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ 'application/xml',
+ Jetpack_Sitemap_Stylist::sitemap_xsl()
+ );
+ }
+
+ // Catch sitemap index xsl.
+ if ( preg_match( $regex['index-style'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ 'application/xml',
+ Jetpack_Sitemap_Stylist::sitemap_index_xsl()
+ );
+ }
+
+ // Catch image sitemap xml.
+ if ( preg_match( $regex['image'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ $xml_content_type,
+ $this->librarian->get_sitemap_text(
+ $request['sitemap_name'],
+ JP_IMAGE_SITEMAP_TYPE
+ )
+ );
+ }
+
+ // Catch image sitemap index xml.
+ if ( preg_match( $regex['image-index'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ $xml_content_type,
+ $this->librarian->get_sitemap_text(
+ $request['sitemap_name'],
+ JP_IMAGE_SITEMAP_INDEX_TYPE
+ )
+ );
+ }
+
+ // Catch image sitemap xsl.
+ if ( preg_match( $regex['image-style'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ 'application/xml',
+ Jetpack_Sitemap_Stylist::image_sitemap_xsl()
+ );
+ }
+
+ // Catch video sitemap xml.
+ if ( preg_match( $regex['video'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ $xml_content_type,
+ $this->librarian->get_sitemap_text(
+ $request['sitemap_name'],
+ JP_VIDEO_SITEMAP_TYPE
+ )
+ );
+ }
+
+ // Catch video sitemap index xml.
+ if ( preg_match( $regex['video-index'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ $xml_content_type,
+ $this->librarian->get_sitemap_text(
+ $request['sitemap_name'],
+ JP_VIDEO_SITEMAP_INDEX_TYPE
+ )
+ );
+ }
+
+ // Catch video sitemap xsl.
+ if ( preg_match( $regex['video-style'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ 'application/xml',
+ Jetpack_Sitemap_Stylist::video_sitemap_xsl()
+ );
+ }
+
+ // Catch news sitemap xml.
+ if ( preg_match( $regex['news'], $request['sitemap_name'] ) ) {
+ $sitemap_builder = new Jetpack_Sitemap_Builder();
+ $this->serve_raw_and_die(
+ $xml_content_type,
+ $sitemap_builder->news_sitemap_xml()
+ );
+ }
+
+ // Catch news sitemap xsl.
+ if ( preg_match( $regex['news-style'], $request['sitemap_name'] ) ) {
+ $this->serve_raw_and_die(
+ 'application/xml',
+ Jetpack_Sitemap_Stylist::news_sitemap_xsl()
+ );
+ }
+ }
+ }
+
+ /**
+ * Callback for adding sitemap-interval to the list of schedules.
+ *
+ * @access public
+ * @since 4.8.0
+ *
+ * @param array $schedules The array of WP_Cron schedules.
+ *
+ * @return array The updated array of WP_Cron schedules.
+ */
+ public function callback_add_sitemap_schedule( $schedules ) {
+ $schedules['sitemap-interval'] = array(
+ 'interval' => JP_SITEMAP_INTERVAL,
+ 'display' => __( 'Sitemap Interval', 'jetpack' ),
+ );
+ return $schedules;
+ }
+
+ /**
+ * Callback handler for sitemap cron hook
+ *
+ * @access public
+ */
+ public function callback_sitemap_cron_hook() {
+ $sitemap_builder = new Jetpack_Sitemap_Builder();
+ $sitemap_builder->update_sitemap();
+ }
+
+ /**
+ * Add actions to schedule sitemap generation.
+ * Should only be called once, in the constructor.
+ *
+ * @access private
+ * @since 4.8.0
+ */
+ private function schedule_sitemap_generation() {
+ // Add cron schedule.
+ add_filter( 'cron_schedules', array( $this, 'callback_add_sitemap_schedule' ) ); // phpcs:ignore WordPress.WP.CronInterval.ChangeDetected
+
+ add_action(
+ 'jp_sitemap_cron_hook',
+ array( $this, 'callback_sitemap_cron_hook' )
+ );
+
+ if ( ! wp_next_scheduled( 'jp_sitemap_cron_hook' ) ) {
+ /**
+ * Filter the delay in seconds until sitemap generation cron job is started.
+ *
+ * This filter allows a site operator or hosting provider to potentialy spread out sitemap generation for a
+ * lot of sites over time. By default, it will be randomly done over 15 minutes.
+ *
+ * @module sitemaps
+ * @since 6.6.1
+ *
+ * @param int $delay Time to delay in seconds.
+ */
+ $delay = apply_filters( 'jetpack_sitemap_generation_delay', MINUTE_IN_SECONDS * wp_rand( 1, 15 ) ); // Randomly space it out to start within next fifteen minutes.
+ wp_schedule_event(
+ time() + $delay,
+ 'sitemap-interval',
+ 'jp_sitemap_cron_hook'
+ );
+ }
+ }
+
+ /**
+ * Callback to add sitemap to robots.txt.
+ *
+ * @access public
+ * @since 4.8.0
+ */
+ public function callback_action_do_robotstxt() {
+
+ /**
+ * Filter whether to make the default sitemap discoverable to robots or not. Default true.
+ *
+ * @module sitemaps
+ * @since 3.9.0
+ *
+ * @param bool $discover_sitemap Make default sitemap discoverable to robots.
+ */
+ $discover_sitemap = apply_filters( 'jetpack_sitemap_generate', true );
+
+ if ( true === $discover_sitemap ) {
+ $sitemap_url = $this->finder->construct_sitemap_url( 'sitemap.xml' );
+ echo 'Sitemap: ' . esc_url( $sitemap_url ) . "\n";
+ }
+
+ /**
+ * Filter whether to make the news sitemap discoverable to robots or not. Default true.
+ *
+ * @module sitemaps
+ * @since 3.9.0
+ *
+ * @param bool $discover_news_sitemap Make default news sitemap discoverable to robots.
+ */
+ $discover_news_sitemap = apply_filters( 'jetpack_news_sitemap_generate', true );
+
+ if ( true === $discover_news_sitemap ) {
+ $news_sitemap_url = $this->finder->construct_sitemap_url( 'news-sitemap.xml' );
+ echo 'Sitemap: ' . esc_url( $news_sitemap_url ) . "\n";
+ }
+ }
+
+ /**
+ * Callback to delete the news sitemap cache.
+ *
+ * @access public
+ * @since 4.8.0
+ */
+ public function callback_action_flush_news_sitemap_cache() {
+ delete_transient( 'jetpack_news_sitemap_xml' );
+ }
+
+ /**
+ * Callback for resetting stored sitemap data.
+ *
+ * @access public
+ * @since 5.3.0
+ * @since 6.7.0 Schedules a regeneration.
+ */
+ public function callback_action_purge_data() {
+ $this->callback_action_flush_news_sitemap_cache();
+ $this->librarian->delete_all_stored_sitemap_data();
+ /** This filter is documented in modules/sitemaps/sitemaps.php */
+ $delay = apply_filters( 'jetpack_sitemap_generation_delay', MINUTE_IN_SECONDS * wp_rand( 1, 15 ) ); // Randomly space it out to start within next fifteen minutes.
+ wp_schedule_single_event( time() + $delay, 'jp_sitemap_cron_hook' );
+ }
+
+ /**
+ * Callback to set the sitemap location.
+ *
+ * @access public
+ * @since 4.8.0
+ */
+ public function callback_action_filter_sitemap_location() {
+ update_option(
+ 'jetpack_sitemap_location',
+ /**
+ * Additional path for sitemap URIs. Default value is empty.
+ *
+ * This string is any additional path fragment you want included between
+ * the home URL and the sitemap filenames. Exactly how this fragment is
+ * interpreted depends on your permalink settings. For example:
+ *
+ * Pretty permalinks:
+ * home_url() . jetpack_sitemap_location . '/sitemap.xml'
+ *
+ * Plain ("ugly") permalinks:
+ * home_url() . jetpack_sitemap_location . '/?jetpack-sitemap=sitemap.xml'
+ *
+ * PATHINFO permalinks:
+ * home_url() . '/index.php' . jetpack_sitemap_location . '/sitemap.xml'
+ *
+ * where 'sitemap.xml' is the name of a specific sitemap file.
+ * The value of this filter must be a valid path fragment per RFC 3986;
+ * in particular it must either be empty or begin with a '/'.
+ * Also take care that any restrictions on sitemap location imposed by
+ * the sitemap protocol are satisfied.
+ *
+ * The result of this filter is stored in an option, 'jetpack_sitemap_location';
+ * that option is what gets read when the sitemap location is needed.
+ * This way we don't have to wait for init to finish before building sitemaps.
+ *
+ * @link https://tools.ietf.org/html/rfc3986#section-3.3 RFC 3986
+ * @link http://www.sitemaps.org/ The sitemap protocol
+ *
+ * @since 4.8.0
+ */
+ apply_filters(
+ 'jetpack_sitemap_location',
+ ''
+ )
+ );
+ }
+
+} // End Jetpack_Sitemap_Manager class.
+
+new Jetpack_Sitemap_Manager();
+
+/**
+ * Absolute URL of the current blog's sitemap.
+ *
+ * @module sitemaps
+ *
+ * @since 3.9.0
+ * @since 4.8.1 Code uses method found in Jetpack_Sitemap_Finder::construct_sitemap_url in 4.8.0.
+ * It has been moved here to avoid fatal errors with other plugins that were expecting to find this function.
+ *
+ * @param string $filename Sitemap file name. Defaults to 'sitemap.xml', the initial sitemaps page.
+ *
+ * @return string Sitemap URL.
+ */
+function jetpack_sitemap_uri( $filename = 'sitemap.xml' ) {
+ global $wp_rewrite;
+
+ $location = Jetpack_Options::get_option_and_ensure_autoload( 'jetpack_sitemap_location', '' );
+
+ if ( $wp_rewrite->using_index_permalinks() ) {
+ $sitemap_url = home_url( '/index.php' . $location . '/' . $filename );
+ } elseif ( $wp_rewrite->using_permalinks() ) {
+ $sitemap_url = home_url( $location . '/' . $filename );
+ } else {
+ $sitemap_url = home_url( $location . '/?jetpack-sitemap=' . $filename );
+ }
+
+ /**
+ * Filter sitemap URL relative to home URL.
+ *
+ * @module sitemaps
+ *
+ * @since 3.9.0
+ *
+ * @param string $sitemap_url Sitemap URL.
+ */
+ return apply_filters( 'jetpack_sitemap_location', $sitemap_url );
+}