Commit 61717f2

Initial code commit

1 parent 32daf70 commit 61717f2

File tree: 7 files changed, +448 -0 lines changed

.gitignore

Lines changed: 16 additions & 0 deletions

```
# Linux
*~
*.swp

# Windows
Thumbs.db
desktop.ini

# Mac OS X
.DS_Store
._*

# Composer
/composer.lock
/composer.phar
/vendor
```

CHANGELOG.md

Lines changed: 7 additions & 0 deletions

Pico Robots Changelog
=====================

### Version 1.0.0
Released: -

Initial release

PicoRobots.php

Lines changed: 255 additions & 0 deletions

```php
<?php

/**
 * Pico robots plugin - add a robots.txt and sitemap.xml to your website
 *
 * PicoRobots is a simple plugin that adds a `robots.txt` and `sitemap.xml`
 * to your website. Both the robots exclusion protocol (`robots.txt`) and the
 * Sitemaps protocol (`sitemap.xml`) are used to communicate with web crawlers
 * and other web robots. `robots.txt` informs web robots about which areas
 * of your website should not be processed or scanned. `sitemap.xml` allows
 * web robots to crawl your website more intelligently. `sitemap.xml` is a URL
 * inclusion protocol and complements `robots.txt`, a URL exclusion protocol.
 *
 * @author  Daniel Rudolf
 * @link    http://picocms.org
 * @license http://opensource.org/licenses/MIT The MIT License
 * @version 1.0.0
 */
class PicoRobots extends AbstractPicoPlugin
{
    /**
     * This plugin uses Pico's API version 2 as of Pico 2.0
     *
     * @var int
     */
    const API_VERSION = 2;

    /**
     * List of robots exclusion rules
     *
     * @see PicoRobots::getRobots()
     * @var array[]|null
     */
    protected $robots;

    /**
     * List of sitemap records
     *
     * @see PicoRobots::getSitemap()
     * @var array[]|null
     */
    protected $sitemap;

    /**
     * Disables this plugin if neither robots.txt nor sitemap.xml is requested
     *
     * @see DummyPlugin::onRequestUrl()
     */
    public function onRequestUrl(&$requestUrl)
    {
        if (!in_array($requestUrl, array('robots.txt', 'sitemap.xml'), true)) {
            $this->setEnabled(false);
        }
    }

    /**
     * Sets a page's last modification time and its default sitemap status
     *
     * @see DummyPlugin::onSinglePageLoaded()
     */
    public function onSinglePageLoaded(array &$pageData)
    {
        if (($this->getRequestUrl() === 'sitemap.xml') && $pageData['id']) {
            $fileName = $this->getConfig('content_dir') . $pageData['id'] . $this->getConfig('content_ext');
            if (file_exists($fileName) && !isset($pageData['modificationTime'])) {
                $pageData['modificationTime'] = filemtime($fileName);
            }

            if (!$pageData['meta']['sitemap'] && ($pageData['meta']['sitemap'] !== false)) {
                $pageData['meta']['sitemap'] = true;

                if (preg_match('/(?:^|\/)_/', $pageData['id'])) {
                    $pageData['meta']['sitemap'] = false;
                } else {
                    $robots = explode(',', $pageData['meta']['robots']);
                    $robots = array_map('strtolower', $robots);
                    if (in_array('noindex', $robots)) {
                        $pageData['meta']['sitemap'] = false;
                    }
                }
            }
        }
    }

    /**
     * Tells Pico to serve the robots.txt or sitemap.xml, respectively
     *
     * You can overwrite the plugin's default templates for `robots.txt` and
     * `sitemap.xml` by simply adding a `robots.twig` or `sitemap.twig` to
     * your theme.
     *
     * @see DummyPlugin::onPageRendering()
     */
    public function onPageRendering(&$twigTemplate, array &$twigVariables)
    {
        if ($this->getRequestUrl() === 'robots.txt') {
            header($_SERVER['SERVER_PROTOCOL'] . ' 200 OK');
            header('Content-Type: text/plain; charset=utf-8');
            $twigTemplate = 'robots.twig';

            $twigVariables['robots'] = $this->getRobots();
        }

        if ($this->getRequestUrl() === 'sitemap.xml') {
            header($_SERVER['SERVER_PROTOCOL'] . ' 200 OK');
            header('Content-Type: application/xml; charset=utf-8');
            $twigTemplate = 'sitemap.twig';

            $twigVariables['sitemap'] = $this->getSitemap();
        }
    }

    /**
     * Returns the structured contents of robots.txt
     *
     * This method triggers the `onRobots` event when the contents of
     * `robots.txt` weren't assembled yet.
     *
     * @return array[] list of robots exclusion rules
     */
    public function getRobots()
    {
        if ($this->robots === null) {
            $this->robots = array();

            $robotsConfig = $this->getPluginConfig('robots', array());
            foreach ($robotsConfig as $rule) {
                $userAgents = !empty($rule['user_agents']) ? (array) $rule['user_agents'] : array();
                $disallow = !empty($rule['disallow']) ? (array) $rule['disallow'] : array();
                $allow = !empty($rule['allow']) ? (array) $rule['allow'] : array();

                $this->robots[] = array(
                    'user_agents' => $userAgents ?: array('*'),
                    'disallow' => $disallow ?: (!$allow ? array('*') : array()),
                    'allow' => $allow
                );
            }

            $this->triggerEvent('onRobots', array(&$this->robots));
        }

        return $this->robots;
    }

    /**
     * Returns the structured contents of sitemap.xml
     *
     * This method triggers the `onSitemap` event when the contents of
     * `sitemap.xml` weren't assembled yet.
     *
     * @return array[] list of sitemap records
     */
    public function getSitemap()
    {
        if ($this->sitemap === null) {
            $changeFrequencies = array('always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never');
            $this->sitemap = array();

            $pages = $this->getPages();
            foreach ($pages as $pageData) {
                if (!empty($pageData['meta']['sitemap'])) {
                    $modificationTime = null;
                    if (isset($pageData['meta']['sitemap']['lastmod'])) {
                        $modificationTime = $pageData['meta']['sitemap']['lastmod'] ?: null;

                        if ($modificationTime && !is_int($modificationTime)) {
                            $modificationTime = strtotime($modificationTime) ?: null;
                        }
                    } elseif (!empty($pageData['modificationTime'])) {
                        $modificationTime = $pageData['modificationTime'];
                    }

                    $changeFrequency = null;
                    if (!empty($pageData['meta']['sitemap']['changefreq'])) {
                        $changeFrequency = $pageData['meta']['sitemap']['changefreq'];
                    }

                    $priority = null;
                    if (isset($pageData['meta']['sitemap']['priority'])) {
                        $priority = (float) $pageData['meta']['sitemap']['priority'];
                    }

                    $this->sitemap[] = array(
                        'url' => $pageData['url'],
                        'modificationTime' => $modificationTime,
                        'changeFrequency' => in_array($changeFrequency, $changeFrequencies) ? $changeFrequency : null,
                        'priority' => ($priority !== null) ? min(max(round($priority, 1), 0), 1) : null
                    );
                }
            }

            $sitemapConfig = $this->getPluginConfig('sitemap', array());
            foreach ($sitemapConfig as $record) {
                if (!empty($record['url'])) {
                    $modificationTime = !empty($record['lastmod']) ? $record['lastmod'] : null;
                    $changeFrequency = !empty($record['changefreq']) ? $record['changefreq'] : null;
                    $priority = isset($record['priority']) ? (float) $record['priority'] : null;

                    if ($modificationTime && !is_int($modificationTime)) {
                        $modificationTime = strtotime($modificationTime) ?: null;
                    }

                    $this->sitemap[] = array(
                        'url' => $this->substituteUrl($record['url']),
                        'modificationTime' => $modificationTime,
                        'changeFrequency' => in_array($changeFrequency, $changeFrequencies) ? $changeFrequency : null,
                        'priority' => ($priority !== null) ? min(max(round($priority, 1), 0), 1) : null
                    );
                }
            }

            $this->triggerEvent('onSitemap', array(&$this->sitemap));
        }

        return $this->sitemap;
    }

    /**
     * Registers the Sitemap meta header
     *
     * @see DummyPlugin::onMetaHeaders()
     */
    public function onMetaHeaders(array &$headers)
    {
        $headers['Sitemap'] = 'sitemap';
    }

    /**
     * Adds the plugin's theme dir to Twig's template loader
     *
     * @see DummyPlugin::onTwigRegistered()
     */
    public function onTwigRegistered(Twig_Environment &$twig)
    {
        $twig->getLoader()->addPath(__DIR__ . '/theme');
    }

    /**
     * Substitutes the placeholders %base_url% and %theme_url% in URLs
     *
     * @param string $url URL with (or without) placeholders
     *
     * @return string substituted URL
     */
    protected function substituteUrl($url)
    {
        // '%base_url%?' must be listed before '%base_url%', otherwise the
        // shorter placeholder would match first and leave a stray '?' behind
        $variables = array(
            '%base_url%?' => $this->getBaseUrl() . (!$this->isUrlRewritingEnabled() ? '?' : ''),
            '%base_url%' => rtrim($this->getBaseUrl(), '/'),
            '%theme_url%' => $this->getBaseThemeUrl() . $this->getConfig('theme')
        );

        return str_replace(array_keys($variables), $variables, $url);
    }
}
```

README.md

Lines changed: 109 additions & 0 deletions

Pico Robots
===========

This is the repository of Pico's official robots plugin.

Pico is a stupidly simple, blazing fast, flat file CMS. See http://picocms.org/ for more info.

`PicoRobots` is a simple plugin that adds a `robots.txt` and `sitemap.xml` to your website. Both the [robots exclusion protocol][RobotsProtocol] (`robots.txt`) and the [Sitemaps protocol][SitemapsProtocol] (`sitemap.xml`) are used to communicate with web crawlers and other web robots. `robots.txt` informs web robots about which areas of your website should not be processed or scanned. `sitemap.xml` allows web robots to crawl your website more intelligently. `sitemap.xml` is a URL inclusion protocol and complements `robots.txt`, a URL exclusion protocol.

Install
-------

If you're using a `composer`-based installation of Pico (e.g. [`picocms/pico-composer`][PicoComposer]), simply open a shell on your server, navigate to Pico's install directory (e.g. `/var/www/html`) and run `composer require phrozenbyte/pico-robots` (via [Packagist.org][]). That's it!

If you're rather using one of [Pico's pre-built release packages][PicoRelease], you must first create an empty `plugins/PicoRobots` directory in Pico's install directory (e.g. `/var/www/html`) on your server. Then download [`PicoRobots`' latest source package][PicoRobotsRelease] and upload all contained files (esp. `PicoRobots.php`) into said `plugins/PicoRobots` directory (resulting in `plugins/PicoRobots/PicoRobots.php`). That's it!

`PicoRobots` requires Pico 2.0+.

Config
------

After installing `PicoRobots`, you can navigate to your `robots.txt` (http://example.com/pico/robots.txt) and `sitemap.xml` (http://example.com/pico/sitemap.xml) using your favourite web browser. As you can see, `robots.txt` just holds a reference to your website's `sitemap.xml` by default. The default `sitemap.xml` lists all your website's pages, each with the last modification time of its Markdown file as reported by your server's operating system. Pretty convenient, right?
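With an empty plugin configuration, that default `robots.txt` boils down to little more than a single line (assuming the example URL above):

```
Sitemap: http://example.com/pico/sitemap.xml
```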
As always, you can adjust `PicoRobots`' behavior to fit your needs. First we'll start with the `robots.txt`:

#### More about `robots.txt`

The `robots.txt` consists of an arbitrary number of URL exclusion rules separated by paragraphs (i.e. an empty line). A rule consists of one or more `User-agent` rows to state which web robots are concerned, and one or more `Disallow` rows to state which URLs shouldn't be processed or scanned by said web robots. Optionally you can add one or more `Allow` rows to specify exceptions to the `Disallow` rows. Please refer to Wikipedia's article about the [robots exclusion protocol][RobotsProtocol] for more information.

You can add URL exclusion rules to your `robots.txt` by using Pico's `config/config.yml` (or any other config file in the `config` dir, e.g. `config/PicoRobots.yml`). Rather than extensively describing the config syntax, an example says more than a thousand words. So, by adding the following configuration to your `config/config.yml`
```yml
PicoRobots:
    robots:
        - user_agents: "BadBot"
          disallow: "/"
        - user_agents: "*"
          disallow: [ "/private/", "/admin/" ]
          allow: "/private/about_me"
```

you get the following two rules in your `robots.txt`:

```
User-agent: BadBot
Disallow: /

User-agent: *
Disallow: /private/
Disallow: /admin/
Allow: /private/about_me
```

The first rule (lines 1 and 2) tells `BadBot` (`User-agent: BadBot`) not to crawl your website at all (`Disallow: /`). The second rule (lines 4 to 7) tells any other web robot (`User-agent: *`) not to crawl your website's `private` (`Disallow: /private/`) and `admin` (`Disallow: /admin/`) folders. As an exception to this, crawling the `private/about_me` page (`Allow: /private/about_me`) is fine. The second rule also implicitly tells any web robot (except `BadBot`) that crawling your website is basically fine.
The `robots.txt` doesn't affect Pico's `Robots` meta header in the YAML Frontmatter of your Markdown files in any way (as long as your theme uses that information to add a `<meta name="robots" content="..." />` tag to your website). To disallow crawling of a single page, you should rather use the `Robots` meta header than the `robots.txt`.
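For example, the following YAML Frontmatter (the `Title` is illustrative) keeps web robots from indexing just that one page:

```yml
---
Title: My hidden page
Robots: noindex
---
```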
57+
58+
`PicoRobots` uses the `theme/robots.twig` template to create the contents of `robots.txt`. If you want to add some custom logic to your `robots.txt`, simply add a `robots.twig` to your theme and use `PicoRobot`'s `theme/robots.twig` as a starting point. Pico will automatically use your theme's `robots.twig` rather than `PicoRobot`'s default one.
59+
60+
The plugin furthermore exposes a simple API to allow other plugins to access and add URL exclusion rules to your `robots.txt`. As a plugin developer you may use the `PicoRobots::getRobots()` method to get a list of all URL exclusion rules. `PicoRobots` furthermore triggers the custom `onRobots(array &$robots)` event (`$robots = [ [ 'user_agents' => [ … ], 'disallow' => [ … ], 'allow' => [ … ] ], … ]`), allowing you to add custom rules to the website's `robots.txt`.
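A minimal sketch of a third-party plugin hooking this event; the class name and the `/drafts/` path are purely illustrative, but the rule structure is the one documented above:

```php
<?php

class MyRobotsRules extends AbstractPicoPlugin
{
    const API_VERSION = 2;

    /**
     * Adds a custom URL exclusion rule to robots.txt
     *
     * Triggered by PicoRobots when the contents of robots.txt are assembled.
     */
    public function onRobots(array &$robots)
    {
        // keep all web robots out of a (hypothetical) /drafts/ folder
        $robots[] = array(
            'user_agents' => array('*'),
            'disallow' => array('/drafts/'),
            'allow' => array()
        );
    }
}
```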
Please note that URL exclusion rules in your `robots.txt` won't affect your website's `sitemap.xml` created by `PicoRobots` in any way.

#### More about `sitemap.xml`

`sitemap.xml` is an [XML][]-based protocol to help web robots crawl your website more intelligently. It consists of one or more `<url>` records, telling web robots what URLs (the `<loc>` tag) should be crawled. You can optionally tell web robots when a page was last modified (`<lastmod>` tag) and how frequently the page may change (`<changefreq>` tag, possible values are `always`, `hourly`, `daily`, `weekly`, `monthly`, `yearly` and `never`). By adding the optional `<priority>` tag, you can suggest to web robots which pages are considered more important than others (the valid range is from `0.0` to `1.0`, with `1.0` being the most important; the default value is `0.5`). Please refer to Wikipedia's article about the [Sitemaps protocol][SitemapsProtocol] for more information.
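For illustration, a record using all of these optional tags might look like this (URL and values are invented):

```xml
<url>
    <loc>http://example.com/pico/sub/page</loc>
    <lastmod>2017-12-01</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
</url>
```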
`PicoRobots`' default `sitemap.xml` lists all your website's pages, each with the last modification time of its Markdown file as reported by your server's operating system. If you don't want a particular page to be in the `sitemap.xml`, simply add the `Sitemap: false` meta header to the page's YAML Frontmatter. `PicoRobots` will automatically exclude inaccessible pages (pages whose file name starts with a `_`) and pages with the `Robots: noindex` meta header in the YAML Frontmatter.
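For example (the `Title` is illustrative):

```yml
---
Title: Imprint
Sitemap: false
---
```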
The `Sitemap` meta header can also be used to specify a page's change frequency and priority, as well as to overwrite the page's last modification time. If you want to tell web robots that a page usually changes once per week, was last changed on 1 December 2017, and has an increased priority of `0.7`, add the following to the page's YAML Frontmatter:
71+
72+
```yml
73+
Sitemap:
74+
lastmod: 2017-12-01
75+
changefreq: weekly
76+
priority: 0.7
77+
```
78+
79+
You can furthermore use Pico's `config/config.yml` (or any other config file in the `config` dir, e.g. `config/PicoRobots.yml`) to add more records to your `sitemap.xml`. If you want to add a record for the dynamically created page `team/max-mustermann` (the page is dynamically created by a plugin and there is no `content/team/max-mustermann.md`, thus it isn't in the `sitemap.xml` by default) to your `sitemap.xml`, add the following to your `config/config.yml`
80+
81+
```yml
82+
PicoRobots:
83+
sitemap:
84+
- url: %base_url%?team/max-mustermann
85+
changefreq: yearly
86+
```
87+
88+
to get the following record in your `sitemap.xml`
89+
90+
```xml
91+
<url>
92+
<loc>http://example.com/pico/team/max-mustermann</loc>
93+
<changefreq>yearly</changefreq>
94+
</url>
95+
```
96+
97+
As you can see, `PicoRobots` interprets the `%base_url%` placeholder the same way as in Markdown files and replaces it by the website's base URL. As with the `Sitemap` meta header in a page's YAML Frontmatter, you may add `lastmod`, `changefreq` and `priority` keys to the config to specify a URL's last modification time, change frequency and priority.
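A fuller record using all of these keys might look like this (the values are invented):

```yml
PicoRobots:
    sitemap:
        - url: "%base_url%?team/max-mustermann"
          lastmod: 2017-12-01
          changefreq: yearly
          priority: 0.3
```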
98+
99+
`PicoRobots` uses the `theme/sitemap.twig` template to create the contents of `sitemap.xml`. If you want to add some custom logic to your `sitemap.xml`, simply add a `sitemap.twig` to your theme and use `PicoRobot`'s `theme/sitemap.twig` as a starting point. Pico will automatically use your theme's `sitemap.twig` rather than `PicoRobot`'s default one.
100+
101+
The plugin furthermore exposes a simple API to allow other plugins to access and add sitemap records to your `sitemap.xml`. As a plugin developer you may use the `PicoRobots::getSitemap()` method to get a list of all sitemap records. `PicoRobots` furthermore triggers the custom `onSitemap(array &$sitemap)` event (`$sitemap = [ [ 'url' => [ … ], 'modificationTime' => [ … ], 'changeFrequency' => [ … ], 'priority' => [ … ] ], … ]`), allowing you to add custom records to the website's `sitemap.xml`.
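Again a minimal sketch; the class name and the dynamically created `search` page are purely illustrative, but the record structure is the one documented above:

```php
<?php

class MySitemapRecords extends AbstractPicoPlugin
{
    const API_VERSION = 2;

    /**
     * Adds a custom record to sitemap.xml
     *
     * Triggered by PicoRobots when the contents of sitemap.xml are assembled.
     */
    public function onSitemap(array &$sitemap)
    {
        // announce a (hypothetical) dynamically created page that has no
        // Markdown file; a null value omits the corresponding optional tag
        $sitemap[] = array(
            'url' => $this->getPageUrl('search'),
            'modificationTime' => time(),
            'changeFrequency' => 'daily',
            'priority' => null
        );
    }
}
```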
[RobotsProtocol]: https://en.wikipedia.org/wiki/Robots_exclusion_standard
[SitemapsProtocol]: https://en.wikipedia.org/wiki/Sitemaps
[PicoComposer]: https://github.com/picocms/pico-composer
[Packagist.org]: https://packagist.org/packages/phrozenbyte/pico-robots
[PicoRelease]: https://github.com/picocms/Pico/releases/latest
[PicoRobotsRelease]: https://github.com/PhrozenByte/pico-robots/releases/latest
[XML]: https://en.wikipedia.org/wiki/XML
