Vielleicht hilft dir folgendes Script weiter.
Es sollte wöchentlich (täglich bei vielen Änderungen) per Cronjob aufgerufen werden.
Die drei Variablen am Beginn müssten entsprechend angepasst werden.
Falls es Fragen gibt, bitte einfach melden.

Ich habe das Script vor über einem Jahr zusammengebastelt und bin nach wie vor zufrieden damit.
- Code: Alles auswählen
<?php
// Runtime behaviour: suppress all PHP diagnostics and keep the script running
// even if the client that triggered the request disconnects.
error_reporting(0);
ignore_user_abort(true);

// Relative path of the sitemap file that gets (over)written on every run.
$sitemap_file = '../sitemap.xml';

// URI under which that sitemap file can be requested; sent to the ping services.
$sitemap_uri = 'http://example.com/sitemap.xml';

// Optional: URLs containing any of these strings are left out of the sitemap.
$exclude = array(
    'http://example.com/wp-login',
);
/**
 * Crawls the host this script is requested on, starting at its root page, and
 * renders every URL that answered with HTTP 200 as a sitemaps.org 0.9
 * <urlset> document.
 *
 * Usage: optionally call filter() with exclusion strings, then getXml().
 */
class Sitemap {
    /** XML prolog plus opening <urlset> tag that every generated document starts with. */
    private $head = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">\n\n";
    /** Document produced by the most recent build(). */
    private $xml = "";
    /** URLs that answered with HTTP 200 (sorted by build()). */
    private $res = array();
    /** Scheme and host without trailing slash; prefix for root-relative links. */
    private $root = "";
    /** Prefix for document-relative links: absolute <base href> of the root page, else $root. */
    private $base = "";
    /** Set of URLs that were already queued, keyed by URL. */
    private $urls = array();
    /** cURL handle shared by all requests of one crawl. */
    private $curl;
    /** Exclusion strings; a URL containing any of them is never fetched. */
    private $filter = array();

    /**
     * Registers exclusion strings.
     *
     * @param array|string $filter An array replaces the current list; any
     *                             other value is appended to it.
     */
    public function filter($filter) {
        if (is_array($filter)) {
            $this->filter = $filter;
        } else {
            $this->filter[] = $filter;
        }
    }

    /**
     * Crawls the site and returns the sitemap document.
     *
     * @return string Complete XML document; contains no <url> entries when
     *                nothing could be fetched.
     */
    public function getXml() {
        $this->crawl();
        $this->build();
        return $this->xml;
    }

    /**
     * Determines the site root from the current request, fetches the root
     * page and starts the recursive crawl from there.
     */
    private function crawl() {
        // Start from a clean slate so getXml() may be called more than once.
        $this->res = array();
        $this->urls = array();

        // Without a host (e.g. plain CLI invocation) there is nothing to crawl.
        // NOTE(review): HTTP_HOST comes from the request's Host header, so the
        // crawl target is whatever the caller sent - confirm this is acceptable.
        if (empty($_SERVER["HTTP_HOST"])) {
            return;
        }

        // An empty HTTPS value means "not HTTPS", just like an explicit "off".
        $secure = !empty($_SERVER["HTTPS"]) && strtoupper($_SERVER["HTTPS"]) !== "OFF";
        $this->root = ($secure ? "https" : "http") . "://" . $_SERVER["HTTP_HOST"];
        $this->base = $this->root;
        $url = $this->root . "/";
        $this->urls[$url] = true;

        $this->curl = curl_init();
        curl_setopt_array($this->curl, array(
            // Headers are not needed in the body: the status code is read via curl_getinfo().
            CURLOPT_HEADER => false,
            CURLOPT_FOLLOWLOCATION => false,
            CURLOPT_COOKIESESSION => true,
            CURLOPT_AUTOREFERER => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_URL => $url
        ));

        $html = $this->fetch($url);
        if ($html !== null) {
            // Honour an absolute <base href> for document-relative links.
            if (preg_match('@<base[^>]+href\s*=\s*(?:"([^"]+)"|\'([^\']+)\')@i', $html, $m)) {
                $href = (isset($m[2]) && $m[2] !== "") ? $m[2] : $m[1];
                $href = html_entity_decode($href, ENT_QUOTES, "UTF-8");
                if (preg_match('@^https?://@i', $href)) {
                    $this->base = rtrim($href, "/");
                }
            }
            // Hand over the body that is already in memory instead of fetching the root twice.
            $this->recursiveSniff($url, 0, $html);
        }

        // Release the handle; dropping the reference works on every PHP version.
        $this->curl = null;
    }

    /**
     * Fetches one URL with the shared cURL handle.
     *
     * @param string $url Absolute URL to request.
     * @return string|null Response body for an HTTP 200 answer, null for any
     *                     transport error or other status code.
     */
    private function fetch($url) {
        curl_setopt($this->curl, CURLOPT_URL, $url);
        $body = curl_exec($this->curl);
        if (!is_string($body)) {
            return null;
        }
        // Compare the numeric status instead of the reason phrase, which
        // HTTP/2 responses do not carry.
        if ((int) curl_getinfo($this->curl, CURLINFO_HTTP_CODE) !== 200) {
            return null;
        }
        return $body;
    }

    /**
     * Returns the href values of all anchor tags in an HTML document.
     *
     * @param string $html Markup to scan.
     * @return array List of href values in document order, HTML entities decoded.
     */
    private function extractLinks($html) {
        $links = array();
        $pattern = '@<a\s[^>]*?href\s*=\s*(?:"([^"]+)"|\'([^\']+)\')@i';
        if (preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $m) {
                // Group 1 holds double-quoted values, group 2 single-quoted ones.
                $href = (isset($m[2]) && $m[2] !== "") ? $m[2] : $m[1];
                // Attribute values are HTML-encoded (&amp; etc.); the URL itself is not.
                $links[] = html_entity_decode($href, ENT_QUOTES, "UTF-8");
            }
        }
        return $links;
    }

    /**
     * Turns an href into an absolute URL without fragment.
     *
     * @param string $href Raw (entity-decoded) href value.
     * @return string|null Absolute URL, or null when the href is empty,
     *                     fragment-only or uses a non-HTTP scheme.
     */
    private function resolve($href) {
        $href = preg_replace("/#.*$/s", "", trim($href));
        if ($href === "") {
            return null;
        }
        if (preg_match('@^https?://@i', $href)) {
            return $href;
        }
        // Protocol-relative link: reuse the scheme of the crawled site.
        if (substr($href, 0, 2) === "//") {
            return parse_url($this->root, PHP_URL_SCHEME) . ":" . $href;
        }
        // mailto:, javascript:, tel: ... are not crawlable pages.
        if (preg_match('@^[a-z][a-z0-9+.\-]*:@i', $href)) {
            return null;
        }
        if ($href[0] === "/") {
            return $this->root . $href;
        }
        return $this->base . "/" . $href;
    }

    /**
     * Records $url if it answers with HTTP 200 and follows its links.
     *
     * @param string      $url   Absolute URL to process.
     * @param int         $level Depth of the caller; recursion stops below depth 9.
     * @param string|null $html  Already fetched body of $url, or null to fetch it here.
     */
    private function recursiveSniff($url, $level = 0, $html = null) {
        if (++$level > 9) {
            return;
        }
        foreach ($this->filter as $filter) {
            $filter = (string) $filter;
            // An empty needle would match every URL, so it is ignored.
            if ($filter !== "" && strpos($url, $filter) !== false) {
                return;
            }
        }
        if ($html === null) {
            $html = $this->fetch($url);
            if ($html === null) {
                return;
            }
        }
        $this->res[] = $url;

        foreach ($this->extractLinks($html) as $href) {
            $link = $this->resolve($href);
            if ($link === null) {
                continue;
            }
            // NOTE(review): as before, only links located underneath the page
            // currently being processed are followed (prefix test against
            // $url). Pages reachable solely through sideways links from
            // sub-pages are therefore not discovered - confirm this is intended.
            if (strpos($link, $url) !== 0 || isset($this->urls[$link])) {
                continue;
            }
            $this->urls[$link] = true;
            $this->recursiveSniff($link, $level);
        }
    }

    /**
     * Renders the collected URLs, sorted, into $this->xml.
     */
    private function build() {
        sort($this->res);
        // <loc> values must be entity-escaped, otherwise any URL with a query
        // string ("&") makes the document invalid XML.
        $raw = array("&", "<", ">", "\"", "'");
        $escaped = array("&amp;", "&lt;", "&gt;", "&quot;", "&apos;");
        $xml = $this->head;
        foreach ($this->res as $res) {
            $xml .= "<url><loc>" . str_replace($raw, $escaped, $res) . "</loc></url>\n";
        }
        $this->xml = $xml . "</urlset>\n";
    }
}
// Crawl the site and render the sitemap, honouring the optional exclusion list.
$sitemap = new Sitemap;
if (!empty($exclude)) {
    $sitemap->filter($exclude);
}
$xml = $sitemap->getXml();

// A missing previous file counts as "changed"; is_file() avoids calling
// md5_file() on a path that does not exist yet.
$previous = is_file($sitemap_file) ? md5_file($sitemap_file) : false;
$changed = $previous !== md5($xml);

// The file is rewritten on every run; remember whether that actually worked.
$written = file_put_contents($sitemap_file, $xml) !== false;

// Notify the ping services only when new content really was published.
if ($changed && $written) {
    // The sitemap URI travels inside a query string and must be URL-encoded.
    $target = urlencode($sitemap_uri);
    // NOTE(review): these ping endpoints are old and may have been retired by
    // their operators - verify each one and remove those that no longer exist.
    $endpoints = array(
        "http://www.google.com/webmasters/tools/ping?sitemap=",
        "http://api.moreover.com/ping?u=",
        "http://submissions.ask.com/ping?sitemap=",
    );
    foreach ($endpoints as $endpoint) {
        // Best effort: a failing ping must not abort the remaining ones.
        file_get_contents($endpoint . $target);
    }
}
?>
$sitemap_file ist der relative Pfad zur sitemap.xml. Die Datei muss für das Script beschreibbar sein (Schreibrechte setzen).
$sitemap_uri ist die URI, unter der die sitemap.xml aufgerufen werden kann.
$exclude (optional) legt Zeichenketten fest: Jede URI, die eine dieser Zeichenketten enthält, wird nicht aufgerufen und erscheint NICHT in der Sitemap.
Grüße,
Gerald