File: /var/www/elite/sitemap-checker.php
<?php
// ---------------------------------------------------------------------------
// Sitemap checker (CLI entry point).
//
// Requests every URL listed in the site's sitemap.xml, in order, and stops at
// the first one that does not answer with HTTP 200.
//
// Options are passed as key=value arguments:
//   start=N   1-based position in the sitemap to resume checking from.
// ---------------------------------------------------------------------------
unset($argv[0]); // drop the script name
$options = array();
if (isset($argv) && is_array($argv)) {
    foreach ($argv as $arg) {
        // Limit of 2 keeps any further '=' inside the value; an argument
        // without '=' is stored with a null value instead of reading a
        // missing array index.
        $argData = explode('=', $arg, 2);
        $options[$argData[0]] = isset($argData[1]) ? $argData[1] : null;
    }
}
// Cast to int so the position comparison below is always numeric.
$start = isset($options['start']) ? max(1, (int) $options['start']) : 1;
$siteUrl = 'https://3000077.ru';
$clearSiteUrl = '3000077.ru';
echo 'Читаем файл sitemap: ' . $siteUrl . '/sitemap.xml' . PHP_EOL;
$xmlFile = simplexml_load_file($siteUrl . '/sitemap.xml');
if ($xmlFile === false) {
    // simplexml_load_file() returns false when the file cannot be fetched or parsed.
    echo 'Failed to load sitemap: ' . $siteUrl . '/sitemap.xml' . PHP_EOL;
    exit(1);
}
$total = count($xmlFile); // number of entries; computed once instead of per iteration
$num = 0;
foreach ($xmlFile as $row) {
    $num++;
    if ($num < $start) {
        continue; // not yet at the requested start position
    }
    $url = (string) $row->loc;
    echo 'Checking ' . $num . '/' . $total . ': ' . $url . PHP_EOL;
    $pageHeaders = getHeaderUrl(trim($url));
    // Redirects are not followed by getHeaderUrl() by default, so anything
    // other than a direct 200 (including 3xx) stops the run.
    if ($pageHeaders[0]['http_code'] !== 200) {
        echo ' - error!' . $pageHeaders[0]['http_code'] . PHP_EOL;
        if ($pageHeaders[1] !== '') {
            // Transport-level failure (DNS, connect, TLS...): show cURL's message.
            echo '   ' . $pageHeaders[1] . PHP_EOL;
        }
        exit(1); // non-zero status so shell callers can detect the failure
    }
    echo ' - success' . PHP_EOL;
    echo PHP_EOL;
}
/**
 * Format a byte count as a short human-readable string, e.g. 1536 -> "1.5 kb".
 *
 * The value is divided by 1024 until it drops below 1024 or the largest known
 * unit ("pb") is reached, then rounded to two decimals. Zero and values below
 * one byte are reported in "b"; negative values keep their sign.
 *
 * @param int|float $size Size in bytes.
 * @return string Rounded value, a space, and the unit suffix.
 */
function convert($size)
{
    $unit = array('b', 'kb', 'mb', 'gb', 'tb', 'pb');
    $lastUnit = count($unit) - 1;

    $value = abs((float) $size);
    $i = 0;
    // Repeated division avoids log(0) (division by zero) and the float
    // rounding of log() on exact powers of 1024, and keeps the index in range.
    while ($value >= 1024 && $i < $lastUnit) {
        $value /= 1024;
        $i++;
    }
    if ($size < 0) {
        $value = -$value;
    }

    return round($value, 2) . ' ' . $unit[$i];
}
/**
 * Fetch a URL with a GET request and return the response together with the
 * transfer details.
 *
 * Response headers are included at the start of the returned content
 * (CURLOPT_HEADER is on). Host names are resolved over IPv4 only.
 *
 * @param string $url            URL to request.
 * @param bool   $followLocation Whether HTTP redirects are followed.
 * @return array Three elements, in order: the curl_getinfo() array, the
 *               response (headers + body) or false on failure, and the cURL
 *               error message (empty string when there was no error).
 */
function getUrl($url, $followLocation = true) {
    $ch = curl_init($url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);   // return the response instead of printing it
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, $followLocation);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_HTTPGET, 1);
    curl_setopt($ch, CURLOPT_DNS_USE_GLOBAL_CACHE, false);
    curl_setopt($ch, CURLOPT_DNS_CACHE_TIMEOUT, 2);
    curl_setopt($ch, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4);
    // Execute exactly once: every curl_exec() call sends the request again,
    // so a second call would double the load on the server.
    $content = curl_exec($ch);
    $info = curl_getinfo($ch);
    $error = curl_error($ch);
    curl_close($ch);
    return [$info, $content, $error];
}
/**
 * Request a URL with GET and report only the transfer metadata.
 *
 * The response is fetched but not kept: RETURNTRANSFER stops it from being
 * printed and the return value of curl_exec() is discarded. Host names are
 * resolved over IPv4 only.
 *
 * @param string $url            URL to request.
 * @param bool   $followLocation Whether HTTP redirects are followed.
 * @return array Two elements, in order: the curl_getinfo() array (including
 *               'http_code') and the cURL error message.
 */
function getHeaderUrl($url, $followLocation = false) {
    $handle = curl_init($url);

    // Applied one by one, in this order, exactly like separate curl_setopt() calls.
    $settings = [
        CURLOPT_RETURNTRANSFER       => true,
        CURLOPT_FOLLOWLOCATION       => $followLocation,
        CURLOPT_HEADER               => false,
        CURLOPT_HTTPGET              => 1,
        CURLOPT_DNS_USE_GLOBAL_CACHE => false,
        CURLOPT_DNS_CACHE_TIMEOUT    => 2,
        CURLOPT_IPRESOLVE            => CURL_IPRESOLVE_V4,
    ];
    foreach ($settings as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    curl_exec($handle);
    $transferInfo = curl_getinfo($handle);
    $transferError = curl_error($handle);
    curl_close($handle);

    return [$transferInfo, $transferError];
}
/**
 * Download a page and collect the internal links found in its <a href> tags.
 *
 * Prints a short status line (HTTP code and number of hrefs found, or the
 * failure reason). Links are read from quoted href attributes only.
 *
 * Reads the globals $siteUrl (site root with scheme, no trailing slash) and
 * $clearSiteUrl (bare host name).
 *
 * @param string $thislink Absolute URL of the page to scan.
 * @return array|int|null List of absolute internal URLs; 404 when the page
 *                        answers with HTTP 404; null when the request failed
 *                        (a "retry" note is printed, no retry happens here).
 */
function get_pageInfo($thislink){
    global $siteUrl, $clearSiteUrl;
    list($curlInfo, $content, $error) = getUrl($thislink);
    if (!empty($error)) {
        echo $error . ' - retry' . PHP_EOL. PHP_EOL;
        return;
    }
    if ($curlInfo['http_code'] == 404) {
        echo ' – 404 found' . PHP_EOL;
        return 404;
    }
    // One pass for double-quoted hrefs, one for single-quoted hrefs.
    preg_match_all('/<a\b[^>]*href="([^"]*)"[^>]*>([\s\S]*?)<\/a>/i' , $content , $links1);
    preg_match_all('/<a\b[^>]*href=\'([^\']*)\'[^>]*>([\s\S]*?)<\/a>/i' , $content , $links2);
    // Gather the href values of both passes into one list
    $hrefs = array_merge($links1[1], $links2[1]);
    echo $curlInfo['http_code'] . ' ' . count($hrefs) . PHP_EOL;
    $sublinks = [];
    // Keep internal links only, normalised to absolute URLs
    foreach ($hrefs as $link) {
        if (strpos($link,'javascript') !== false) continue;
        if (strpos($link,'mailto:') !== false) continue;
        if (strpos($link,'tel:') !== false) continue;
        if (strpos($link,'skype:') !== false) continue;
        $resolved = resolvePageLink($link, $thislink, $siteUrl, $clearSiteUrl);
        if ($resolved !== null) {
            $sublinks[] = $resolved;
        }
    }
    return $sublinks;
}

/**
 * Resolve one href found on a page into an absolute URL of this site.
 *
 * Helper of get_pageInfo(). Handles absolute, protocol-relative,
 * root-relative, query-only, fragment-only and path-relative links
 * (including "./" and "../" segments).
 *
 * @param string $link         Raw href value.
 * @param string $pageUrl      Absolute URL of the page the href was found on
 *                             (expected form: scheme://host/path).
 * @param string $siteUrl      Site root with scheme, without trailing slash.
 * @param string $clearSiteUrl Bare host name of the site.
 * @return string|null Absolute internal URL, or null when the link points to
 *                     another host or uses a scheme other than http/https.
 */
function resolvePageLink($link, $pageUrl, $siteUrl, $clearSiteUrl)
{
    $link = trim($link);

    // Scheme and host of the site root, used to recognise internal links.
    $siteScheme = 'https';
    $siteHost = '';
    if (preg_match('/^([a-z][a-z0-9+.\-]*):\/\/([^\/?#]*)/i', $siteUrl, $siteParts)) {
        $siteScheme = strtolower($siteParts[1]);
        $siteHost = strtolower($siteParts[2]);
    }

    // Absolute ("scheme://host/...") or protocol-relative ("//host/...") link.
    // Anchored at the start so a URL inside a query string does not count.
    if (preg_match('/^(?:([a-z][a-z0-9+.\-]*):)?\/\/([^\/?#]*)/i', $link, $match)) {
        $scheme = strtolower($match[1]);
        if ($scheme !== '' && $scheme !== 'http' && $scheme !== 'https') {
            return null;
        }
        $host = strtolower($match[2]);
        if ($host !== $siteHost && $host !== strtolower($clearSiteUrl)) {
            return null; // external site
        }
        return $scheme === '' ? $siteScheme . ':' . $link : $link;
    }

    // Any remaining "scheme:" link (sms:, viber:, data:, ...) is not a page.
    if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $link)) {
        return null;
    }

    if ($link === '') {
        return $pageUrl; // empty href refers to the page itself
    }

    $pageWithoutFragment = preg_replace('/#.*$/s', '', $pageUrl);
    $pageWithoutQuery = preg_replace('/\?.*$/s', '', $pageWithoutFragment);

    if ($link[0] === '#') {
        return $pageWithoutFragment . $link;
    }
    if ($link[0] === '?') {
        return $pageWithoutQuery . $link;
    }
    if ($link[0] === '/') {
        return $siteUrl . $link; // root-relative
    }

    // Path-relative link: resolve against the directory of the current page.
    // Segments of "scheme://host/dir/file" are ['scheme:', '', 'host', 'dir', 'file'].
    $segments = explode('/', $pageWithoutQuery);
    if (count($segments) > 3) {
        array_pop($segments); // drop the file name (or the empty tail after a trailing '/')
    }

    // Keep the link's own query string / fragment out of the path handling.
    $suffix = '';
    if (preg_match('/[?#].*$/s', $link, $tail)) {
        $suffix = $tail[0];
        $link = substr($link, 0, -strlen($suffix));
    }

    $parts = explode('/', $link);
    $lastIndex = count($parts) - 1;
    foreach ($parts as $index => $part) {
        if ($part === '.' || $part === '..') {
            // Never climb above the host (the first three segments).
            if ($part === '..' && count($segments) > 3) {
                array_pop($segments);
            }
            if ($index === $lastIndex) {
                $segments[] = ''; // a trailing "." / ".." names a directory
            }
            continue;
        }
        $segments[] = $part;
    }

    return implode('/', $segments) . $suffix;
}
// Final status line; reached only when the checking loop did not exit early.
echo 'Done', PHP_EOL;