addItem(new Typecho_Widget_Helper_Layout('div', array('class' => 'typecho-page-title')), '

网址管理配置

'); // 每页显示数量 $pageSize = new Typecho_Widget_Helper_Form_Element_Text('pageSize', null, '20', _t('每页显示数量'), _t('后台管理中每页显示的网址数量')); $pageSize->input->setAttribute('class', 'mini'); $form->addInput($pageSize); // 是否开启网址验证 $validateUrl = new Typecho_Widget_Helper_Form_Element_Radio('validateUrl', array( '1' => _t('开启'), '0' => _t('关闭') ), '1', _t('网址验证'), _t('新增网址时是否验证网址有效性')); $form->addInput($validateUrl); // ================== 全文抓取配置 ================== $form->addItem(new Typecho_Widget_Helper_Layout('div', array('class' => 'typecho-page-title')), '

全文抓取配置

'); // 是否开启全文抓取 $enableFullText = new Typecho_Widget_Helper_Form_Element_Radio('enableFullText', array( '1' => _t('开启'), '0' => _t('关闭') ), '0', _t('开启全文抓取'), _t('开启后会对白名单中的网站自动抓取全文')); $form->addInput($enableFullText); // 白名单配置(多行文本) $fullTextWhitelist = new Typecho_Widget_Helper_Form_Element_Textarea('fullTextWhitelist', null, "https://wiki.eryajf.net/learning-weekly.xml|.markdown-body\nhttps://example.com/rss|#content", _t('全文抓取白名单'), _t('每行一个,格式:RSS地址|内容选择器(CSS选择器)
示例:https://wiki.eryajf.net/learning-weekly.xml|.post-content')); $form->addInput($fullTextWhitelist); // 每个站点抓取全文的篇数 $fullTextPerSite = new Typecho_Widget_Helper_Form_Element_Text('fullTextPerSite', null, '3', _t('每站抓取全文篇数'), _t('每个RSS源最多抓取几篇的全文,建议1-5')); $fullTextPerSite->input->setAttribute('class', 'mini'); $form->addInput($fullTextPerSite); // 页面抓取超时时间(单篇文章) $pageFetchTimeout = new Typecho_Widget_Helper_Form_Element_Text('pageFetchTimeout', null, '8', _t('页面抓取超时时间(秒)'), _t('抓取单篇文章页面时的超时时间,建议8-15秒')); $pageFetchTimeout->input->setAttribute('class', 'mini'); $form->addInput($pageFetchTimeout); // ================== RSS配置 ================== $form->addItem(new Typecho_Widget_Helper_Layout('div', array('class' => 'typecho-page-title')), '

RSS配置

'); // RSS页面每页显示数量 $rssPageSize = new Typecho_Widget_Helper_Form_Element_Text('rssPageSize', null, '30', _t('RSS页面每页显示数量'), _t('RSS信息页面每页显示的文章数量')); $rssPageSize->input->setAttribute('class', 'mini'); $form->addInput($rssPageSize); // RSS刷新间隔 $rssRefresh = new Typecho_Widget_Helper_Form_Element_Text('rssRefresh', null, '3600', _t('RSS刷新间隔(秒)'), _t('建议的RSS刷新间隔时间,实际执行时间由宝塔计划任务决定')); $rssRefresh->input->setAttribute('class', 'mini'); $form->addInput($rssRefresh); // 【新增】每次自动刷新网址数量 $rssRefreshLimit = new Typecho_Widget_Helper_Form_Element_Text('rssRefreshLimit', null, '20', _t('每次自动刷新网址数量'), _t('每次定时任务最多刷新的RSS网址数量,建议10-50,根据服务器性能调整')); $rssRefreshLimit->input->setAttribute('class', 'mini'); $form->addInput($rssRefreshLimit); // 每个站点最大文章数 $maxFeedsPerSite = new Typecho_Widget_Helper_Form_Element_Text('maxFeedsPerSite', null, '5', _t('每个站点最大文章数'), _t('每个RSS源最多显示的文章数量')); $maxFeedsPerSite->input->setAttribute('class', 'mini'); $form->addInput($maxFeedsPerSite); // RSS文章保留时间(改为下拉框) $rssKeepTime = new Typecho_Widget_Helper_Form_Element_Select('rssKeepTime', array( '0' => _t('不自动清理(默认)'), // ← 将"默认"标识放在这里 '86400' => _t('一天之前(24小时前)'), '259200' => _t('三天之前(72小时前)'), '604800' => _t('一周之前(7天前)'), '1296000' => _t('半个月之前(15天前)'), '2592000' => _t('一个月之前(30天前)'), '7776000' => _t('三个月之前(90天前)'), '15552000' => _t('半年之前(180天前)') ), '259200', // ← 这里改为 0,默认不清理 _t('RSS文章保留时间'), _t('自动清理超过此时间的RSS文章,按照文章发布时间判断,默认不自动清理')); $form->addInput($rssKeepTime); // RSS最大缓存条数 $maxCachePerSite = new Typecho_Widget_Helper_Form_Element_Text('maxCachePerSite', null, '5', _t('每个站点最大缓存条数'), _t('每个RSS源最多缓存的文章数量,0表示不限制')); $maxCachePerSite->input->setAttribute('class', 'mini'); $form->addInput($maxCachePerSite); // 连接超时时间 $fetchTimeout = new Typecho_Widget_Helper_Form_Element_Text('fetchTimeout', null, '5', _t('RSS抓取超时时间(秒)'), _t('抓取RSS源时的超时时间')); $fetchTimeout->input->setAttribute('class', 'mini'); $form->addInput($fetchTimeout); // 失败重试次数 $retryTimes = new 
Typecho_Widget_Helper_Form_Element_Text('retryTimes', null, '2', _t('失败重试次数'), _t('RSS抓取失败时的重试次数')); $retryTimes->input->setAttribute('class', 'mini'); $form->addInput($retryTimes); // ================== 网站状态检查配置 ================== $form->addItem(new Typecho_Widget_Helper_Layout('div', array('class' => 'typecho-page-title')), '

网站状态检查配置

'); // 状态检查超时时间 $statusCheckTimeout = new Typecho_Widget_Helper_Form_Element_Text('statusCheckTimeout', null, '8', _t('状态检查超时时间(秒)'), _t('检查网站状态时的超时时间')); $statusCheckTimeout->input->setAttribute('class', 'mini'); $form->addInput($statusCheckTimeout); // 每次检查的最大数量 $statusCheckMax = new Typecho_Widget_Helper_Form_Element_Text('statusCheckMax', null, '80', _t('每次检查最大数量'), _t('每次自动检查时最多检查的网址数量')); $statusCheckMax->input->setAttribute('class', 'mini'); $form->addInput($statusCheckMax); // ================== 定时任务配置 ================== $form->addItem(new Typecho_Widget_Helper_Layout('div', array('class' => 'typecho-page-title')), '

定时任务配置

/**
 * Per-user configuration panel.
 *
 * Intentionally empty: nothing is added to the per-user form.
 *
 * @param Typecho_Widget_Helper_Form $form Form supplied by Typecho.
 */
public static function personalConfig(Typecho_Widget_Helper_Form $form)
{
}

/**
 * Resolve the SQLite database file and store its path in self::$dbPath.
 *
 * An existing "urlnav_*.db" file inside the plugin's db/ directory is reused;
 * otherwise a new file name with an unpredictable 10-hex-character suffix is
 * chosen. The file itself is not created here.
 *
 * @throws Exception When no source of randomness is available (random_bytes).
 */
private static function initDbPath()
{
    $dbDir = __DIR__ . '/db';

    // Re-check is_dir() after a failed mkdir(): a concurrent request may
    // have created the directory in the meantime.
    if (!is_dir($dbDir) && !@mkdir($dbDir, 0755, true) && !is_dir($dbDir)) {
        error_log('UrlNav: 无法创建数据库目录: ' . $dbDir);
    }

    $dbFiles = glob($dbDir . '/urlnav_*.db');
    if (!empty($dbFiles)) {
        self::$dbPath = $dbFiles[0];
        return;
    }

    // The suffix keeps the database path from being guessable, so it must
    // come from a CSPRNG rather than rand()/uniqid().
    self::$dbPath = $dbDir . '/urlnav_' . bin2hex(random_bytes(5)) . '.db';
}

/**
 * Generate a 16-character lowercase hexadecimal secret.
 *
 * Used as the default value of the cron access keys, so it is drawn from
 * the cryptographically secure generator instead of md5(uniqid(rand())).
 *
 * @return string 16 hex characters.
 * @throws Exception When no source of randomness is available.
 */
private static function generateSecret()
{
    return bin2hex(random_bytes(8));
}

/**
 * Count the active URLs of a category, and how many of them have an RSS URL.
 *
 * @param int|string $categoryId Category primary key.
 * @return array{url_count:int, rss_count:int}
 */
public static function getCategoryStats($categoryId)
{
    $db = self::getDbConnection();

    // One pass over the table yields both counters.
    $stmt = $db->prepare(
        "SELECT COUNT(*) AS url_count,
                SUM(CASE WHEN rss_url IS NOT NULL AND rss_url != '' THEN 1 ELSE 0 END) AS rss_count
         FROM urlnav_urls
         WHERE category_id = ? AND is_active = 1"
    );
    $stmt->execute([$categoryId]);
    $row = $stmt->fetch(PDO::FETCH_ASSOC);

    // SUM() yields NULL when no row matches; the cast turns that into 0.
    return [
        'url_count' => (int)($row['url_count'] ?? 0),
        'rss_count' => (int)($row['rss_count'] ?? 0),
    ];
}
/**
 * Create the plugin's SQLite schema.
 *
 * Each table is created only when it is missing; its indexes and seed rows
 * are executed right after the table is created. The two "updated_at"
 * triggers are created with IF NOT EXISTS on every call.
 *
 * Failures are logged through error_log() and not rethrown.
 */
private static function initDatabase()
{
    if (empty(self::$dbPath)) {
        self::initDbPath();
    }

    // table name => statements to run when the table does not exist yet
    // (CREATE TABLE first, then indexes / seed rows).
    $schema = array(
        'urlnav_categories' => array(
            "CREATE TABLE urlnav_categories (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                description TEXT,
                sort_order INTEGER DEFAULT 0,
                is_active INTEGER DEFAULT 1,
                created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
            )",
            // Default categories.
            "INSERT INTO urlnav_categories (name, description, sort_order) VALUES
                ('常用工具', '日常使用的在线工具', 1),
                ('设计资源', '设计相关的素材和资源', 2),
                ('开发资源', '程序开发相关资源', 3),
                ('技术社区', '技术交流和学习社区', 4)",
        ),
        // star_rating holds 0-3 stars.
        'urlnav_urls' => array(
            "CREATE TABLE urlnav_urls (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                url TEXT NOT NULL,
                description TEXT,
                rss_url TEXT,
                category_id INTEGER,
                star_rating INTEGER DEFAULT 0,
                sort_order INTEGER DEFAULT 0,
                is_active INTEGER DEFAULT 1,
                is_online INTEGER DEFAULT 1,
                last_status_check DATETIME,
                status_check_count INTEGER DEFAULT 0,
                last_status_code INTEGER,
                created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                last_refresh DATETIME,
                refresh_count INTEGER DEFAULT 0,
                success_count INTEGER DEFAULT 0,
                failure_count INTEGER DEFAULT 0,
                last_error TEXT,
                FOREIGN KEY (category_id) REFERENCES urlnav_categories(id) ON DELETE SET NULL
            )",
            "CREATE INDEX IF NOT EXISTS idx_category_id ON urlnav_urls(category_id)",
            "CREATE INDEX IF NOT EXISTS idx_is_active ON urlnav_urls(is_active)",
            "CREATE INDEX IF NOT EXISTS idx_rss_url ON urlnav_urls(rss_url)",
            "CREATE INDEX IF NOT EXISTS idx_last_refresh ON urlnav_urls(last_refresh)",
            "CREATE INDEX IF NOT EXISTS idx_is_online ON urlnav_urls(is_online)",
            "CREATE INDEX IF NOT EXISTS idx_last_status_check ON urlnav_urls(last_status_check)",
        ),
        // full_content stores the fetched full text of an article.
        'urlnav_rss_cache' => array(
            "CREATE TABLE urlnav_rss_cache (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url_id INTEGER NOT NULL,
                feed_title TEXT NOT NULL,
                feed_link TEXT NOT NULL,
                feed_description TEXT,
                full_content TEXT,
                pub_date DATETIME NOT NULL,
                guid TEXT NOT NULL,
                cached_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                is_fresh INTEGER DEFAULT 1,
                FOREIGN KEY (url_id) REFERENCES urlnav_urls(id) ON DELETE CASCADE,
                UNIQUE(url_id, guid)
            )",
            "CREATE INDEX IF NOT EXISTS idx_url_id ON urlnav_rss_cache(url_id)",
            "CREATE INDEX IF NOT EXISTS idx_pub_date ON urlnav_rss_cache(pub_date)",
            "CREATE INDEX IF NOT EXISTS idx_cached_at ON urlnav_rss_cache(cached_at)",
            "CREATE INDEX IF NOT EXISTS idx_is_fresh ON urlnav_rss_cache(is_fresh)",
            "CREATE UNIQUE INDEX IF NOT EXISTS idx_url_guid ON urlnav_rss_cache(url_id, guid)",
        ),
        'urlnav_favorites' => array(
            "CREATE TABLE urlnav_favorites (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                user_id INTEGER DEFAULT 0,
                feed_id INTEGER NOT NULL,
                feed_title TEXT NOT NULL,
                feed_link TEXT NOT NULL,
                feed_description TEXT,
                full_content TEXT,
                pub_date DATETIME NOT NULL,
                site_title TEXT,
                site_url TEXT,
                category_name TEXT,
                favorited_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(user_id, feed_id)
            )",
            "CREATE INDEX IF NOT EXISTS idx_favorite_user_id ON urlnav_favorites(user_id)",
            "CREATE INDEX IF NOT EXISTS idx_favorite_feed_id ON urlnav_favorites(feed_id)",
            "CREATE INDEX IF NOT EXISTS idx_favorite_created_at ON urlnav_favorites(favorited_at)",
        ),
        // cron_type distinguishes RSS refreshes from status checks.
        'urlnav_refresh_log' => array(
            "CREATE TABLE urlnav_refresh_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                refresh_type TEXT NOT NULL,
                success_count INTEGER DEFAULT 0,
                total_feeds INTEGER DEFAULT 0,
                url_count INTEGER DEFAULT 0,
                new_articles INTEGER DEFAULT 0,
                error_message TEXT,
                refresh_time DATETIME DEFAULT CURRENT_TIMESTAMP,
                duration INTEGER DEFAULT 0,
                cron_type TEXT DEFAULT 'rss'
            )",
            "CREATE INDEX IF NOT EXISTS idx_refresh_time ON urlnav_refresh_log(refresh_time)",
            "CREATE INDEX IF NOT EXISTS idx_refresh_type ON urlnav_refresh_log(refresh_type)",
            "CREATE INDEX IF NOT EXISTS idx_cron_type ON urlnav_refresh_log(cron_type)",
        ),
        'urlnav_cron_log' => array(
            "CREATE TABLE urlnav_cron_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                cron_type TEXT NOT NULL,
                executed_time DATETIME DEFAULT CURRENT_TIMESTAMP,
                result TEXT,
                error_message TEXT
            )",
            "CREATE INDEX IF NOT EXISTS idx_executed_time ON urlnav_cron_log(executed_time)",
            // Index names are unique per database in SQLite. Reusing
            // "idx_cron_type" (already taken by urlnav_refresh_log) made this
            // statement fail and aborted the rest of the initialisation.
            "CREATE INDEX IF NOT EXISTS idx_cron_log_type ON urlnav_cron_log(cron_type)",
        ),
        'urlnav_status_log' => array(
            "CREATE TABLE urlnav_status_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url_id INTEGER NOT NULL,
                is_online INTEGER DEFAULT 0,
                status_code INTEGER,
                response_time INTEGER,
                check_time DATETIME DEFAULT CURRENT_TIMESTAMP,
                error_message TEXT,
                FOREIGN KEY (url_id) REFERENCES urlnav_urls(id) ON DELETE CASCADE
            )",
            "CREATE INDEX IF NOT EXISTS idx_url_id_status ON urlnav_status_log(url_id)",
            "CREATE INDEX IF NOT EXISTS idx_check_time ON urlnav_status_log(check_time)",
            "CREATE INDEX IF NOT EXISTS idx_is_online_status ON urlnav_status_log(is_online)",
        ),
        'urlnav_status_stats' => array(
            "CREATE TABLE urlnav_status_stats (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                total_checks INTEGER DEFAULT 0,
                success_checks INTEGER DEFAULT 0,
                failed_checks INTEGER DEFAULT 0,
                avg_response_time REAL DEFAULT 0,
                last_check_time DATETIME,
                created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
            )",
            // Seed the single statistics row.
            "INSERT INTO urlnav_status_stats (total_checks, success_checks, failed_checks, avg_response_time) VALUES (0, 0, 0, 0)",
        ),
    );

    try {
        $db = new PDO('sqlite:' . self::$dbPath);
        $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

        $lookup = $db->prepare("SELECT name FROM sqlite_master WHERE type='table' AND name = ?");

        foreach ($schema as $table => $statements) {
            $lookup->execute(array($table));
            $exists = $lookup->fetch() !== false;
            // Release the read cursor before running DDL.
            $lookup->closeCursor();

            if ($exists) {
                continue;
            }
            foreach ($statements as $sql) {
                $db->exec($sql);
            }
        }

        // Keep updated_at current on every row update.
        $db->exec("CREATE TRIGGER IF NOT EXISTS update_category_time AFTER UPDATE ON urlnav_categories BEGIN UPDATE urlnav_categories SET updated_at = CURRENT_TIMESTAMP WHERE id = NEW.id; END");
        $db->exec("CREATE TRIGGER IF NOT EXISTS update_url_time AFTER UPDATE ON urlnav_urls BEGIN UPDATE urlnav_urls SET updated_at = CURRENT_TIMESTAMP WHERE id = NEW.id; END");

        $lookup = null;
        $db = null;
    } catch (PDOException $e) {
        error_log('UrlNav: 数据库初始化失败: ' . $e->getMessage());
    }
}
/**
 * Bring an existing database up to the current schema by adding any missing
 * columns (and the indexes that belong to them).
 *
 * Every table is inspected on its own, so a column that already exists in
 * one table can no longer abort the migration of the others. Previously
 * urlnav_favorites.full_content was added whenever the cache table lacked
 * the column, which raised "duplicate column name" and skipped all later
 * steps. Tables that do not exist are skipped.
 *
 * Failures are logged through error_log() and not rethrown.
 */
private static function migrateDatabase()
{
    // table => (column => column definition)
    $expected = array(
        'urlnav_urls' => array(
            'rss_url' => 'TEXT',
            'last_refresh' => 'DATETIME',
            'refresh_count' => 'INTEGER DEFAULT 0',
            'success_count' => 'INTEGER DEFAULT 0',
            'failure_count' => 'INTEGER DEFAULT 0',
            'last_error' => 'TEXT',
            'is_online' => 'INTEGER DEFAULT 1',
            'last_status_check' => 'DATETIME',
            'status_check_count' => 'INTEGER DEFAULT 0',
            'last_status_code' => 'INTEGER',
            'star_rating' => 'INTEGER DEFAULT 0',
        ),
        'urlnav_rss_cache' => array(
            'is_fresh' => 'INTEGER DEFAULT 1',
            'full_content' => 'TEXT',
        ),
        'urlnav_favorites' => array(
            'full_content' => 'TEXT',
        ),
        'urlnav_refresh_log' => array(
            'cron_type' => "TEXT DEFAULT 'rss'",
            'new_articles' => 'INTEGER DEFAULT 0',
            'message' => 'TEXT',
            'details' => 'TEXT',
        ),
    );

    // "table.column" => index to create right after that column is added.
    $indexes = array(
        'urlnav_rss_cache.is_fresh' => "CREATE INDEX IF NOT EXISTS idx_is_fresh ON urlnav_rss_cache(is_fresh)",
        'urlnav_refresh_log.cron_type' => "CREATE INDEX IF NOT EXISTS idx_cron_type ON urlnav_refresh_log(cron_type)",
    );

    try {
        $db = new PDO('sqlite:' . self::$dbPath);
        $db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);

        foreach ($expected as $table => $columns) {
            // Table and column names come from the constant map above, never
            // from user input, so interpolating them is safe.
            $info = $db->query("PRAGMA table_info({$table})")->fetchAll(PDO::FETCH_ASSOC);
            $present = array_column($info, 'name');

            // PRAGMA table_info returns no rows for a table that does not exist.
            if (empty($present)) {
                continue;
            }

            $missing = array_diff_key($columns, array_flip($present));
            foreach ($missing as $column => $definition) {
                $db->exec("ALTER TABLE {$table} ADD COLUMN {$column} {$definition}");
                error_log("UrlNav: 已添加{$column}字段到{$table}表");

                $indexKey = "{$table}.{$column}";
                if (isset($indexes[$indexKey])) {
                    $db->exec($indexes[$indexKey]);
                }
            }
        }

        $db = null;
    } catch (PDOException $e) {
        error_log('UrlNav数据库迁移失败: ' . $e->getMessage());
    }
}
/**
 * Open a PDO connection to the plugin's SQLite database.
 *
 * Resolves the database path and creates the schema if the file is missing,
 * then tries to connect up to three times, sleeping one second before each
 * retry. Only "database is locked" errors are retried; any other failure,
 * or a lock error on the final attempt, is rethrown as a generic Exception.
 *
 * Each connection enables foreign keys, a 3000 ms busy timeout and WAL
 * journaling.
 *
 * @return PDO
 * @throws Exception When the connection cannot be established.
 */
public static function getDbConnection()
{
    if (empty(self::$dbPath)) {
        self::initDbPath();
    }

    if (!file_exists(self::$dbPath)) {
        self::initDatabase();
    }

    $attemptLimit = 3;
    $pauseSeconds = 1;
    $attempt = 0;

    while ($attempt < $attemptLimit) {
        if ($attempt > 0) {
            error_log("UrlNav: 数据库连接重试 {$attempt},等待 {$pauseSeconds} 秒...");
            sleep($pauseSeconds);
        }

        try {
            $pdo = new PDO('sqlite:' . self::$dbPath);
            $pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
            $pdo->exec('PRAGMA foreign_keys = ON');
            $pdo->exec('PRAGMA busy_timeout = 3000');
            $pdo->exec('PRAGMA journal_mode = WAL');

            return $pdo;
        } catch (PDOException $failure) {
            $isLockError = strpos($failure->getMessage(), 'database is locked') !== false;
            $isFinalAttempt = $attempt >= $attemptLimit - 1;

            // Anything other than a retryable lock error is fatal.
            if (!$isLockError || $isFinalAttempt) {
                throw new Exception('数据库连接失败: ' . $failure->getMessage());
            }
        }

        $attempt++;
    }

    throw new Exception('数据库连接失败:重试' . $attemptLimit . '次后仍被锁定');
}
/**
 * Return the plugin's configuration object, loaded once per request.
 *
 * @return mixed Whatever Widget_Options::plugin('UrlNav') returns.
 */
public static function getConfig()
{
    static $config = null;

    if ($config === null) {
        $options = Typecho_Widget::widget('Widget_Options');
        $config = $options->plugin('UrlNav');
    }

    return $config;
}

/**
 * Return the shared UrlNav_RssManager instance, creating it on first use.
 *
 * @return UrlNav_RssManager
 */
private static function getRssManager()
{
    if (self::$rssManager === null) {
        self::$rssManager = new UrlNav_RssManager();
    }

    return self::$rssManager;
}

/**
 * Entry point of the RSS cron job.
 *
 * Sends JSON/no-cache headers and a first "starting" chunk right away so the
 * reverse proxy sees output, takes the db/rss_cron_running.lock lock file
 * (a lock younger than 30 minutes means another run is active and this call
 * is skipped), runs refreshAllRssFeeds(true), records the outcome through
 * logCron() and returns a summary.
 *
 * @return array Summary with at least 'success', 'message' and 'timestamp'.
 */
public static function executeRssCronTask()
{
    if (!headers_sent()) {
        header('Content-Type: application/json; charset=utf-8');
        header('Cache-Control: no-cache, no-store, must-revalidate');
        header('Pragma: no-cache');
        header('Expires: 0');

        echo json_encode(['status' => 'starting', 'timestamp' => time()]);
        // Flush PHP's own buffer first (only if one exists, otherwise
        // ob_flush() raises a notice), then the system buffer.
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    }

    $startTime = microtime(true);
    $lockFile = __DIR__ . '/db/rss_cron_running.lock';
    $lockTimeout = 1800; // seconds; older locks are considered stale

    // Shared with the shutdown hook so it never deletes a lock that this
    // request has already released (or never owned).
    $lock = new stdClass();
    $lock->held = false;

    $skipped = function () {
        error_log("UrlNav RSS定时任务: 跳过执行,已在运行中");
        return array(
            'success' => false,
            'message' => '定时任务已在运行中,跳过本次执行',
            'timestamp' => time()
        );
    };

    try {
        error_log("UrlNav RSS定时任务: 开始执行 " . date('Y-m-d H:i:s'));

        @set_time_limit(300);
        @ini_set('max_execution_time', 300);

        if (file_exists($lockFile)) {
            $lockTime = @filemtime($lockFile);
            if ($lockTime && (time() - $lockTime) < $lockTimeout) {
                return $skipped();
            }
            // Stale lock left behind by a crashed run.
            @unlink($lockFile);
        }

        // Mode 'x' fails if the file already exists, which makes acquiring
        // the lock atomic between concurrent requests.
        $handle = @fopen($lockFile, 'x');
        if ($handle !== false) {
            fwrite($handle, "Started at: " . date('Y-m-d H:i:s'));
            fclose($handle);
            $lock->held = true;

            // Safety net for fatal errors/timeouts that bypass finally.
            register_shutdown_function(function () use ($lockFile, $lock) {
                if ($lock->held && file_exists($lockFile)) {
                    @unlink($lockFile);
                    error_log("UrlNav: shutdown函数删除RSS锁文件");
                }
            });
        } elseif (file_exists($lockFile)) {
            // Another request created the lock between our check and fopen().
            return $skipped();
        } else {
            // Lock directory not writable: run unlocked, as before.
            error_log("UrlNav RSS定时任务: 无法创建锁文件,继续执行");
        }

        $refreshResult = self::refreshAllRssFeeds(true);
        $duration = round(microtime(true) - $startTime, 2);

        $result = array(
            'success' => $refreshResult['success'],
            'refreshed' => true,
            'refresh_result' => $refreshResult,
            'timestamp' => time(),
            'duration' => $duration,
            'message' => $refreshResult['message'],
            'successRssUrls' => $refreshResult['successRssUrls'] ?? array(),
            'failedRssUrls' => $refreshResult['failedRssUrls'] ?? array()
        );

        self::logCron('rss_auto_refresh', $result);

        error_log("UrlNav RSS定时任务: 执行完成,耗时 {$duration} 秒");

        return $result;
    } catch (Exception $e) {
        error_log("UrlNav RSS定时任务异常: " . $e->getMessage());

        return array(
            'success' => false,
            'error' => $e->getMessage(),
            'timestamp' => time(),
            'message' => 'RSS定时任务执行异常',
            'successRssUrls' => array(),
            'failedRssUrls' => array()
        );
    } finally {
        // Release only a lock this request actually created.
        if ($lock->held) {
            @unlink($lockFile);
            $lock->held = false;
        }
    }
}
/**
 * Entry point of the site-status cron job (runs without a lock file).
 *
 * Calls manualCheckStatus(null, false), feeds the outcome to
 * updateStatusStats(), writes a JSON-encoded record through logCron() and
 * returns a summary array. Exceptions are logged and reported in the
 * returned array instead of propagating.
 *
 * @return array Summary with 'success', 'timestamp' and 'message'; plus
 *               'status_checked', 'status_result' and 'duration' when the
 *               check ran, or 'error' when an exception occurred.
 */
public static function executeStatusCronTask()
{
    try {
        $began = microtime(true);
        error_log("UrlNav 状态检查定时任务: 开始执行 " . date('Y-m-d H:i:s'));

        // First argument null, second false — the same call the original made.
        $check = self::manualCheckStatus(null, false);
        $elapsed = round(microtime(true) - $began, 2);

        self::updateStatusStats($check);

        $logPayload = array_merge($check, array(
            'duration' => $elapsed,
            'timestamp' => time()
        ));
        self::logCron('status_auto_check', json_encode($logPayload));

        if (!$check['success']) {
            error_log("UrlNav 状态检查定时任务: 执行失败: " . $check['message']);

            return array(
                'success' => false,
                'status_checked' => false,
                'status_result' => $check,
                'timestamp' => time(),
                'duration' => $elapsed,
                'message' => '状态检查定时任务执行失败'
            );
        }

        error_log("UrlNav 状态检查定时任务: 执行成功,耗时 {$elapsed} 秒");

        return array(
            'success' => true,
            'status_checked' => $check['total'] > 0,
            'status_result' => $check,
            'timestamp' => time(),
            'duration' => $elapsed,
            'message' => '状态检查定时任务执行成功'
        );
    } catch (Exception $error) {
        error_log("UrlNav 状态检查定时任务异常: " . $error->getMessage());

        return array(
            'success' => false,
            'error' => $error->getMessage(),
            'timestamp' => time(),
            'message' => '状态检查定时任务执行异常'
        );
    }
}
/**
 * Run a callback while holding a lock file, so the same task does not run
 * twice concurrently.
 *
 * A lock younger than one hour means the task is considered running and the
 * call is skipped. Older locks are treated as stale and removed. The lock is
 * created atomically and is released when the callback returns or throws;
 * a shutdown hook releases it if the script dies before that.
 *
 * @param string   $lockFile Absolute path of the lock file.
 * @param string   $taskType Label used in log and result messages.
 * @param callable $callback Task to execute; its return value is returned.
 * @return mixed The callback's result, or an array describing why the task
 *               was skipped or failed.
 */
private static function executeLockedTask($lockFile, $taskType, $callback)
{
    $lockTimeout = 3600; // seconds

    if (file_exists($lockFile)) {
        $lockTime = @filemtime($lockFile);
        if ($lockTime && (time() - $lockTime) < $lockTimeout) {
            $lockDuration = time() - $lockTime;
            error_log("UrlNav {$taskType}: 跳过执行,锁文件存在 {$lockDuration} 秒");
            return array(
                'success' => false,
                'message' => "{$taskType}定时任务正在运行,跳过本次执行",
                'timestamp' => time(),
                'lock_time' => $lockTime,
                'lock_duration' => $lockDuration
            );
        }

        @unlink($lockFile);
        error_log("UrlNav {$taskType}: 删除超时的锁文件(已存在超过 {$lockTimeout} 秒)");
    }

    // Mode 'x' fails when the file already exists, so two requests racing
    // past the check above cannot both acquire the lock (touch() could).
    $handle = @fopen($lockFile, 'x');
    if ($handle === false) {
        error_log("UrlNav {$taskType}: 无法创建锁文件");
        return array(
            'success' => false,
            'message' => '无法创建锁文件',
            'timestamp' => time()
        );
    }
    fwrite($handle, "Started at: " . date('Y-m-d H:i:s') . "\nTask type: {$taskType}");
    fclose($handle);

    // Shared state: once released, the shutdown hook must not delete a lock
    // that a later process may have created at the same path.
    $lock = new stdClass();
    $lock->held = true;
    $lock->since = time();

    $release = function ($logText) use ($lockFile, $taskType, $lock) {
        if (!$lock->held) {
            return;
        }
        $lock->held = false;
        if (file_exists($lockFile)) {
            $lockDuration = time() - $lock->since;
            @unlink($lockFile);
            error_log("UrlNav {$taskType}: {$logText} {$lockDuration} 秒");
        }
    };

    error_log("UrlNav {$taskType}: 开始执行定时任务 " . date('Y-m-d H:i:s'));

    // Covers fatal errors and timeouts that never reach the code below.
    register_shutdown_function($release, 'shutdown函数删除锁文件,锁持续了');

    try {
        $result = $callback();
        $release('任务完成,删除锁文件,任务耗时');

        return $result;
    } catch (Exception $e) {
        $release('异常时删除锁文件,锁持续了');

        error_log("UrlNav {$taskType}定时任务异常: " . $e->getMessage());
        self::logCron('error', $e->getMessage());

        return array(
            'success' => false,
            'error' => $e->getMessage(),
            'timestamp' => time(),
            'message' => "{$taskType}定时任务执行异常"
        );
    }
}

/**
 * Manually remove a cron lock file (debugging aid).
 *
 * For 'rss' both db/rss_cron.lock and db/rss_cron_running.lock are removed:
 * the latter is the file executeRssCronTask() actually creates, which the
 * previous implementation never touched.
 *
 * @param string $cronType 'rss', 'status', or anything else for the generic lock.
 * @return array{success:bool, message:string, timestamp:int}
 */
public static function unlockCron($cronType = 'rss')
{
    $dbDir = __DIR__ . '/db';

    if ($cronType === 'rss') {
        $candidates = array($dbDir . '/rss_cron.lock', $dbDir . '/rss_cron_running.lock');
    } elseif ($cronType === 'status') {
        $candidates = array($dbDir . '/status_cron.lock');
    } else {
        $candidates = array($dbDir . '/cron.lock');
    }

    $present = array_filter($candidates, 'file_exists');

    if (empty($present)) {
        return array(
            'success' => true,
            'message' => "{$cronType}没有锁文件存在",
            'timestamp' => time()
        );
    }

    foreach ($present as $lockFile) {
        if (!@unlink($lockFile)) {
            error_log("UrlNav: 手动解锁{$cronType}失败");
            return array(
                'success' => false,
                'message' => '无法删除锁文件',
                'timestamp' => time()
            );
        }
    }

    error_log("UrlNav: 手动解锁{$cronType}成功");
    return array(
        'success' => true,
        'message' => "{$cronType}定时任务锁已解除",
        'timestamp' => time()
    );
}
/**
 * Refresh a batch of RSS sources and cache their articles.
 *
 * Picks up to `rssRefreshLimit` (clamped to 1-30) active sources that were
 * never refreshed, were last refreshed more than an hour ago, or fail more
 * often than they succeed and were last tried more than 30 minutes ago. If
 * none qualify, up to five of the least recently refreshed sources are taken
 * instead. Each source is processed by refreshSingleRssUrl(); the run is
 * recorded through logRefresh().
 *
 * In cron mode expired cache entries are cleaned first, the time limit is
 * raised to 300 s, processing stops after 240 s, and small JSON chunks are
 * written to the response to keep the connection alive.
 *
 * @param bool $isCron True when invoked by the scheduled task.
 * @return array Always contains 'success', 'message', 'successRssUrls' and
 *               'failedRssUrls'; counters and 'duration' unless a global
 *               exception occurred.
 */
public static function refreshAllRssFeeds($isCron = false)
{
    $startTime = microtime(true);

    // Writes a chunk to the client: PHP's buffer first (if any), then the
    // system buffer. ob_flush() without an active buffer raises a notice.
    $emit = function ($payload) {
        echo $payload;
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    };

    if ($isCron) {
        self::cleanExpiredCache();
    }

    if ($isCron && !headers_sent()) {
        header('Content-Type: application/json; charset=utf-8');
        header('Cache-Control: no-cache, no-store, must-revalidate');
        header('Pragma: no-cache');
        header('Expires: 0');

        $emit('{"status":"starting","message":"RSS刷新任务开始...","timestamp":' . time() . '}');
    }

    if ($isCron) {
        @set_time_limit(300);
        @ini_set('max_execution_time', 300);
    }

    try {
        $db = self::getDbConnection();

        $config = self::getConfig();
        $limit = intval($config->rssRefreshLimit ?? 10);
        $limit = max(1, min($limit, 30));

        error_log("===== UrlNav RSS刷新开始,时间: " . date('Y-m-d H:i:s') . " =====");
        error_log("配置数量: {$limit}");

        // Candidates: never refreshed, stale (> 1 hour), or mostly failing
        // and not retried for 30 minutes. Priority: never refreshed, then
        // mostly failing, then added within 3 days, then the rest; oldest
        // refresh (or creation) time first within each group.
        $sql = "
            SELECT id, rss_url, url, title, last_refresh, failure_count, success_count, created_at
            FROM urlnav_urls
            WHERE is_active = 1
              AND rss_url IS NOT NULL
              AND TRIM(rss_url) != ''
              AND (
                    last_refresh IS NULL
                 OR last_refresh < datetime('now', '-1 hour')
                 OR (failure_count > success_count AND last_refresh < datetime('now', '-30 minutes'))
              )
            ORDER BY
                CASE
                    WHEN last_refresh IS NULL THEN 0
                    WHEN failure_count > success_count THEN 1
                    WHEN created_at > datetime('now', '-3 days') THEN 2
                    ELSE 3
                END,
                CASE WHEN last_refresh IS NULL THEN created_at ELSE last_refresh END ASC
            LIMIT ?
        ";
        $stmt = $db->prepare($sql);
        // Bind as integer: LIMIT expects a number, not a string.
        $stmt->bindValue(1, $limit, PDO::PARAM_INT);
        $stmt->execute();
        $urls = $stmt->fetchAll(PDO::FETCH_ASSOC);

        if (empty($urls)) {
            error_log("UrlNav: 没有需要立即刷新的RSS源,选择一些较久没刷新的");

            // SQLite already sorts NULLs first in ascending order; the
            // explicit NULLS FIRST clause needs SQLite >= 3.30 and broke
            // the query on older builds.
            $sql = "
                SELECT id, rss_url, url, title, last_refresh, failure_count, success_count
                FROM urlnav_urls
                WHERE is_active = 1
                  AND rss_url IS NOT NULL
                  AND TRIM(rss_url) != ''
                ORDER BY last_refresh ASC
                LIMIT ?
            ";
            $stmt = $db->prepare($sql);
            $stmt->bindValue(1, min($limit, 5), PDO::PARAM_INT);
            $stmt->execute();
            $urls = $stmt->fetchAll(PDO::FETCH_ASSOC);
        }

        if (empty($urls)) {
            error_log("UrlNav: 没有需要刷新的RSS网址");
            return array(
                'success' => true,
                'message' => '没有需要刷新的RSS网址',
                'successCount' => 0,
                'failureCount' => 0,
                'newArticles' => 0,
                'totalFeeds' => 0,
                'urlCount' => 0,
                'successRssUrls' => array(),
                'failedRssUrls' => array()
            );
        }

        $urlTotal = count($urls);
        error_log("UrlNav: 获取到 " . $urlTotal . " 个需要刷新的RSS源");

        foreach ($urls as $url) {
            $refreshStatus = $url['last_refresh'] ? "最后刷新: " . $url['last_refresh'] : "从未刷新";
            error_log("UrlNav: 选中 - ID: {$url['id']}, {$refreshStatus}, URL: {$url['rss_url']}");
        }

        $successCount = 0;
        $failureCount = 0;
        $totalFeeds = 0;
        $newArticles = 0;
        $successRssUrls = array();
        $failedRssUrls = array();

        $timeout = intval($config->fetchTimeout ?? 15);
        $retryTimes = intval($config->retryTimes ?? 2);
        $maxFeeds = intval($config->maxFeedsPerSite ?? 20);

        $lastOutputTime = $startTime;

        foreach ($urls as $index => $url) {
            $currentTime = microtime(true);

            // Leave headroom below the 300 s execution limit.
            if ($isCron && ($currentTime - $startTime) > 240) {
                error_log('UrlNav: 接近总超时(4分钟),停止处理');
                break;
            }

            // Keep-alive chunk roughly every 3 seconds. This used to be
            // guarded by !headers_sent(), which is always false once the
            // first chunk has been flushed, so it never fired.
            if ($isCron && ($currentTime - $lastOutputTime) > 3) {
                $emit('{"status":"processing","progress":"' . ($index + 1) . '/' . $urlTotal . '","timestamp":' . time() . '}');
                $lastOutputTime = $currentTime;
            }

            try {
                error_log("UrlNav: [开始] 处理RSS #" . ($index + 1) . " - ID: " . $url['id'] . ", URL: " . $url['rss_url']);
                error_log("UrlNav: 最后刷新时间: " . ($url['last_refresh'] ?: '从未刷新'));

                $urlResult = self::refreshSingleRssUrl($url, $timeout, $retryTimes, $maxFeeds);

                if ($urlResult['success']) {
                    $successCount++;
                    $newArticles += $urlResult['new_articles'];
                    $totalFeeds += $urlResult['total_feeds'];
                    $successRssUrls[] = $url['rss_url'];

                    error_log("UrlNav: [成功] ID: " . $url['id'] . ", 新增文章: " . $urlResult['new_articles'] . ", RSS: " . $url['rss_url']);
                } else {
                    $failureCount++;
                    $failedRssUrls[] = $url['rss_url'];

                    error_log("UrlNav: [失败] ID: " . $url['id'] . ", 错误: " . ($urlResult['error'] ?? '未知错误') . ", RSS: " . $url['rss_url']);
                }

                // Pause between sources to go easy on remote servers.
                if ($index < $urlTotal - 1) {
                    usleep(800000);
                }
            } catch (Exception $e) {
                $failureCount++;
                $failedRssUrls[] = $url['rss_url'] . " [异常]";
                error_log('UrlNav: [异常] ID: ' . $url['id'] . ', RSS: ' . $url['rss_url'] . ', 异常: ' . $e->getMessage());
            }
        }

        $duration = round(microtime(true) - $startTime, 2);

        self::logRefresh($isCron ? 'cron' : 'manual', $successCount, $totalFeeds, $urlTotal, $newArticles, null, $duration, 'rss', $successRssUrls, $failedRssUrls);

        $message = "刷新完成:成功 {$successCount} 个,失败 {$failureCount} 个";

        $result = array(
            'success' => $successCount > 0 || $urlTotal == 0,
            'successCount' => $successCount,
            'failureCount' => $failureCount,
            'newArticles' => $newArticles,
            'totalFeeds' => $totalFeeds,
            'urlCount' => $urlTotal,
            'duration' => $duration,
            'message' => $message,
            'successRssUrls' => $successRssUrls,
            'failedRssUrls' => $failedRssUrls
        );

        error_log("UrlNav: [完成] RSS刷新完成,耗时 {$duration} 秒,{$message}");
        error_log("===== UrlNav RSS刷新结束 =====");

        return $result;
    } catch (Exception $e) {
        error_log('UrlNav: [全局异常] 刷新失败: ' . $e->getMessage());
        error_log("===== UrlNav RSS刷新异常结束 =====");

        return array(
            'success' => false,
            'message' => '刷新失败: ' . $e->getMessage(),
            'successRssUrls' => array(),
            'failedRssUrls' => array()
        );
    }
}
/**
 * Collect refresh statistics for all active sources that have an RSS URL.
 *
 * Returned keys, in order: total_rss_sources, never_refreshed,
 * today_refreshed, week_refreshed, oldest_refresh, oldest_days (only when a
 * refresh timestamp exists), need_refresh, total_success, total_failure,
 * total_refreshes and success_rate (percentage, one decimal).
 *
 * @return array The statistics, or an empty array when a query fails.
 */
public static function getRssRefreshStatus()
{
    // Filter shared by every query below.
    $activeRss = "is_active = 1 AND rss_url IS NOT NULL AND TRIM(rss_url) != ''";

    try {
        $pdo = self::getDbConnection();

        // Runs a single-value query and returns its first column.
        $scalar = function ($sql) use ($pdo) {
            return $pdo->query($sql)->fetchColumn();
        };

        $report = array();

        $report['total_rss_sources'] = $scalar("SELECT COUNT(*) as total FROM urlnav_urls WHERE " . $activeRss);
        $report['never_refreshed'] = $scalar("SELECT COUNT(*) as never_refreshed FROM urlnav_urls WHERE " . $activeRss . " AND last_refresh IS NULL");
        $report['today_refreshed'] = $scalar("SELECT COUNT(*) as today_refreshed FROM urlnav_urls WHERE " . $activeRss . " AND date(last_refresh) = date('now')");
        $report['week_refreshed'] = $scalar("SELECT COUNT(*) as week_refreshed FROM urlnav_urls WHERE " . $activeRss . " AND last_refresh >= datetime('now', '-7 days')");

        $earliest = $scalar("SELECT MIN(last_refresh) as oldest_refresh FROM urlnav_urls WHERE " . $activeRss . " AND last_refresh IS NOT NULL");
        $report['oldest_refresh'] = $earliest;
        if ($earliest) {
            $ageSeconds = time() - strtotime($earliest);
            $report['oldest_days'] = round($ageSeconds / 86400, 1);
        }

        // Sources never refreshed or not refreshed for more than a day.
        $report['need_refresh'] = $scalar("SELECT COUNT(*) as need_refresh FROM urlnav_urls WHERE " . $activeRss . " AND (last_refresh IS NULL OR last_refresh < datetime('now', '-1 day'))");

        $totals = $pdo
            ->query("SELECT SUM(success_count) as total_success, SUM(failure_count) as total_failure, SUM(refresh_count) as total_refreshes FROM urlnav_urls WHERE " . $activeRss)
            ->fetch(PDO::FETCH_ASSOC);

        foreach (array('total_success', 'total_failure', 'total_refreshes') as $field) {
            $report[$field] = $totals[$field] ?? 0;
        }

        if ($report['total_refreshes'] > 0) {
            $report['success_rate'] = round(($report['total_success'] / $report['total_refreshes']) * 100, 1);
        } else {
            $report['success_rate'] = 0;
        }

        return $report;
    } catch (Exception $error) {
        error_log('UrlNav: 获取刷新状态失败: ' . $error->getMessage());
        return array();
    }
}
$countStats = $stmt->fetch(PDO::FETCH_ASSOC); $stats['total_success'] = $countStats['total_success'] ?? 0; $stats['total_failure'] = $countStats['total_failure'] ?? 0; $stats['total_refreshes'] = $countStats['total_refreshes'] ?? 0; $stats['success_rate'] = $stats['total_refreshes'] > 0 ? round(($stats['total_success'] / $stats['total_refreshes']) * 100, 1) : 0; return $stats; } catch (Exception $e) { error_log('UrlNav: 获取刷新状态失败: ' . $e->getMessage()); return array(); } } private static function refreshSingleRssUrl($url, $timeout = 8, $retryTimes = 1, $maxFeeds = 10) { $urlId = $url['id']; $rssUrl = trim($url['rss_url']); error_log("UrlNav: === 开始处理RSS ID: {$urlId} ==="); error_log("UrlNav: RSS URL: {$rssUrl}"); try { $db = self::getDbConnection(); // 更新刷新统计 $stmt = $db->prepare("UPDATE urlnav_urls SET refresh_count = refresh_count + 1 WHERE id = ?"); $stmt->execute(array($urlId)); error_log("UrlNav: 更新刷新统计成功"); // 解析RSS内容 error_log("UrlNav: 开始解析RSS内容..."); $feeds = self::parseRssFeedWithRetry($rssUrl, $retryTimes, $timeout); error_log("UrlNav: RSS解析完成,获取到 " . count($feeds) . " 篇文章"); if (empty($feeds)) { error_log("UrlNav: 没有获取到文章数据"); $stmt = $db->prepare(" UPDATE urlnav_urls SET last_refresh = CURRENT_TIMESTAMP, last_error = '无可用数据' WHERE id = ? "); $stmt->execute(array($urlId)); error_log("UrlNav: === 处理完成(无数据)==="); return array( 'success' => true, 'new_articles' => 0, 'total_feeds' => 0, 'error' => null ); } // 限制每个站点最大文章数 $feeds = array_slice($feeds, 0, $maxFeeds); error_log("UrlNav: 限制后文章数: " . count($feeds)); $addedCount = 0; foreach ($feeds as $feedIndex => $feed) { try { // 确保所有必要字段都有值 $title = !empty($feed['title']) ? substr($feed['title'], 0, 255) : '无标题'; $link = !empty($feed['link']) ? substr($feed['link'], 0, 500) : $url['url']; $description = !empty($feed['description']) ? substr($feed['description'], 0, 1000) : ''; $fullContent = !empty($feed['full_content']) ? 
substr($feed['full_content'], 0, 5000) : $description; // 使用完整内容,如果不存在则使用描述 $pubDate = !empty($feed['pubDate']) ? $feed['pubDate'] : date('Y-m-d H:i:s'); $guid = !empty($feed['guid']) ? substr($feed['guid'], 0, 255) : md5($link . $pubDate); error_log("UrlNav: 处理文章 #" . ($feedIndex+1) . ": {$title}"); // 使用INSERT OR IGNORE避免冲突 $stmt = $db->prepare(" INSERT OR IGNORE INTO urlnav_rss_cache (url_id, feed_title, feed_link, feed_description, full_content, pub_date, guid, cached_at, is_fresh) VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, 1) "); $stmt->execute(array( $urlId, $title, $link, $description, $fullContent, $pubDate, $guid )); if ($stmt->rowCount() > 0) { $addedCount++; error_log("UrlNav: 文章 #" . ($feedIndex+1) . " 插入成功"); } else { error_log("UrlNav: 文章 #" . ($feedIndex+1) . " 已存在,跳过"); } } catch (Exception $e) { error_log('UrlNav: 文章处理异常: ' . $e->getMessage()); // 继续处理下一篇文章 continue; } } // 更新URL统计信息 $stmt = $db->prepare(" UPDATE urlnav_urls SET success_count = success_count + 1, last_refresh = CURRENT_TIMESTAMP, last_error = NULL WHERE id = ? "); $stmt->execute(array($urlId)); error_log("UrlNav: 成功解析RSS - ID: {$urlId}, 获取到 " . count($feeds) . " 篇文章, 新增 {$addedCount} 篇"); error_log("UrlNav: === 处理完成(成功)==="); return array( 'success' => true, 'new_articles' => $addedCount, 'total_feeds' => count($feeds), 'error' => null ); } catch (Exception $e) { // 记录错误信息 $errorMessage = substr($e->getMessage(), 0, 500); error_log("UrlNav: RSS解析失败 - 错误: {$errorMessage}"); $stmt = $db->prepare(" UPDATE urlnav_urls SET failure_count = failure_count + 1, last_refresh = CURRENT_TIMESTAMP, last_error = ? WHERE id = ? 
"); $stmt->execute(array($errorMessage, $urlId)); error_log("UrlNav: === 处理完成(失败)==="); return array( 'success' => false, 'new_articles' => 0, 'total_feeds' => 0, 'error' => $errorMessage ); } } /** * 获取需要刷新的网址数量 - 新增方法 */ public static function getUrlsNeedingRefresh() { try { $db = self::getDbConnection(); $sql = " SELECT COUNT(*) as count FROM urlnav_urls WHERE is_active = 1 AND rss_url IS NOT NULL AND TRIM(rss_url) != '' AND ( last_refresh IS NULL OR last_refresh <= datetime('now', '-1 hour') ) "; $stmt = $db->query($sql); $result = $stmt->fetch(PDO::FETCH_ASSOC); return $result['count'] ?? 0; } catch (Exception $e) { error_log('UrlNav: 获取需要刷新的网址数量失败: ' . $e->getMessage()); return 0; } } /** * 带重试机制的RSS解析 - 优化版 */ private static function parseRssFeedWithRetry($rssUrl, $retryTimes = 1, $timeout = 8) { $lastError = null; for ($i = 0; $i <= $retryTimes; $i++) { try { if ($i > 0) { // 重试前等待一段时间 sleep($i * 2); error_log("UrlNav: RSS重试第{$i}次: {$rssUrl}"); } $feeds = self::parseRssFeed($rssUrl, $timeout); return $feeds; } catch (Exception $e) { $lastError = $e; $errorMsg = $e->getMessage(); // 如果是DNS错误,尝试使用IP直接访问(针对特定域名) if (strpos($errorMsg, 'getaddrinfo failed') !== false && strpos($rssUrl, 'windful.cn') !== false) { // 尝试使用IP访问(需要你知道windful.cn的IP) // $rssUrl = str_replace('https://windful.cn/', 'https://[IP地址]/', $rssUrl); error_log("UrlNav: DNS解析失败,建议检查windful.cn域名是否正常"); } if ($i < $retryTimes) { error_log("UrlNav: RSS解析失败,第" . ($i+1) . "次重试: " . $errorMsg); } } } // 所有重试都失败 throw new Exception("RSS解析失败: " . $lastError->getMessage()); } /** * 解析RSS源 - 完整功能增强版(修改全文字段逻辑) */ private static function parseRssFeed($rssUrl, $timeout = 8) { error_log("UrlNav: >>> 开始解析RSS: {$rssUrl}"); try { // 设置超时时间(保持原样) $context = stream_context_create(array( 'http' => array( 'timeout' => $timeout, 'ignore_errors' => true, 'header' => "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\r\n" . 
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" . "Accept-Language: zh-CN,zh;q=0.9,en;q=0.8\r\n" ), 'ssl' => array( 'verify_peer' => false, 'verify_peer_name' => false, 'allow_self_signed' => true ) )); error_log("UrlNav: 尝试获取RSS内容..."); $content = @file_get_contents($rssUrl, false, $context); if ($content === false) { $error = error_get_last(); $errorMsg = $error['message'] ?? '未知错误'; error_log("UrlNav: file_get_contents失败: {$errorMsg}"); if (isset($http_response_header)) { error_log("UrlNav: HTTP响应头: " . implode(" | ", $http_response_header)); } throw new Exception('无法获取RSS内容: ' . $errorMsg); } error_log("UrlNav: 获取内容成功,长度: " . strlen($content) . " 字节"); // 检查HTTP状态码(保持原样) if (isset($http_response_header[0])) { error_log("UrlNav: HTTP状态: {$http_response_header[0]}"); if (strpos($http_response_header[0], '404') !== false) { throw new Exception('RSS源不存在 (404)'); } if (strpos($http_response_header[0], '403') !== false) { throw new Exception('拒绝访问 (403)'); } if (strpos($http_response_header[0], '500') !== false) { throw new Exception('服务器内部错误 (500)'); } } if (empty($content) || trim($content) === '') { error_log("UrlNav: RSS内容为空"); throw new Exception('RSS内容为空'); } // 处理可能存在的BOM头(保持原样) if (substr($content, 0, 3) == "\xEF\xBB\xBF") { $content = substr($content, 3); error_log("UrlNav: 已移除BOM头"); } // 简单的XML修复(保持原样) $content = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/', '', $content); $content = preg_replace('/&(?!(amp|lt|gt|quot|apos|#\d+);)/', '&', $content); // 🆕 增强:尝试多种XML解析方式,确保兼容性 libxml_use_internal_errors(true); libxml_clear_errors(); $xml = null; // 方式1:先尝试DOMDocument(最兼容WordPress/Typecho) try { $dom = new DOMDocument(); $dom->recover = true; $dom->strictErrorChecking = false; if (@$dom->loadXML($content)) { error_log("UrlNav: 使用DOMDocument解析成功"); $xml = simplexml_import_dom($dom); } } catch (Exception $e) { error_log("UrlNav: DOMDocument解析失败: " . 
$e->getMessage()); } // 方式2:如果DOMDocument失败,使用SimpleXML if ($xml === null) { error_log("UrlNav: 尝试SimpleXML解析..."); $xml = simplexml_load_string($content, 'SimpleXMLElement', LIBXML_NOCDATA); } if ($xml === false) { $errorMsg = 'XML解析失败'; $xmlErrors = libxml_get_errors(); if (!empty($xmlErrors)) { $errorMsg .= ': ' . $xmlErrors[0]->message; error_log("UrlNav: XML错误: " . $xmlErrors[0]->message); } libxml_clear_errors(); if (strpos($content, 'fullTextPerSite ?? 3); $pageFetchTimeout = intval($config->pageFetchTimeout ?? 10); $fullTextCount = 0; // 计数器 // 检查是否在白名单中(保持原样) $selector = self::isInFullTextWhitelist($rssUrl); $isInWhitelist = ($selector !== false); error_log("UrlNav: 白名单检查 - 是否在白名单: " . ($isInWhitelist ? '是' : '否') . ($isInWhitelist ? ",选择器: {$selector}" : "")); // ========== RSS格式解析 ========== if (isset($xml->channel) && isset($xml->channel->item)) { error_log("UrlNav: 检测到RSS格式 (channel->item)"); foreach ($xml->channel->item as $itemIndex => $item) { // 🆕 增强:安全处理每个item,防止一个item失败影响全部 try { // 基础内容获取(保持原样) $fullContent = ''; $description = isset($item->description) ? (string)$item->description : ''; $articleTitle = isset($item->title) ? (string)$item->title : '无标题文章'; $articleLink = isset($item->link) ? (string)$item->link : ''; // 确保标题不为空(保持原样) if (empty($articleTitle)) { $articleTitle = '未命名文章 ' . date('Y-m-d H:i:s'); } // 如果链接为空,尝试使用guid(保持原样) if (empty($articleLink) && isset($item->guid)) { $articleLink = (string)$item->guid; } error_log("UrlNav: 处理文章: {$articleTitle}"); // 🆕 增强:更好的content:encoded提取(处理WordPress/Typecho) $namespaces = $item->getNamespaces(true); // 1. 优先获取content:encoded(WordPress完整内容) $encodedContent = ''; if (isset($namespaces['content'])) { $contentNs = $item->children($namespaces['content']); if (isset($contentNs->encoded)) { $encodedContent = (string)$contentNs->encoded; if (!empty($encodedContent) && trim($encodedContent) !== '') { $fullContent = $encodedContent; error_log("UrlNav: ✓ 找到content:encoded完整内容,长度: " . 
strlen($fullContent)); } } } // 2. 如果没有content:encoded,使用description if (empty($fullContent) && !empty($description)) { $fullContent = $description; error_log("UrlNav: 使用description作为内容,长度: " . strlen($description)); } // 3. 尝试dc:description命名空间 if (empty($fullContent) && isset($namespaces['dc'])) { $dcNs = $item->children($namespaces['dc']); if (isset($dcNs->description) && !empty((string)$dcNs->description)) { $fullContent = (string)$dcNs->description; error_log("UrlNav: 找到dc:description内容"); } } // 4. 尝试item的直接子元素(保持原样) if (empty($fullContent)) { foreach ($item->children() as $child) { $childName = $child->getName(); $childContent = (string)$child; // 跳过已知的短字段 if (in_array($childName, ['title', 'link', 'guid', 'pubDate', 'author', 'category'])) { continue; } // 如果子元素内容较长,可能是文章内容 if (strlen($childContent) > 100) { $fullContent = $childContent; error_log("UrlNav: 从子元素 {$childName} 提取内容"); break; } } } // ===== 页面抓取判断逻辑(完全保持不变) ===== $pageContent = null; $rssContentLength = strlen($fullContent); // 判断逻辑:只有在白名单中且未超过限制才抓取 if ($isInWhitelist && $fullTextCount < $fullTextPerSite) { $needPageFetch = true; $fullTextCount++; error_log("UrlNav: 白名单抓取全文 #{$fullTextCount}/{$fullTextPerSite} - {$articleTitle}"); } else { $needPageFetch = false; if ($isInWhitelist && $fullTextCount >= $fullTextPerSite) { error_log("UrlNav: 已达白名单抓取限制({$fullTextCount}/{$fullTextPerSite}),跳过"); } elseif (!$isInWhitelist) { error_log("UrlNav: 非白名单网站,使用RSS摘要({$rssContentLength}字符),不抓取全文"); } } // 执行页面抓取(仅白名单) if ($needPageFetch && !empty($articleLink)) { // 短暂延迟,避免对服务器压力过大 if ($itemIndex > 0) { usleep(rand(300000, 800000)); // 300-800ms延迟 } // 使用选择器抓取 $pageContent = self::fetchFullContentWithSelector($articleLink, $selector, $pageFetchTimeout); if (!empty($pageContent)) { $pageLength = strlen($pageContent); if ($pageLength > $rssContentLength + 300) { $fullContent = $pageContent; error_log("UrlNav: ✓ 页面抓取成功,获得 {$pageLength} 字符内容"); } elseif ($pageLength > 0) { // 合并内容 $fullContent = $fullContent . 
"\n\n[页面补充内容]\n" . $pageContent; error_log("UrlNav: ✓ 合并页面内容,总长度: " . strlen($fullContent)); } else { error_log("UrlNav: ✗ 页面抓取未获得内容"); } } else { error_log("UrlNav: ✗ 页面抓取失败"); } } // ===== 页面抓取逻辑结束 ===== // 🔴 修改:非白名单网站全文字段处理逻辑 if (!$isInWhitelist) { // 非白名单网站,判断 description 或 content:encoded 是否大于500字 $descriptionLength = strlen($description); $encodedContentLength = strlen($encodedContent); // 只要 description 或 content:encoded 任意一个大于500字就存入全文 if ($descriptionLength >= 500 || $encodedContentLength >= 500) { // 有足够长的内容,存入全文字段 // 内容清理和截断 $fullContent = preg_replace('/\s+/', ' ', $fullContent); if (strlen($fullContent) > 10000) { $fullContent = substr($fullContent, 0, 10000) . '... [内容已截断]'; error_log("UrlNav: 内容过长,已截断至10000字符"); } error_log("UrlNav: 非白名单网站,description({$descriptionLength})或content:encoded({$encodedContentLength})长度≥500字,存入全文字段"); } else { // 内容太短,留空不存储 $fullContent = ''; error_log("UrlNav: 非白名单网站,description({$descriptionLength})和content:encoded({$encodedContentLength})都小于500字,全文字段留空"); } } else { // 白名单网站保持原有逻辑 if (!empty($fullContent)) { // 移除过多的空白字符 $fullContent = preg_replace('/\s+/', ' ', $fullContent); // 截断到合理长度 if (strlen($fullContent) > 10000) { $fullContent = substr($fullContent, 0, 10000) . '... [内容已截断]'; error_log("UrlNav: 内容过长,已截断至10000字符"); } } else { error_log("UrlNav: 警告:未找到任何内容"); $fullContent = $description; } } // 获取发布时间(保持原样) $pubDate = date('Y-m-d H:i:s', strtotime((string)$item->pubDate)); // 获取GUID(保持原样) $guid = (string)$item->guid; $feeds[] = array( 'title' => $articleTitle, 'link' => $articleLink, 'description' => $description, 'full_content' => $fullContent, // 🔴 现在非白名单网站可能为空 'pubDate' => $pubDate, 'guid' => $guid ); error_log("UrlNav: ✓ 文章解析完成: {$articleTitle}"); } catch (Exception $e) { // 🆕 增强:单个item失败不影响其他item error_log("UrlNav: 文章处理失败,跳过: " . 
$e->getMessage()); continue; } } } // ========== Atom格式解析(保持原样但应用相同逻辑修改) ========== elseif (isset($xml->entry) || ($xml->getName() == 'feed' && isset($xml->children('http://www.w3.org/2005/Atom')->entry))) { error_log("UrlNav: 检测到Atom格式"); // 获取所有entry元素(保持原样) $entries = isset($xml->entry) ? $xml->entry : $xml->children('http://www.w3.org/2005/Atom')->entry; foreach ($entries as $entryIndex => $entry) { $link = ''; $title = ''; $description = ''; $fullContent = ''; $pubDate = ''; $guid = ''; // 获取链接(完全保持不变) if (isset($entry->link)) { foreach ($entry->link as $linkElem) { $attributes = $linkElem->attributes(); if ((string)$attributes['rel'] == 'alternate' || empty((string)$attributes['rel'])) { $link = (string)$attributes['href']; break; } } } // 如果没有找到链接,使用id作为链接(完全保持不变) if (empty($link) && isset($entry->id)) { $link = (string)$entry->id; } // 获取标题(完全保持不变) if (isset($entry->title)) { $title = (string)$entry->title; } // 获取描述(summary)(完全保持不变) if (isset($entry->summary)) { $description = (string)$entry->summary; } // ===== Atom全文抓取 ===== // 1. 优先获取content元素(完全保持不变) $atomContent = ''; if (isset($entry->content)) { $contentElem = $entry->content; $attributes = $contentElem->attributes(); // 检查type属性 $type = (string)($attributes['type'] ?? ''); if ($type === 'html' || $type === 'xhtml' || empty($type)) { $atomContent = (string)$contentElem; $fullContent = $atomContent; error_log("UrlNav: 找到Atom content完整内容,类型: {$type},长度: " . strlen($fullContent)); } elseif ($type === 'text') { $atomContent = htmlspecialchars((string)$contentElem); $fullContent = $atomContent; error_log("UrlNav: 找到Atom text内容,长度: " . strlen($fullContent)); } } // 2. 如果没有content,尝试summary(完全保持不变) if (empty($fullContent) && isset($entry->summary)) { $fullContent = $description; error_log("UrlNav: 使用Atom summary作为内容,长度: " . strlen($fullContent)); } // 3. 
检查是否有CDATA包裹(完全保持不变) if (!empty($fullContent) && strpos($fullContent, '/s', $fullContent, $matches)) { $fullContent = $matches[1]; error_log("UrlNav: 从CDATA提取Atom内容"); } } // ===== Atom格式的页面抓取判断(完全保持不变) ===== $pageContent = null; $atomContentLength = strlen($fullContent); // 判断逻辑:只有在白名单中且未超过限制才抓取 if ($isInWhitelist && $fullTextCount < $fullTextPerSite) { $needPageFetch = true; $fullTextCount++; error_log("UrlNav Atom: 白名单抓取全文 #{$fullTextCount}/{$fullTextPerSite} - {$title}"); } else { $needPageFetch = false; error_log("UrlNav Atom: " . ($isInWhitelist ? "已达限制" : "非白名单") . ",使用Atom内容({$atomContentLength}字符)"); } // 执行Atom页面抓取(仅白名单) if ($needPageFetch && !empty($link)) { if ($entryIndex > 0) { usleep(rand(300000, 800000)); } $pageContent = self::fetchFullContentWithSelector($link, $selector, $pageFetchTimeout); if (!empty($pageContent) && strlen($pageContent) > $atomContentLength + 300) { $fullContent = $pageContent; error_log("UrlNav: ✓ Atom页面抓取成功"); } } // ===== Atom页面抓取结束 ===== // 🔴 修改:Atom格式的非白名单网站全文字段处理 if (!$isInWhitelist) { // 非白名单网站,判断 summary 或 content 是否大于500字 $descriptionLength = strlen($description); $atomContentLength = strlen($atomContent); // 只要 summary 或 content 任意一个大于500字就存入全文 if ($descriptionLength >= 500 || $atomContentLength >= 500) { // 有足够长的内容,存入全文字段 // 内容截断 if (strlen($fullContent) > 10000) { $fullContent = substr($fullContent, 0, 10000) . '... [内容已截断]'; error_log("UrlNav: Atom内容过长,已截断"); } error_log("UrlNav Atom: 非白名单网站,summary({$descriptionLength})或content({$atomContentLength})长度≥500字,存入全文字段"); } else { // 内容太短,留空不存储 $fullContent = ''; error_log("UrlNav Atom: 非白名单网站,summary({$descriptionLength})和content({$atomContentLength})都小于500字,全文字段留空"); } } else { // 白名单网站保持原有逻辑 // 4. 内容截断(完全保持不变) if (!empty($fullContent) && strlen($fullContent) > 10000) { $fullContent = substr($fullContent, 0, 10000) . '... 
[内容已截断]'; error_log("UrlNav: Atom内容过长,已截断"); } } // ===== Atom全文抓取结束 ===== // 获取发布时间(updated或published)(完全保持不变) if (isset($entry->updated)) { $pubDate = date('Y-m-d H:i:s', strtotime((string)$entry->updated)); } elseif (isset($entry->published)) { $pubDate = date('Y-m-d H:i:s', strtotime((string)$entry->published)); } else { $pubDate = date('Y-m-d H:i:s'); } // 获取guid(id)(完全保持不变) if (isset($entry->id)) { $guid = (string)$entry->id; } else { $guid = md5($link . $pubDate); } $feeds[] = array( 'title' => $title, 'link' => $link, 'description' => $description, 'full_content' => $fullContent, // 🔴 现在非白名单网站可能为空 'pubDate' => $pubDate, 'guid' => $guid ); } } // ========== 其他RSS格式解析(保持原样但应用相同逻辑修改) ========== elseif (isset($xml->item)) { error_log("UrlNav: 检测到RSS格式 (直接item)"); foreach ($xml->item as $itemIndex => $item) { // 优先获取完整内容 $fullContent = ''; $description = isset($item->description) ? (string)$item->description : ''; $articleTitle = (string)$item->title; $articleLink = (string)$item->link; // 尝试获取content:encoded(完整内容) $encodedContent = ''; $namespaces = $item->getNamespaces(true); if (isset($namespaces['content'])) { $contentNs = $item->children($namespaces['content']); if (isset($contentNs->encoded)) { $encodedContent = (string)$contentNs->encoded; $fullContent = $encodedContent; error_log("UrlNav: 找到content:encoded完整内容"); } } // 如果没找到content:encoded,使用description if (empty($fullContent) && !empty($description)) { $fullContent = $description; error_log("UrlNav: 使用description作为内容"); } // ===== 其他格式的页面抓取判断 ===== $pageContent = null; $rssContentLength = strlen($fullContent); // 判断逻辑:只有在白名单中且未超过限制才抓取 if ($isInWhitelist && $fullTextCount < $fullTextPerSite) { $needPageFetch = true; $fullTextCount++; error_log("UrlNav Other: 白名单抓取全文 #{$fullTextCount}/{$fullTextPerSite} - {$articleTitle}"); } else { $needPageFetch = false; } // 页面抓取(仅白名单) if ($needPageFetch && !empty($articleLink)) { if ($itemIndex > 0) { usleep(rand(300000, 800000)); } $pageContent = 
self::fetchFullContentWithSelector($articleLink, $selector, $pageFetchTimeout); if (!empty($pageContent) && strlen($pageContent) > strlen($fullContent) + 300) { $fullContent = $pageContent; } } // ===== 其他格式页面抓取结束 ===== // 🔴 修改:其他格式的非白名单网站全文字段处理 if (!$isInWhitelist) { // 非白名单网站,判断 description 或 content:encoded 是否大于500字 $descriptionLength = strlen($description); $encodedContentLength = strlen($encodedContent); // 只要 description 或 content:encoded 任意一个大于500字就存入全文 if ($descriptionLength >= 500 || $encodedContentLength >= 500) { // 有足够长的内容,存入全文字段 // 内容截断 if (!empty($fullContent) && strlen($fullContent) > 10000) { $fullContent = substr($fullContent, 0, 10000) . '... [内容已截断]'; } error_log("UrlNav Other: 非白名单网站,description({$descriptionLength})或content:encoded({$encodedContentLength})长度≥500字,存入全文字段"); } else { // 内容太短,留空不存储 $fullContent = ''; error_log("UrlNav Other: 非白名单网站,description({$descriptionLength})和content:encoded({$encodedContentLength})都小于500字,全文字段留空"); } } else { // 白名单网站保持原有逻辑 // 内容截断 if (!empty($fullContent) && strlen($fullContent) > 10000) { $fullContent = substr($fullContent, 0, 10000) . '... 
[内容已截断]'; } } $feeds[] = array( 'title' => $articleTitle, 'link' => $articleLink, 'description' => $description, 'full_content' => $fullContent, // 🔴 现在非白名单网站可能为空 'pubDate' => date('Y-m-d H:i:s', strtotime((string)$item->pubDate)), 'guid' => (string)$item->guid ); } } // ========== 尝试检测命名空间(保持原样但应用相同逻辑修改) ========== else { // 检查是否有Atom命名空间 $namespaces = $xml->getNamespaces(true); foreach ($namespaces as $ns) { if (strpos($ns, 'www.w3.org/2005/Atom') !== false) { $atom = $xml->children($ns); if (isset($atom->entry)) { error_log("UrlNav: 检测到Atom命名空间格式"); foreach ($atom->entry as $entryIndex => $entry) { $entry = $entry->children($ns); // 获取完整内容 $fullContent = ''; $atomContent = ''; $entryDescription = ''; if (isset($entry->content)) { $atomContent = (string)$entry->content; $fullContent = $atomContent; } if (isset($entry->summary)) { $entryDescription = (string)$entry->summary; if (empty($fullContent)) { $fullContent = $entryDescription; } } // ===== 命名空间格式的页面抓取判断 ===== $needPageFetch = false; $entryLink = isset($entry->link) ? (string)$entry->link : ''; $rssContentLength = strlen($fullContent); // 判断逻辑:只有在白名单中且未超过限制才抓取 if ($isInWhitelist && $fullTextCount < $fullTextPerSite) { $needPageFetch = true; $fullTextCount++; } // 页面抓取(仅白名单) if ($needPageFetch && !empty($entryLink)) { if ($entryIndex > 0) { usleep(rand(300000, 800000)); } $pageContent = self::fetchFullContentWithSelector($entryLink, $selector, $pageFetchTimeout); if (!empty($pageContent)) { $fullContent = $pageContent; } } // ===== 命名空间格式页面抓取结束 ===== // 🔴 修改:命名空间格式的非白名单网站全文字段处理 if (!$isInWhitelist) { // 非白名单网站,判断 summary 或 content 是否大于500字 $descriptionLength = strlen($entryDescription); $contentLength = strlen($atomContent); // 只要 summary 或 content 任意一个大于500字就存入全文 if ($descriptionLength >= 500 || $contentLength >= 500) { // 有足够长的内容,存入全文字段 // 内容截断 if (!empty($fullContent) && strlen($fullContent) > 10000) { $fullContent = substr($fullContent, 0, 10000) . '... 
[内容已截断]'; } error_log("UrlNav Namespace: 非白名单网站,summary({$descriptionLength})或content({$contentLength})长度≥500字,存入全文字段"); } else { // 内容太短,留空不存储 $fullContent = ''; error_log("UrlNav Namespace: 非白名单网站,summary({$descriptionLength})和content({$contentLength})都小于500字,全文字段留空"); } } else { // 白名单网站保持原有逻辑 // 内容截断 if (!empty($fullContent) && strlen($fullContent) > 10000) { $fullContent = substr($fullContent, 0, 10000) . '... [内容已截断]'; } } $feeds[] = array( 'title' => isset($entry->title) ? (string)$entry->title : '', 'link' => $entryLink, 'description' => $entryDescription, 'full_content' => $fullContent, // 🔴 现在非白名单网站可能为空 'pubDate' => isset($entry->updated) ? date('Y-m-d H:i:s', strtotime((string)$entry->updated)) : date('Y-m-d H:i:s'), 'guid' => isset($entry->id) ? (string)$entry->id : '' ); } break; } } } if (empty($feeds)) { error_log("UrlNav: 无法识别的RSS格式"); throw new Exception('无法识别的RSS格式'); } } if (empty($feeds)) { error_log("UrlNav: RSS中没有找到文章内容"); throw new Exception('RSS中没有找到文章内容'); } error_log("UrlNav: 找到 " . count($feeds) . " 篇文章"); error_log("UrlNav: <<< RSS解析成功"); return $feeds; } catch (Exception $e) { error_log("UrlNav: <<< RSS解析失败: " . $e->getMessage()); throw new Exception("解析RSS失败 [{$rssUrl}]: " . 
$e->getMessage()); } } /** * 提取CDATA内容(处理多层或不规范CDATA) * @param string $content 原始内容 * @param string $source 来源标识(用于日志) * @return string 处理后的内容 */ private static function extractCdataContent($content, $source = '') { if (empty($content)) { return $content; } // 如果内容包含CDATA标记 if (strpos($content, '/s', $content, $matches)) { $extracted = $matches[1]; // 如果提取的内容明显比原来短,说明CDATA格式正确 if (strlen($extracted) < strlen($content) * 0.9 && strlen($extracted) > 50) { $content = $extracted; error_log("UrlNav: 从CDATA提取 {$source} 内容 (第{$cdataCount}次)"); } else { // CDATA可能嵌套或不规范,尝试移除CDATA标记 $content = str_replace('', '', $content); error_log("UrlNav: 清理不规范的CDATA标记"); break; } } else { // CDATA格式不正确,直接移除标记 $content = str_replace('', '', $content); error_log("UrlNav: 清理不规范的CDATA标记"); break; } } $finalLength = strlen($content); if ($originalLength != $finalLength) { error_log("UrlNav: CDATA处理完成 {$source},从 {$originalLength} 到 {$finalLength} 字符"); } } return $content; } /** * 从文章页面抓取完整内容 * @param string $articleUrl 文章链接 * @param string $title 文章标题(用于日志) * @param int $timeout 超时时间(秒) * @return string|null 抓取到的内容,失败返回null */ private static function fetchFullContentFromPage($articleUrl, $title = '', $timeout = 10) { error_log("UrlNav: 尝试从页面抓取完整内容: {$articleUrl}"); try { // 设置请求头,模拟浏览器 $context = stream_context_create([ 'http' => [ 'timeout' => $timeout, 'ignore_errors' => true, 'header' => "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\r\n" . "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8\r\n" . "Accept-Language: zh-CN,zh;q=0.9,en;q=0.8\r\n" . "Accept-Encoding: gzip\r\n" . "Connection: close\r\n" . 
"Upgrade-Insecure-Requests: 1", 'method' => 'GET' ], 'ssl' => [ 'verify_peer' => false, 'verify_peer_name' => false, 'allow_self_signed' => true ] ]); $html = @file_get_contents($articleUrl, false, $context); if ($html === false) { $error = error_get_last(); error_log("UrlNav: 无法访问文章页面: " . ($error['message'] ?? '未知错误')); return null; } if (empty($html)) { error_log("UrlNav: 文章页面内容为空"); return null; } $htmlLength = strlen($html); error_log("UrlNav: 获取页面成功,长度: {$htmlLength} 字节"); // 转换编码为UTF-8(如果检测到其他编码) $encoding = 'UTF-8'; if (preg_match('/]*charset=["\']?([a-zA-Z0-9\-_]+)["\']?/i', $html, $matches)) { $encoding = strtoupper($matches[1]); if ($encoding !== 'UTF-8') { $html = mb_convert_encoding($html, 'UTF-8', $encoding); error_log("UrlNav: 检测到编码 {$encoding},已转换为UTF-8"); } } // 提取内容 $fullContent = ''; // 方法1:尝试提取Open Graph描述 if (preg_match('/]*>(.*?)<\/article>/is', '/]*>(.*?)<\/div>/is', '/]*>(.*?)<\/div>/is', // 通用内容区域 '/]*>(.*?)<\/div>/is', '/]*>(.*?)<\/div>/is', // Typecho主题 '/]*>(.*?)<\/div>/is', '/]*>(.*?)<\/div>/is', // 其他常见模式 '/]*>(.*?)<\/div>/is', '/]*>(.*?)<\/div>/is', '/]*>(.*?)<\/div>/is' ]; foreach ($contentPatterns as $pattern) { if (preg_match($pattern, $html, $matches) && isset($matches[1])) { $extracted = $matches[1]; // 移除脚本和样式 $extracted = preg_replace('/]*>.*?<\/script>/is', '', $extracted); $extracted = preg_replace('/]*>.*?<\/style>/is', '', $extracted); $extracted = preg_replace('//s', '', $extracted); // 提取纯文本,但保留段落结构 $extracted = strip_tags($extracted, '