一本大道不卡专区一区二区三区,精品成AV人一区二区三区,欧美另类图片区视频一区在线观看

Nginx如何設置才能精準識別并攔截無用的爬蟲，只保留Googlebot（谷歌）、Bingbot（必應）、Baiduspider（百度）等少數幾個真實有用的爬蟲？

當前位置：點晴教程→知識管理交流 →『技術文檔交流』

admin

2025年9月25日 10:14 本文熱度 1121

可以通過Nginx的$http_user_agent變量結合IP驗證來實現精準識別。以下是完整的配置方案：

1. 基礎User-Agent過濾配置

nginx

# Allow-list of crawler User-Agents. Place this in the http block: nginx
# accepts "map" only in the http context, not inside a server block.
# $allowed_bot is 1 when the User-Agent matches any pattern below, else 0.
# A "~*" prefix marks a case-insensitive regular expression.
map $http_user_agent $allowed_bot {
    default 0;
    
    # Google crawlers ("~*googlebot" alone already matches the
    # -News / -Image / -Video variants listed after it)
    "~*googlebot" 1;
    "~*Googlebot-News" 1;
    "~*Googlebot-Image" 1;
    "~*Googlebot-Video" 1;
    "~*Mediapartners-Google" 1;
    "~*AdsBot-Google" 1;
    
    # Baidu crawlers
    "~*baiduspider" 1;
    "~*Baiduspider-image" 1;
    "~*Baiduspider-news" 1;
    "~*Baiduspider-video" 1;
    
    # Bing crawlers
    "~*bingbot" 1;
    "~*adidxbot" 1;
    
    # Other crawlers that should be allowed
    "~*Twitterbot" 1;
    "~*facebookexternalhit" 1;
    "~*Slackbot" 1;
}
# Reject crawler-like clients that are not on the allow-list.
server {
    listen 80;
    server_name yourdomain.com;

    # nginx does not allow one "if" inside another, so the two conditions
    # (looks like a crawler AND is not allow-listed) are combined via a flag.
    set $block_bot 0;
    if ($http_user_agent ~* (bot|spider|crawler|scraper)) {
        set $block_bot 1;
    }
    # Allow-listed User-Agents ($allowed_bot = 1) are never blocked.
    if ($allowed_bot = 1) {
        set $block_bot 0;
    }
    if ($block_bot = 1) {
        return 444;  # close the connection without sending a response
        # alternative: return 403 "Access Denied";
    }

    # Remaining regular configuration...
    location / {
        # your regular configuration
    }
}

2. 增強版：IP驗證 + User-Agent雙重驗證

nginx

# Official crawler IP ranges (examples only; they must be refreshed periodically).
# $valid_bot_ip is 1 when the client address falls inside a listed range, else 0.
geo $valid_bot_ip {
    default 0;

    # Google ranges
    66.249.64.0/19 1;
    64.233.160.0/19 1;
    66.102.0.0/20 1;
    74.125.0.0/16 1;

    # Baidu ranges
    # Written as true /16 network addresses: a prefix with host bits set
    # (119.63.196.0/16, 123.125.71.0/16) makes nginx warn that the low
    # address bits are meaningless, and it is matched as the whole /16 anyway.
    # NOTE(review): a /16 is probably wider than Baidu's real ranges; confirm and narrow.
    180.76.0.0/16 1;
    119.63.0.0/16 1;
    123.125.0.0/16 1;

    # Bing ranges
    157.55.0.0/16 1;
    207.46.0.0/16 1;
    65.52.0.0/14 1;
}
# Combined check: the User-Agent names a known crawler AND the client IP is
# inside an official range. The lookup string is "<User-Agent>:<0|1>".
map "$http_user_agent:$valid_bot_ip" $is_valid_bot {
    default 0;
    # Real crawler User-Agents carry more text after the bot name
    # (e.g. "... Googlebot/2.1; +http://www.google.com/bot.html)"), so allow
    # anything between the name and the IP flag. The trailing "$" ties ":1" to
    # the appended flag, so a forged User-Agent that itself contains
    # "googlebot:1" cannot satisfy the pattern from an unlisted address.
    "~*googlebot.*:1$" 1;
    "~*baiduspider.*:1$" 1;
    "~*bingbot.*:1$" 1;
    # add further crawlers to verify here...
}
server {
    listen 80;
    server_name yourdomain.com;

    # Strict crawler verification.
    # nginx forbids nested "if", and neither access_log nor limit_req may
    # appear in a server-level "if". The decision is therefore accumulated in
    # $bot_state and acted on by flat, single-level conditions:
    #   "S"  = crawler-like User-Agent that failed verification
    #   "SB" = as above, and also on the known-bad list
    set $bot_state "";
    if ($http_user_agent ~* (bot|spider|crawler|scraper)) {
        set $bot_state "S";
    }
    if ($is_valid_bot = 1) {
        set $bot_state "";
    }
    if ($http_user_agent ~* (ahrefs|semrush|mj12bot)) {
        set $bot_state "${bot_state}B";
    }

    # Known abusive crawlers: drop the connection outright.
    if ($bot_state = "SB") {
        return 444;
    }

    # Other unverified crawlers are handed to the named location below.
    # 418 is only an internal signal; the client receives that location's response.
    error_page 418 = @suspicious_bot;
    if ($bot_state = "S") {
        return 418;
    }

    location @suspicious_bot {
        # Record suspicious crawler traffic in its own log file.
        access_log /var/log/nginx/bot_access.log;

        # limit_req is deliberately not used here: "return" answers during the
        # rewrite phase, before request limiting runs, so it would never apply.
        return 200 "正常頁面內容";  # a simplified page can be returned instead
    }

    location / {
        # regular request handling
    }
}
# Rate-limit zone for crawlers: keyed by client address, zone "bot_limit"
# of size 10m, rate of 1 request per second. limit_req_zone is an
# http-level directive.
limit_req_zone $binary_remote_addr zone=bot_limit:10m rate=1r/s;

3. 針對特定路徑的爬蟲控制

nginx

# robots.txt: answered inline for every client, crawlers included.
location = /robots.txt {
    # Set the MIME type through default_type. add_header would emit a second
    # Content-Type header next to the one "return" already sends.
    default_type text/plain;
    return 200 "User-agent: *\nDisallow: /admin/\nDisallow: /private/\nSitemap: https://yourdomain.com/sitemap.xml";
}
# Sitemap and other .xml / .txt files: crawler-like clients may fetch them
# only when they pass verification ($is_valid_bot = 1).
location ~* \.(xml|txt)$ {
    # Nested "if" is not valid nginx syntax; combine the two tests via a flag.
    set $deny_bot 0;
    if ($http_user_agent ~* (bot|spider|crawler)) {
        set $deny_bot 1;
    }
    if ($is_valid_bot = 1) {
        set $deny_bot 0;
    }
    if ($deny_bot = 1) {
        return 403;
    }
    # regular sitemap handling
}

4. 完整的配置示例

nginx

http {
    # Crawler IP allow-list (refresh it periodically).
    # NOTE(review): if nginx.conf also has "include /etc/nginx/conf.d/*.conf;"
    # at http level, this file would be parsed there as well and fail to load;
    # in that case keep the list outside the globbed directory.
    geo $bot_ip_whitelist {
        default 0;
        include /etc/nginx/conf.d/bot-ips.conf;  # external list of "CIDR 1;" entries
    }

    # User-Agents of crawlers that are allowed.
    map $http_user_agent $allowed_bot_ua {
        default 0;
        "~*(googlebot|baiduspider|bingbot|twitterbot|facebookexternalhit)" 1;
    }

    # User-Agent signatures of unwanted crawlers.
    map $http_user_agent $bad_bot {
        default 0;
        "~*(ahrefs|semrush|mj12bot|dotbot|petalbot)" 1;
    }

    # Rate-limit zone for crawlers (1 request per minute per client address).
    # It is not applied to the placeholder response below, because "return"
    # answers before request limiting runs; reference it with
    # "limit_req zone=bot_limit ..." in locations that serve real content.
    limit_req_zone $binary_remote_addr zone=bot_limit:10m rate=1r/m;

    server {
        listen 80;
        server_name example.com;

        # Unwanted crawlers: close the connection immediately.
        if ($bad_bot = 1) {
            return 444;
        }

        # Suspicious-crawler check. nginx does not support nested "if", so
        # each test appends a letter to $bot_check at the top level:
        #   A = User-Agent looks like a crawler
        #   B = User-Agent is allow-listed
        #   C = client IP is allow-listed (optional check)
        set $bot_check "";
        if ($http_user_agent ~* (bot|spider|crawler)) {
            set $bot_check "A";
        }
        if ($allowed_bot_ua = 1) {
            set $bot_check "${bot_check}B";
        }
        if ($bot_ip_whitelist = 1) {
            set $bot_check "${bot_check}C";
        }

        # Exactly "A" means crawler-like and on neither allow-list.
        # NOTE(review): User-Agents are easily forged; a strict setup would
        # let a crawler-like client through only when the value is "ABC".
        # 418 is only an internal signal routed to the named location below.
        error_page 418 = @suspicious_bot;
        if ($bot_check = "A") {
            return 418;
        }

        location @suspicious_bot {
            # access_log is not permitted in a server-level "if"; it is valid here.
            access_log /var/log/nginx/suspicious_bots.log;

            # Serve the placeholder as HTML rather than the inherited default type.
            default_type text/html;
            return 200 "<!DOCTYPE html><html><head><title>網站維護中</title></head><body></body></html>";
        }

        location / {
            # regular application logic
            try_files $uri $uri/ /index.html;
        }

        # Static assets: long-lived caching headers.
        location ~* \.(jpg|jpeg|png|gif|ico|css|js)$ {
            expires 1y;
            add_header Cache-Control "public, immutable";
        }
    }
}

5. 維護和更新

創建IP列表文件 /etc/nginx/conf.d/bot-ips.conf：

text

# Google ranges
66.249.64.0/19 1;
64.233.160.0/19 1;
# Baidu ranges
# 119.63.0.0/16 replaces 119.63.196.0/16: with host bits set nginx warns that
# the low address bits are meaningless and matches the whole /16 anyway.
180.76.0.0/16 1;
119.63.0.0/16 1;
# Bing ranges
157.55.0.0/16 1;
207.46.0.0/16 1;

定期更新腳本：

bash

#!/bin/bash
# Refresh the crawler IP list.
# Downloads Google's published ranges to /tmp/google_ips.txt. The file is
# replaced only after a complete, non-empty download, so a failed run never
# leaves a truncated list behind.
# NOTE(review): goog.txt appears to list Google's address space in general,
# not Googlebot alone; confirm it is the intended source.
set -euo pipefail

readonly SRC_URL='https://www.google.com/ipranges/goog.txt'
readonly DEST='/tmp/google_ips.txt'

# Scratch file next to the destination so the final mv is a same-filesystem rename.
tmp_file=$(mktemp "${DEST}.XXXXXX")
trap 'rm -f -- "$tmp_file"' EXIT

# "wget -O" truncates its target before downloading, hence the scratch file.
if ! wget -q -O "$tmp_file" -- "$SRC_URL"; then
  printf 'error: download failed: %s\n' "$SRC_URL" >&2
  exit 1
fi
if [[ ! -s "$tmp_file" ]]; then
  printf 'error: empty response from %s\n' "$SRC_URL" >&2
  exit 1
fi

chmod 0644 -- "$tmp_file"   # mktemp creates the file as 0600
mv -f -- "$tmp_file" "$DEST"
# Process the downloaded ranges and update the configuration file

注意事項

User-Agent可以被偽造，所以IP驗證很重要
定期更新IP段，官方爬蟲IP會變化
監(jiān)控日志，及時發(fā)現(xiàn)新的惡意爬蟲
避免誤殺，測試時先從寬松開始
考慮性能影響，複雜的匹配規則會增加CPU負擔

這種配置可以有效攔截大多數無用爬蟲，同時確保搜索引擎正常收錄。

該文章在 2025/9/25 10:14:07 編輯過

成人欧美一区二区三区的电影,日韩一级一欧美一级国产,国产成人国拍亚洲精品,无码人妻精品一区二区三区毛片,伊人久久无码大香线蕉综合

Nginx如何設置才能精準識別并攔截無用的爬蟲，只保留Googlebot（谷歌）、Bingbot（必應）、Baiduspider（百度）等少數幾個真實有用的爬蟲？

1. 基礎User-Agent過濾配置

2. 增強版：IP驗證 + User-Agent雙重驗證

3. 針對特定路徑的爬蟲控制

4. 完整的配置示例

5. 維護和更新

注意事項

Nginx如何設置才能精準識別并攔截無用的爬蟲，只保留Googlebot（谷歌）、Bingbot（必應）、Baiduspider（百度）等少數幾個真實有用的爬蟲？