# AWSTATS ROBOTS DATABASE #------------------------------------------------------- # If you want to add robots to extend AWStats database detection capabilities, # you must add an entry in RobotsSearchIDOrder_listx and RobotsHashIDLib. # The entry in RobotsSearchIDOrder_listx is a Perl regular expression # (see http://perldoc.perl.org/perlreref.html). AWStats applies these # expressions to the user agent string in the order given by the lists. The # first match specifies the robot. # # Note: This regular expression must not contain any whitespace. # Otherwise AWStats will produce lines in the database that # will be misinterpreted and as a consequence the corresponding data in the # generated HTML reports will be wrong. If you want to match whitespace in # the user agent string, use other constructs like '\s', '[[:blank:]]', # '\p{IsSpace}', '\x20' etc. # # The corresponding entry in RobotsHashIDLib contains the regular expression # as key, followed by a string containing HTML-text. AWStats inserts this # text into reports to describe the bot. If possible the text should contain # a link to the bot home page. This makes it easier for sysadmins to find # the information necessary e.g. to adapt the robots.txt file. # # An entry in the RobotsAffiliateLib is not necessary. An entry in this list # contains as first part the regular expression specifying the bot. The # second part is a string that gives the Company or product managing the bot. # This information is not used yet. # # There are several sorts of bots that AWStats is not able to detect and # therefore a considerable amount of bot generated traffic counts # as user traffic: # # a) A crawler that identifies itself in the referrer string, but not in # the user agent string. An example is the crawler from semalt.semalt.com. # # b) Crawlers that correctly access robots.txt but identify themselves # in the user agent string only once or just a few times. 
Most of the # time a user agent string is used that does not contain hints that # a bot is involved. An example is the iCjobs spider. # msnbot-UDiscovery/2.0b seems to show this behaviour too. # # # #------------------------------------------------------- # 2016-09-02 RobC # Fixed a few errors and added a few missing bots from awstats 7.5 release. # # 2016-08-28 RobC # Complete re-build of this file almost from scratch. # dropped many old bots, added many new bots and reordered file. # edited and added regex expressions to stop spaces causing problems. # You should tune file by placing the most common robots crawling your site at top # in List1. # # # N.B. many bots need to be in correct order so don't change order without checking if # change will cause counts to be allocated to wrong bot. Not always simple. # # # 2005-08-19 Sean Carlos http://www.antezeta.com/awstats.html # added dipsie (not tested with real data). # added DomainsDB.net http://domainsdb.net/ # added ia_archiver-web.archive.org (was inadvertently grouped with Alexa traffic) # added Nutch (used by looksmart (furl?)) # added rssImagesBot # added Sqworm # added t\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e # added w3c css-validator # added documentation link to bot home pages for above and selected major bots. # In the case of international bots, choose .com page. # Included tool tip (html "title"). # To do: parameterize to match both AWStats language and tooltips settings. # To do: add html links for all bots based on current documentation in source # files referenced below. 
# changed '\wbot[\/\-]', to '\wbot[\/\-]' (removed comma) # made minor grammar corrections to notes below # 2005-08-24 added YahooSeeker-Testing # added w3c-checklink # updated url for ask.com # 2005-08-24 added Girafabot http://www.girafa.com/ # 2005-08-30 added PluckFeedCrawler http://www.pluck.com/ # added Gaisbot/3.0 (robot05@gais.cs.ccu.edu.tw; ) # added geniebot (wgao@genieknows.com) # added BecomeBot link http://www.become.com/site_owners.html # added topicblogs http://www.topicblogs.com/ # added Powermarks; seen used by referrer spam # added YahooSeeker # added NG/2. http://www.exabot.com/ # 2005-09-15 added link for Walhello appie # added bender focused_crawler # updated YahooSeeker description (blog crawler) # 2005-09-16 added link for http://linkchecker.sourceforge.net # added ConveraCrawler/0.9d ( http://www.authoritativeweb.com/crawl) # added Blogslive info@blogslive.com intelliseek.com # added BlogPulse (ISSpider-3.0) intelliseek.com # 2005-09-26 added Feedfetcher-Google (http://www.google.com/feedfetcher.html) # added EverbeeCrawler # added Yahoo-Blogs http://help.yahoo.com/help/us/ysearch/crawling/crawling-02.html # added link for Bloglines http://www.bloglines.com # 2005-10-19 fixed Feedfetcher-Google (http://www.google.com/feedfetcher.html) # added Blogshares Spiders (Synchronized V1.5.1) # added yacy # 2005-11-21 added Argus www.simpy.com # added BlogsSay :: RSS Search Crawler (http://www.blogssay.com/) # added MJ12bot http://majestic12.co.uk/bot.php # added OpenTaggerBot (http://www.opentagger.com/opentaggerbot.htm) # added OutfoxBot/0.3 (For internet experiments; outfox.agent@gmail.com) # added RufusBot Rufus Web Miner http://64.124.122.252.webaroo.com/feedback.html # added Seekbot (http://www.seekbot.net/bot.html) # added Yahoo-MMCrawler/3.x (mms-mmcrawler-support@yahoo-inc.com) # added link for BaiDuSpider # added link for Blogshares Spider # added link for StackRambler http://www.rambler.ru/doc/faq.shtml # added link for WISENutbot # added 
link for ZyBorg/1.0 (wn-14.zyborg@looksmart.net; http://www.WISEnutbot.com. Moved location to above wisenut to avoid classification as wisenut # 2005-12-15 # added FAST Enteprise Crawler/6 (www dot fastsearch dot com). Note spelling Enteprise not Enterprise. # added findlinks http://wortschatz.uni-leipzig.de/findlinks/ # added IBM Almaden Research Center WebFountain™ http://www.almaden.ibm.com/cs/crawler [hc3] # added INFOMINE/8.0 VLCrawler (http://infomine.ucr.edu/useragents) # added lmspider (lmspider@scansoft.com) http://www.nuance.com/ # added noxtrumbot http://www.noxtrum.com/ # added SandCrawler (Microsoft) # added SBIder http://www.sitesell.com/sbider.html # added SeznamBot http://fulltext.seznam.cz/ # added sohu-search http://corp.sohu.com/ (looked for //robots.txt not /robots.txt) # added the ruffle SemanticWeb crawler v0.5 - http://www.unreach.net # added WebVulnCrawl/1.0 libwww-perl/5.803 (looked for //robots.txt not /robots.txt) # added Yahoo! Japan keyoshid http://www.yahoo.co.jp/ # added Y!J http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html # added link for GigaBot # added link for MagpieRSS # added link for MSIECrawler # 2005-12-21 # added aipbot http://www.aipbot.com aipbot@aipbot.com [matthys70 users.sourceforge.net] # added Everest-Vulcan Inc./0.1 (R&D project; http://everest.vulcan.com/crawlerhelp) # added Fast-Search-Engine http://www.fast-search-engine.com/ [matthys70 users.sourceforge.net] # added g2Crawler (nobody@airmail.net) http://crawler.instantnetworks.net/ # added Jakarta commons-httpclient http://jakarta.apache.org/commons/httpclient/ (hit robots.txt). May be used as robot or browser - a site may want to remove this entry. # added OmniExplorer_Bot http://www.omni-explorer.com/ [matthys70 users.sourceforge.net] # added USTC-Semantic-Group ai.ustc.edu.cn/mas/en/research/index.php ? 
# 2005-12-22 # added EARTHCOM.info www.earthcom.info # added HTTrack off-line browser 'httrack','HTTrack', http://www.httrack.com/ [Moizes Gabor] # added KummHttp http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_g_l_301105_2\b [Moizes Gabor] # 2006-01-01 # added Dulance http://www.dulance.com/bot.jsp # added MojeekBot http://www.mojeek.com/bot.html # added nicebot http://www.egghelp.org/setup.htm ? # added Snappy http://www.urltrends.com/faq.php # added sohu agent # added VORTEX http://marty.anstey.ca/robots/vortex/ [matthys70 users.sourceforge.net] # added zspider http://feedback.redkolibri.com/ # 2006-01-13 # added boitho.com-dc http://www.boitho.com/dcbot.html # added IRLbot http://irl.cs.tamu.edu/crawler # added virus_detector virus_harvester@securecomputing.com # added Wavefire http://www.wavefire.com; info@wavefire.com # added WebFilter Robot # 2006-01-24 # added Shim-Crawler http://www.logos.ic.i.u-tokyo.ac.jp/crawler/; crawl@logos.ic.i.u-tokyo.ac.jp # added Exabot exabot.com # added LetsCrawl.com http://letscrawl.com # added ichiro http://help.goo.ne.jp/door/crawlerE.html # 2006-01-27 additional 22 robots from a list provided by Moizes Gabor # added ALeadSoftbot http://www.aleadsoft.com/bot.htm # added CipinetBot http://www.cipinet.com/bot.html # added Cuasarbot http://www.cuasar.com/ # added Dumbot http://www.dumbfind.com/ # added Extreme_Picture_Finder http://www.exisoftware.com/ # added Fooky.com/ScorpionBot/ScoutOut http://www.fooky.com/scorpionbots # added IlTrovatore-Setaccio http://www.iltrovatore.it/aiuto/motore_di_ricerca.html bot@iltrovatore.it # added InsurancoBot http://www.fastspywareremoval.com/ # added InternetArchive http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org # added KazoomBot http://www.kazoom.ca/bot.html kazoombot@kazoom.ca # added Kurzor http://www.easymail.hu/ cursor@easymail.hu # added NutchCVS http://lucene.apache.org/nutch/bot.html 
nutch-agent@lucene.apache.org # added NutchOSU-VLIB http://lucene.apache.org/nutch/bot.html nutch-agent@lucene.apache.org # added Orbiter http://www.dailyorbit.com/bot.htm # added PHP_version_tracker http://www.nexen.net/phpversion/bot.php # added SuperBot http://www.sparkleware.com/superbot/ # added SynooBot http://www.synoo.de/bot.html webmaster@synoo.com # added TestBot http://www.agbrain.com/ # added TutorGigBot http://www.tutorgig.info/ # added WebIndexer mailto://webindexerv1@yahoo.com # added WebMiner http://64.124.122.252/feedback.html # 2006-02-01 # added heritrix https://sourceforge.net/forum/message.php?msg_id=3550202 # added Zeus Webster Pro https://sourceforge.net/forum/message.php?msg_id=3141164 # additional robots from a list provided by Moizes Gabor [ mojzi -a-t- free mail hu ] # added Candlelight_Favorites_Inspector # added DomainChecker # added EasyDL # added FavOrg # added Favorites_Sweeper # added Html_Link_Validator # added Internet_Ninja # added JRTwine_Software_Check_Favorites_Utility # fixed Microsoft_URL_Control # added miniRank # added Missigua_Locator # added NPBot # added Ocelli # added Onet.pl_SA # added proodleBot # added SearchGuild_DMOZ_Experiment # added Susie # added Website_Monitoring_Bot # added Xenu_Link_Sleuth # 2006-05-15 # added ASPseek http://www.aspseek.org/ # added AdamM Bot http://home.blic.net/adamm/ # added archive.org_bot http://crawls.archive.org/collections/bncf/crawl.html # added arianna.libero.it (Italian Portal/search engine) # added Biz360 spider http://www.biz360.com # added BlogBridge Service http://www.blogbridge.com/ # added BlogSearch http://www.icerocket.com/ # added libcrawl # added edgeio-retriever http://www.edgeio.com # added FeedFlow http://feedflow.com/about # added Biblioteca Nazionale Centrale di Firenze (Italian National Archive) http://www.bncf.firenze.sbn.it/raccolta.txt # added Java catchall - used by many spam bots # added lanshanbot 
http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_g_l_140406_1%5Cb # added msnbot-media http://search.msn.com/msnbot.htm # added MT::Telegraph::Agent # added Netluchs http://www.netluchs.de/ (German SE bot) # added oBot http://www.webmasterworld.com/forum11/1616.htm # added Onfolio http://www.onfolio.com/ (IE Toolbar plugin) - hit rss feeds. # added ping.blo.gs http://blo.gs/ping.php blog bot # added Sphere Scout http://www.sphere.com/ # added sproose crawler http://www.sproose.com/bot.html # added SyndicAPI http://syndicapi.com/bot.html # added Yahoo! Mindset http://mindset.research.yahoo.com/ # added msrabot # added Vagabondo & Vagabondo-WAP http://www.wise-guys.nl/Contact/index.php?botselected=webagents)#=uk # fixed Missigua Locator detection (Missigua_Locator -> Missigua Locator) # changed echo to echo! to avoid conflict with the bonecho (Firefox 2.0) browser. # This requires you to reprocess historic logs if you want EchO! to be recognized for older reports. # 2006-05-17 # added Alpha Search Agent # 62.152.125.60 Eurologon Srl # added Krugle http://www.krugle.com/crawler/info.html the search engine for developers # added Octora Beta Bot http://www.octora.com/ # Blog and Rss Search Engine # added UbiCrawler http://law.dsi.unimi.it/ubicrawler/ # added Yahoo! Slurp China http://misc.yahoo.com.cn/help.html # You must reprocess old logs for the Yahoo! 
Slurp China bot to be detected in old reports # 2006-05-20 # added 1-More Scanner http://www.myzips.com/software/1-More-Scanner.phtml # added Accoona-AI-Agent http://www.accoona.com/ # added ActiveBookmark http://www.libmaster.com/active_bookmark.php # added BIGLOTRON http://www.biglotron.com/robot.html # added Bookmark-Manager http://bkm.sourceforge.net/ # added cbn00glebot # added Cerberian Drtrs http://www.pgts.com.au/cgi-bin/psql?robot_info=25240 # added CFNetwork http://www.cocoadev.com/index.pl?CFNetwork # added CheckWeb link validator http://p.duby.free.fr/chkweb.htm # added Computer and Automation Research Institute Crawler http://www.ilab.sztaki.hu/~stamas/publications/p184-benczur.html # added ConveraCrawler http://www.authoritativeweb.com/crawl/ # added ConveraMultiMediaCrawler http://www.authoritativeweb.com/crawl/ # added CSE HTML Validator Lite Online http://online.htmlvalidator.com/php/onlinevallite.php # added Cursor http://adcenter.hu/docs/en/bot.html # added Custo http://www.netwu.com/custo/ # added DataFountains/DMOZ Downloader http://infomine.ucr.edu/ # added Deepindex http://www.deepindex.net/faq.php # added DNSGroup http://www.dnsgroup.com/ # added DoCoMo http://www.nttdocomo.co.jp/ # added dumm.de-Bot http://www.dumm.de/ # added ETS v http://www.freetranslation.com/help/ # added eventax http://www.eventax.de/ # added FAST Enterprise Crawler * crawleradmin.t-info@telekom.de http://www.telekom.de/ # added FAST Enterprise Crawler http://www.fast.no/ # added FAST Enterprise Crawler * T-Info_BI_cluster crawleradmin.t-info@telekom.de http://www.telekom.de/ # added FeedValidator http://feedvalidator.org/ # added FilmkameraBot http://www.filmkamera.at/bot.html # added Findexa Crawler http://www.findexa.no/gulesider/article26548.ece # added Global Fetch http://www.wesonet.com/ # added GOFORITBOT http://www.goforit.com/about/ # added GoForIt.com http://www.goforit.com/about/ # added GPU p2p crawler http://gpu.sourceforge.net/search_engine.php # added 
HooWWWer http://cosco.hiit.fi/search/hoowwwer/ # added HPPrint # added HTMLParser http://htmlparser.sourceforge.net/ # added Hundesuche.com-Bot http://www.hundesuche.com/ # added InfoBot http://www.infobot.org/ # added InfociousBot http://corp.infocious.com/tech_crawler.php # added InternetSupervision http://internetsupervision.com/ # added isearch2006 http://www.yahoo.com.cn/ # added IUPUI_Research_Bot http://spamhuntress.com/2005/04/25/a-mail-harvester-visits/ # added KalamBot http://64.124.122.251/feedback.html # added kamano.de NewsFeedVerzeichnis http://www.kamano.de/ # added Kevin http://dznet.com/kevin/ # added KnowItAll http://www.cs.washington.edu/research/knowitall/ # added Knowledge.com http://www.knowledge.com/ # added Kouaa Krawler http://www.kouaa.com/ # added ksibot http://ego.ms.mff.cuni.cz/ # added Link Valet Online http://www.htmlhelp.com/tools/valet/ # added lwp-request http://search.cpan.org/~gaas/libwww-perl-5.69/bin/lwp-request # added lwp-trivial http://search.cpan.org/src/GAAS/libwww-perl-5.805/lib/LWP/Simple.pm # added MapoftheInternet.com http://MapoftheInternet.com/ # added Matrix S.p.A. 
- FAST Enterprise Crawler http://tin.virgilio.it/ # added Megite http://www.megite.com/ # added Metaspinner http://index.meta-spinner.de/ # added Mini-reptile # added Misterbot http://www.misterbot.fr/ # added Miva http://www.miva.com/ # added Mizzu Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_m_141105_2\b # added MSRBOT http://research.microsoft.com/research/sv/msrbot/ # added MS SharePoint Portal Server - MS Search 4.0 Robot http://support.microsoft.com/default.aspx?scid=kb;en-us;284022 # added Mydoyouhike http://www.doyouhike.net/my # added NASA Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_140506_2\b # added NetSprint http://www.netsprint.pl/serwis/ # added NimbleCrawler http://www.healthline.com/ # added OpenWebSpider http://www.openwebspider.org/ # added Oracle Ultra Search http://www.oracle.com/technology/products/ultrasearch/index.html # added OSSProxy http://www.marketscore.com/FAQ.Aspx # added passwordmaker.org http://passwordmaker.org/ # added PEAR HTTP Request class http://pear.php.net/ # added PEERbot http://www.peerbot.com/ # added PHP version tracker http://www.nexen.net/phpversion/bot.php # added PictureOfInternet http://malfunction.org/poi/ # added plinki http://www.plinki.com/ # added Port Huron Labs http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1133\b # added PostFavorites http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_n_s_1135\b # added ProjectWF-java-test-crawler # added PyQuery http://sourceforge.net/projects/pyquery/ # added Schizozilla http://spamhuntress.com/2005/03/18/gizmo/ # added Scumbot # added Sensis Web Crawler http://www.sensis.com.au/ # added snap.com beta crawler http://www.snap.com/ # added Steeler http://www.tkl.iis.u-tokyo.ac.jp/~crawler/ # added STEROID Download 
http://faqs.org.ru/progr/pascal/delphi_internet2.htm # added Suchfin-Bot http://www.suchfin.de/ # added Sunrise http://www.sunrisexp.com/ # added Tagyu Agent http://www.tagyu.com/ # added Tcl http client package http://www.tcl.tk/man/tcl8.4/TclCmd/http.htm # added TeragramCrawlerSURF http://www.teragram.com/ # added Test Crawler http://netp.ath.cx/ # added UnChaos Bot Hybrid Web Search Engine http://www.unchaos.com/ # added unido-bot http://www.unchina.org/unido/unido/our_projects/3_3.html # added UniversalFeedParser http://feedparser.org/ (seen from md301000.inktomisearch.com) # added updated http://www.updated.com/ # added Vermut http://vermut.aol.com # added versus crawler from eda.baykan@epfl.ch http://www.epfl.ch/Eindex.html # added Vespa Crawler (Yahoo Norway?) http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=%5Cbid_t_z_030406_1%5Cb # added VSE http://www.vivisimo.com/ # added webcrawl.net http://www.webcrawl.net/ # added Web Downloader http://www.krasu.ru/soft/chuchelo/ # added Webdup http://www.webdup.com/en/index.html # added Wells Search http://www.psychedelix.com/cgi-bin/csv2html.pl?data=allagents.csv&template=detail.html&match=\bid_t_z_1484\b # added WordPress http://wordpress.org/ # added wume crawler http://wume.cse.lehigh.edu/~xiq204/crawler/ # added Xenu's Link Sleuth (with ') # added xirq http://www.xirq.com/ # added yoogliFetchAgent http://www.yoogli.com/ # added Z-Add Link Checker http://w3.z-add.co.uk/linkcheck/ # -- fix - some robots were reported with _ where _ should have been a space. # changed Xenu Link Sleuth # changed microsoft[_+\s]url[_+\s]control -> microsoft_url_control # changed favorites_sweeper -> favorites_sweeper # -- updates # updated AskJeeves to Ask # 2012-06-05 Albrecht Mueller # added Grabber from SDSC (San Diego Supercomputer Center). 
# 2013-09-30 Albrecht Mueller # AWStats probably cannot detect this bot as it identifies itself in # the referrer field and not in the user agent string. #92.113.100.35 - - [29/Sep/2013:17:22:46 +0200] "GET /robots.txt HTTP/1.1" 200 516 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" #92.113.100.35 - - [29/Sep/2013:17:22:49 +0200] "GET /tghome.htm HTTP/1.1" 200 4445 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" #92.113.100.35 - - [29/Sep/2013:17:22:51 +0200] "GET / HTTP/1.1" 200 5467 "http://extrabot.com/help/frytygativyheku.htm" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0" "-" # to do MS Search 4.0 Robot #package AWSROB; # Robots list was found at http://www.robotstxt.org/wc/active/all.txt # Other robots can be found at http://www.jafsoft.com/searchengines/webbots.html # Rem: To avoid bad detection, some robot's ids were removed from this list: # - Robots with ID of 3 letters only # - Robots called 'webs' and 'tcl' # Rem: directhit changed into direct_hit (its real id) # Rem: calif changed into calif[^r] to avoid confusion between Tiscalifreenet browser # Rem: fish changed into [^a]fish to avoid confusion between Madsafish browser # Rem: roadrunner changed into road_runner # Rem: lycos changed to lycos_ to avoid confusion with lycos-online browser # Rem: voyager changed into ^voyager\/ to avoid to exclude voyager and amigavoyager browser # RobotsSearchIDOrder # It contains all matching criteria to search for in log fields. This list is # used to know in which order to search Robot IDs. # Most frequent ones are in list1, used when LevelForRobotsDetection is 1 or more # Minor robots are in list2, used when LevelForRobotsDetection is 2 or more # Note: Robots IDs are in lower case, '_', ' ' and '+' are changed into '[_+\s]' and are quoted. 
# RobotsSearchIDOrder lists: ordered Perl regexes matched against the user agent string; the first match identifies the robot, so entry order matters. list1 holds the most frequent robots (used when LevelForRobotsDetection is 1 or more), list2 the minor ones (level 2 or more). Patterns must not contain literal whitespace, and each pattern string is also the key of its RobotsHashIDLib entry, so keep the strings unchanged.
#------------------------------------------------------- @RobotsSearchIDOrder_list1 = ( # Common robots (In robot file) 'bingbot/', 'bingpreview', 'MSIECrawler', 'msnbot/', 'msnbot\-media/', 'AdIdxBot/', 'NOT[\x20]Googlebot/', 'Googlebot/', 'Google[\x20]Web[\x20]Preview', 'Googlebot\-Image/', 'Googlebot\-Mobile/', 'google\-sitemaps', 'Googlebot\-News', 'Googlebot\-Video/', 'AdsBot\-Google[\x20]\(', 'AdsBot\-Google\-Mobile\-Apps', 'Mediapartners-Google', 'Feedfetcher\-Google', 'Google\-Adwords\-Instant', 'Firefox/1\.5', 'Yahoo![\x20]Slurp[\x20]China', 'Yahoo![\x20]Slurp', 'Baiduspider/', 'Baiduspider\-image', 'baidu', 'YandexBot/', 'YandexImages/', 'YandexMetrika/', 'YandexMobileBot/', 'yandex', 'electricmonk/', 'spbot/', 'SeznamBot/', 'msie8', 'AhrefsBot/', '007ac9[\x20]Crawler', '2345Explorer/', '360Spider', 'A[\x20]Simple[\x20]Crawler', 'Abrave', 'acapbot/', 'Accoona\-AI\-Agent/', 'AdnormCrawlerCatchBot/', 'aiHitBot/', 'aipbot/', 'Apache\-HttpClient/', 'Apexoo[\x20]Spider', 'Applebot/', 'archive\.org_bot', 'Babya[\x20]Discoverer', 'BDCbot/', 'BinGet/', 'bl\.uk_lddc_bot/', 'BLEXBot/', 'boitho\.com\-dc/', 'BusinessBot:', 'CatchBot/', 'CB/Nutch', 'CCBot/', 'Cliqzbot/', 'CMS[\x20]Crawler', 'ConveraCrawler/', 'cosmos/', 'crawl/Nutch', 'crawler4j', 'CRAZYWEBCRAWLER', 'CSE[\x20]HTML[\x20]Validator', 'C\-T[\x20]bot', 'Curl/PHP', 'Dalvik/', 'DataCrawler/', 'Deepnet[\x20]Explorer', 'DeuSu/', 'Digincore', 'Discordbot/', 'DoCoMo/', 'Domain[\x20]Re\-Animator[\x20]Bot', 'DomainCrawler/', 'DomainMacroCrawler/', 'DomainSONOCrawler/', 'DomainStatsBot/', 'DotBot/', 'DuckDuckGo\-Favicons\-Bot/', 'ELinks/', 'ELinks[\x20]\(', 'EmailMarketingRobot/', 'EmeraldShield\.com[\x20]WebBot', 'envolk\[ITS\]spider/', 'EsperanzaBot', 'Exabot/', 'facebookexternalhit/', 'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de', 'FAST\-WebCrawler/', 'Feosey[\x20]Mohk[\x20]Crawler', 'findlinks/', 'Findxbot/', 
'FirePHP/', 'FlippyBearBot/', 'FreeWebMonitoring[\x20]SiteChecker/', 'fujilabol', 'FurlBot/', 'Gaisbot/', 'Gallent[\x20]Spider', 'GarlikCrawler/', 'GetintentCrawler[\x20]getintent\.com', 'Gigabot/', 'gipo\-crawler/Nutch', 'Girafabot', 'Gluten[\x20]Free[\x20]Crawler/', 'gocrawl', 'GrapeshotCrawler/', 'GSiteCrawler/', 'GurujiBot/', 'HaosouSpider', 'holmes/', 'HTTP_Request2/', 'HubSpot[\x20]Webcrawler', 'HyperCrawl/', 'ICC\-Crawler/', 'iconoclast', 'IDGCrawler/Nutch', 'idmarch[\x20]Automatic\.beta/', 'Incutio[\x20]XML', 'InfluenceBot', 'IRLbot/', 'IssueCrawler', 'istellabot/', 'James[\x20]BOT', 'Jigsaw/', 'JobFeed', 'KomodiaBot/', 'Konqueror/', 'linkapediabot', 'metager\-linkchecker', 'linkchecker', 'LinkCheck', 'linkdexbot/', 'LinkedInBot/', 'LinkpadBot/', 'Links[\x20]\(', 'LinksManager\.com_bot', 'LWP::Simple/', 'Mail\.RU_Bot/', 'meanpathbot/', 'Mediatoolkitbot', 'MegaIndex\.ru/', 'merzscope', 'mfibot/', 'microsoft.*discovery', 'missigua_locator', 'MixrankBot', 'MJ12bot/', 'MojeekBot/', 'Mojolicious', 'MXT/Nutch', 'My[\x20]Nutch[\x20]Spider/', 'myse/Nutch', 'NerdyBot', 'netEstate[\x20]NE[\x20]Crawler', 'NetResearchServer/', 'NRLCorpusBuilder/Nutch', 'nutch\-1\.4/', 'nutch\-1\.8/', 'NutchCVS/', 'o\.uk[\x20]robot', 'oBot/', 'ocrawler;', 'ODP[\x20]link[\x20]checker', 'Offline[\x20]Explorer/', 'OmniExplorer_Bot/', 'OrangeBot/', 'PageBitesHyperBot/', 'pdffillerbot/', 'PhantomJS', 'PHP/5\.2\.8', 'Ploetz[\x20]\+[\x20]Zeller', 'Plukkie/', 'Princetonbot/', 'PrivacyAwareBot/', 'proximic', 'psbot/', 'psbot\-image', 'python_wk_crawler', 'Python\-urllib/', 'QCrawl', 'ResearchBot', 'roboto', 'rogerbot/', 'RSSingBot', 'RukiCrawler/', 'SafeDNS[\x20]search[\x20]bot/', 'SafeDNSBot', 'SafeSearch[\x20]microdata[\x20]crawler', 'safesearch', 'SBL\-BOT', 'Screaming[\x20]Frog[\x20]SEO[\x20]Spider/', 'ScreenerBot[\x20]Crawler[\x20]Beta', 'Searchie/', 'Seekmo', 'semanticbot', 'SemrushBot/', 'seo\-audit\-check\-bot/', 'Seobility', 'SEOkicks\-Robot', 'SEOlyticsCrawler/', 'SEOstats', 
'Seosys/Nutch', 'Seoterritory\.com[\x20]bot', 'Shim\-Crawler', 'SiteExplorer/', 'siteexplorer\.info', 'Slackbot\-LinkExpanding', 'SmabblerBot/', 'Sogou[\x20]web[\x20]spider/', 'special_archiver/', 'Spiderbot/', 'ssearch_bot', 'SurdotlyBot/', 'SurveyBot/', 'taiil/Nutch', 'tbot\-nutch', 'TeeRaidBot', 'TelegramBot', 'Test/Nutch', 'Test[\x20]Spider', 'TestCrawler', 'TurnitinBot/', 'TurnitinBot', 'TweetmemeBot/', 'UCY/Nutch', 'Uptimebot/', 'URL[\x20]Checker', 'UXCrawlerBot', 'Validator\.nu/', 'vBSEO', 'vBulletin[\x20]via[\x20]PHP', 'vebidoobot', 'viz/Nutch', 'VoilaBot', 'VORTEX/', 'voyager/', 'W3C_Validator/', 'W3C\-checklink/', 'WBSearchBot/', 'WeSEE:Ads/PageBot', 'WeSEE:Ads/PictureBot', 'WeSEE_Bot', 'Wget/', 'Who\.is[\x20]Bot', 'wonderbot/', 'woobot/', 'Wotbox/', 'Xaldon[\x20]WebSpider', 'Xenu[\x20]Link[\x20]Sleuth', 'xenu_link_sleuth', 'XML[\x20]Sitemaps[\x20]Generator', 'XoviBot/', 'yacybot', 'Yahoo[\x20]Link[\x20]Preview', 'YisouSpider', 'yoozBot', 'zspider/', 'ZumBot/', # below placed at end to catch some generics 'nbot', 'ng/1\.', 'ng/2\.', 'libwww\-perl', 'CFNetwork', 'urllib', 'javabee', 'projectwf\-java\-test\-crawler', 'java', 'loocalcrawler/nutch', 'nutchosu\-vlib', 'nutch', 'perlcrawler', 'perl', # old robots using firefox < version 11 not identifying themselves as a robot. 
'(firefox/)([0-9]\.|[0-1][0]\.)' ); @RobotsSearchIDOrder_list2 = ( # Less common robots (In robot file) '^Mozilla$', '^mozilla\/3\.0\s\(compatible$', '^mozilla\/4\.0$', '^mozilla\/4\.0\s\(compatible;\)$', '^mozilla\/5\.0$', '^mozilla\/5\.0\s\(compatible;$', '^mozilla\/5\.0\s\(en\-us\)$', '^mozilla\/5\.0\sfirefox\/3\.0\.5$', '^Mozilla/6\.0[\x20]\(compatible\)$', '^Mozilla/(.*)Beta[\x20]\(Windows\)', 'MSIE[\x20]2', 'MSIE[\x20]3', 'MSIE[\x20]4', 'MSIE[\x20]5', 'MSIE[\x20]6', 'Windows[\x20]95', 'Windows[\x20]98', # these could be removed to speed up processing as they are rarely seen 'a6\-indexer', 'abcdatos', 'abonti\.com', 'acme\.spider', 'activebookmark', 'adamm_bot', 'advbot', 'affectv\.co\.uk', 'ahoythehomepagefinder', 'aleadsoftbot', 'alkaline', 'allrati', 'alltop', 'almaden', 'alpha_search_agent', 'anthill', 'antibot', 'aport', 'appie', 'applesyndication', 'arachnophilia', 'arale', 'araneo', 'architext', 'archive\-de\.com', 'aretha', 'argus', 'ariadne', 'arianna\.libero\.it', 'arks', 'aspider', 'aspseek', 'asterias', 'asynchttpclient', 'atn\.txt', 'atomz', 'auresys', 'awbot', 'backlinktest\.com', 'backrub', 'bbot', 'becomebot', 'bender', 'betabot', 'bigbrother', 'biglotron', 'BingLocalSearch', 'bittorrent_bot', 'biz360[_+\s]spider', 'bjaaland', 'blackwidow', 'blindekuh', 'blogbridge[_+\s]service', 'blogged_crawl', 'bloglines', 'bloglovin', 'blogpulse', 'blogsearch', 'blogshares', 'blogslive', 'blogssay', 'bloodhound', 'bncf\.firenze\.sbn\.it/raccolta\.txt', 'bobby', 'bookmark\-manager', 'borg\-bot', 'boris', 'brightnet', 'bruinbot', 'bspider', 'bubing', 'bumblebee', 'butterfly', 'buzztracker', 'cactvschemistryspider', 'calif[^r]', 'candlelight[_+\s]favorites[_+\s]inspector', 'careerbot', 'carpathia', 'cassandra', 'catbot', 'cbn00glebot', 'cerberian_drtrs', 'cfetch', 'cgireader', 'chattertrap', 'check_http', 'checkbot', 'checkweb_link_validator', 'christcrawler', 'churl', 'cienciaficcion', 'cipinetbot', 'imagecoccoc', 'coccoc', 'coldfusion', 'collective', 
'combine', 'commons\-httpclient', 'computer_and_automation_research_institute_crawler', 'conceptbot', 'contentmatch', 'converamultimediacrawler', 'coolbot', 'copubbot', 'core', 'covario', 'cruiser', 'cscrawler', 'cuasarbot', 'cursor', 'cusco', 'custo', 'cyberspyder', 'datafountains/dmoz_downloader', 'dataprovider\.com', 'daumoa', 'daviesbot', 'daylifefeedfetcher', 'daypopbot', 'deepindex', 'desertrealm', 'deweb', 'dienstspider', 'digger', 'digout4u', 'diibot', 'dipsie\.bot', 'direct_hit', 'discobot', 'dlvr\.it', 'dnabot', 'dnsgroup', 'doccheckbot', 'domainappender', 'domainchecker', 'domainsdb\.net', 'download_express', 'dragonbot', 'dreamwidth', 'drupal', 'dulance', 'dumbot', 'dumm\.de\-bot', 'dwcp', 'e\-collector', 'earthcom\.info', 'easydl', 'ebiness', 'eccp', 'echo!', 'edgeio\-retriever', 'elfinbot', 'emacs', 'emcspider', 'enteprise', 'ernst[:blank:]2\.0', 'esther', 'ets_v', 'eventax', 'everbeecrawler', 'everest\-vulcan', 'evliyacelebi', 'exactseek', 'extreme[_+\s]picture[_+\s]finder', 'ezoom', 'ezresult', 'facebook', 'facebot', 'fast\-search\-engine', 'matrix_s\.p\.a\._\-_fast_enterprise_crawler', 'fast_enterprise_crawler', 'fastbot', 'fastcrawler', 'favicon', 'favorg', 'favorites_sweeper', 'fdse', 'feedburner', 'feedcrawl', 'feedflow', 'feedmyinbox', 'feedroll\.com', 'feedsky', 'feedster', 'feedvalidator', 'feedzira', 'felix', 'ferret', 'fetchbot', 'fetchrover', 'fever/', 'fido', 'filmkamerabot', 'filterdb\.iss\.net', 'finderlein[_+\s]research[_+\s]crawler', 'findexa_crawler', 'finnish', 'fireball', 'firmilybot', 'flexum', 'foaf\-search\.net', 'fooky\.com/ScorpionBot', 'fouineur', 'francoroute', 'freecrawl', 'freenews', 'funnelweb', 'g2crawler', 'gama', 'gazz', 'gcreep', 'geniebot', 'genieo', 'geohasher', 'getbot', 'geturl', 'gigablastopensource', 'global_fetch', 'gnodspider', 'goforit\.com', 'goforitbot', 'golem', 'gonzo', 'gougou', 'gpu_p2p_crawler', 'grabber', 'grapeshot', 'grapnel', 'griffon', 'gromit', 'grub', 'gulliver', 'gulperbot', 'hambot', 'hanrss', 
'harvest', 'havindex', 'henrythemiragorobot', 'heritrix', 'hl_ftien_spider', 'hometown', 'hoowwwer', 'hpprint', 'htdig', 'html[_+\s]link[_+\s]validator', 'htmlgobble', 'htmlparser', 'httrack', 'hundesuche\.com\-bot', 'hyperdecontextualizer', 'ia_archiver\-web\.archive\.org', 'ia_archiver', 'iajabot', 'iaskspider', 'i\-bot', 'icarus6j', 'ichiro', 'icjobs\.de', 'ilse', 'iltrovatore\-setaccio', 'imagelock', 'implisensebot', 'inagist', 'incywincy', 'infobot', 'infociousbot', 'infohelfer', 'infomine', 'informant', 'infoseeksidewinder', 'infoseek', 'infospider', 'inspectorwww', 'insurancobot', 'integromedb\.org', 'intelliagent', 'internet[_+\s]ninja', 'internetarchive', 'internetseer', 'internetsupervision', 'ips\-agent', 'irobot', 'iron33', 'isearch2006', 'israelisearch', 'iupui_research_bot', 'izsearch', 'jacobin[\x20]club', 'jakarta', 'jbot', 'jcrawler', 'jeeves', 'jennybot', 'jobboerse', 'jobot', 'jobo', 'joebot', 'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility', 'js\-kit', 'jubii', 'jumpstation', 'justview', 'kalambot', 'kamano\.de_newsfeedverzeichnis', 'kapsi', 'katipo', 'kazoombot', 'kevin', 'keyoshid', 'kilroy', 'kinja\-imagebot', 'kinjabot', 'knowitall', 'knowledge\.com', 'ko[_+\s]yappo[_+\s]robot', 'kouaa_krawler', 'krugle', 'ksibot', 'kummhttp', 'kurzor', 'labelgrabber\.txt', 'lanshanbot', 'larbin', 'largesmall[\x20]crawler', 'legs', 'letscrawl\.com', 'libcrawl', 'lilina', 'link_valet_online', 'linkbot', 'linkdex\.com', 'linkidator', 'linkscan', 'linkstats[\x20]bot', 'linkwalker', 'lipperhey', 'livejournal\.com', 'lmspider', 'loadtimebot', 'lockon', 'logo_gif', 'longurl', 'lssrocketcrawler', 'ltbot', 'ltx71', 'lwp\-request', 'lwp\-trivial', 'lycos[_+\s]', 'macworm', 'madaali\.de', 'magpierss', 'magpie', 'mapoftheinternet\.com', 'marvin', 'mattie', 'mediabot', 'mediafox', 'megaindex', 'megite', 'memorybot', 'mercator', 'meshexplorer', 'metager2\-verification\-bot', 'metajobbot', 'metaspinner', 'metauri', 'miadev', 
'microsoft[_+\s]url[_+\s]control', 'microsoft[\x20]bits', 'microsoft\-webdav\-miniredir', 'mindcrawler', 'mindupbot', 'mini\-reptile', 'minirank', 'misterbot', 'miva', 'mizzu_labs', 'mnogosearch', 'moget', 'momspider', 'monster', 'motor', 'movabletype', 'ms[_+\s]search[_+\s]6\.0[_+\s]robot', 'ms_search_4\.0_robot', 'msnbot\-udiscovery', 'msrabot', 'msrbot', 'mt::telegraph::agent', 'muncher', 'muscatferret', 'mwdsearch', 'mydoyouhike', 'myweb', 'nagios', 'nasa_search', 'ndspider', 'nederland\.zoek', 'netcarta', 'netcraft', 'netluchs', 'netmechanic', 'netnewswire', 'netscoop', 'netsprint', 'netvibes', 'newrelicpinger', 'newscan\-online', 'newsfox', 'newsgatoronline', 'nextgensearchbot', 'nhse', 'nicebot', 'nimblecrawler', 'ning', 'nomad', 'northstar', 'noxtrumbot', 'npbot', 'nzexplorer', 'objectssearch', 'occam', 'ocelli', 'octopus', 'octora_beta_bot', 'onet\.pl[_+\s]sa', 'onfolio', 'openfind', 'opentaggerbot', 'openwebspider', 'optimizer', 'oracle_ultra_search', 'orb_search', 'orbiter', 'packrat', 'pageboy', 'panscient', 'parasite', 'passwordmaker\.org', 'patric', 'pear_http_request_class', 'peerbot', 'pegasus', 'perignator', 'perman', 'petersnews', 'phantom', 'php[_+\s]version[_+\s]tracker', 'phpcrawl', 'phpdig', 'picmole', 'pictureofinternet', 'piltdownman', 'pimptrain', 'ping\.blo\.gs', 'pingdom', 'pioneer', 'pita', 'pitkow', 'pjspider', 'plinki', 'pluckfeedcrawler', 'plumtreewebaccessor', 'pogodak', 'pompos', 'popdexter', 'poppi', 'port_huron_labs', 'portalb', 'postfavorites', 'postpost', 'postrank', 'powermarks', 'printfulbot', 'proodlebot', 'protopage', 'publiclibraryarchive', 'pyquery', 'python', 'qihoobot', 'quipply', 'qwantify', 'r6\_', 'rambler', 'ratingburner', 'raven', 'rbse', 'redalert', 'regator', 'relevantnoise\.com', 'resumerobot', 'rhcs', 'riddler', 'road_runner', 'robbie', 'robi', 'robocrawl', 'robofox', 'robozilla', 'rojo', 'rome[\x20]client', 'roverbot', 'rpt\-httpclient', 'rssgraffiti', 'rssimagesbot', 'ruffle', 'rufusbot', 'rules', 
'safeads\.xyz', 'safetynetrobot', 'sage\+\+', 'sandcrawler', 'savetheworldheritage', 'sbider', 'schizozilla', 'scooter', 'scoutjet', 'scumbot', 'search\-info', 'search_au', 'searchguild[_+\s]dmoz[_+\s]experiment', 'searchmetricsbot', 'searchprocess', 'seekbot', 'semalt', 'senrigan', 'sensis_web_crawler', 'seodiver', 'seokicks\.de', 'seoscanners', 'sgscout', 'shaggy', 'shaihulud', 'shareaholicbot', 'shoutcast', 'sift', 'simbot', 'simplepie', 'sistrix', 'site\-valet', 'sitebot', 'sitedomain\-bot', 'sitetech', 'skimbot', 'skymob', 'slcrawler', 'slurp', 'slysearch', 'smartspider', 'smtbot', 'snap\.com_beta_crawler', 'snappy', 'snooper', 'sohu\-search', 'sohu', 'solbot', 'speedy', 'sphere_scout', 'spider[_+\s]monkey', 'spiderline', 'spiderlytics', 'spiderman', 'spiderview', 'spip', 'sproose_crawler', 'spry', 'sqworm', 'ssearcher', 'steeler', 'steroid__download', 'stq_bot', 'Stratagems[\x20]Kumo', 'suchfin\-bot', 'suke', 'summify\.com', 'sunrise', 'suntek', 'superbot', 'superfeedr', 'susie', 'sven', 'syndic8', 'syndicapi', 'synoobot', 'synthesio', 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e', 'tach_bw', 'tagyu_agent', 'tailrank', 'tarantula', 'tarspider', 'tcl_http_client_package', 'techbot', 'technoratibot', 'templeton', 'teoma', 'teragramcrawlersurf', 'test_crawler', 'testbot', 'thumbsniper', 'titan', 'titin', 'tkwww', 'tlspider', 'topblogsinfo', 'topicblogs', 'topix\.net', 'trapit', 'trileet', 'turtlescanner', 'turtle', 'tutorgigbot', 'tweetedtimes', 'twiceler', 'twisted[\x20]pagegetter', 'twitterbot', 'twitterfeed', 'ubicrawler', 'ucsd', 'udmsearch', 'ultraseek', 'unchaos_bot_hybrid_web_search_engine', 'unido\-bot', 'unisterbot', 'universalfeedparser', 'unlost_web_crawler', 'unwindfetchor', 'updated', 'urlck', 'ustc\-semantic\-group', 'vagabondo\-wap', 'vagabondo', 'valkyrie', 'vermut', 'versus_crawler_from_eda\.baykan@epfl\.ch', 'verticrawl', 'vespa_crawler', 'victoria', 'virus[_+\s]detector', 'visionsearch', 'voidbot', 'voltron', 'vse/', 'vwbot', 
'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa', 'w3index', 'w3m2', 'wallpaper', 'wanderer', 'wapspider', 'wapspIRLider', 'watchmouse', 'wavefire', 'waybackarchive\.org', 'wazzup', 'web_downloader', 'webbandit', 'webbase', 'webcatcher', 'webclipping\.com', 'webcollage', 'webcompass', 'webcopy', 'webcrawl\.net', 'webdup', 'webfetcher', 'webfilter', 'webfoot', 'webinator', 'webindexer', 'weblayers', 'weblinker', 'webminer', 'webmirror', 'webmoose', 'webquest', 'webreader', 'webreaper', 'website[_+\s]monitoring[_+\s]bot', 'websnarf', 'webspider', 'webvac', 'webvulncrawl', 'webwalker', 'webwalk', 'webwatch', 'wells_search', 'wer\-liefert\-was', 'wesee:search', 'wevikabot', 'whatuseek', 'whowhere', 'windows\-rss\-platform', 'wired\-digital', 'zyborg', 'wisenutbot', 'wiumi', 'wmir', 'wolp', 'wombat', 'wonderer', 'woozweb', 'wordpress', 'worm', 'wume_crawler', 'wwwc', 'wwweasel', 'wz101', 'xget', 'xirq', 'xydo', 'y!j', 'yahoo![\x20]searchmonkey', 'yahoo!_mindset', 'yahoo\-blogs', 'yahoo\-mmcrawler', 'yahoo\-newscrawler', 'yahoo[\x20]pipes', 'yahoo\-verticalcrawler', 'yahoocachesystem', 'yahooexternalcache', 'yahoofeedseeker', 'yahooseeker\-testing', 'yahooseeker', 'yahooysmcm', 'yammer', 'yanga', 'yet\-another\-spider', 'yeti', 'yie8', 'yodaobot', 'yooglifetchagent', 'youdao', 'yourls', 'z\-add_link_checker', 'zealbot', 'zemanta', 'zend_http_client', 'zeus', 'zhuaxia', '[^a]fish', '[\x20]netseer[\x20]', '^[1-3]$', '^finbot', '^motorola$', '^msie', '^voyager/', '^webindex$', '1\-more_scanner'
);

# Generic robot patterns.
# Broad substrings that flag a user agent as "some robot" without naming a
# specific product. Patterns are applied in the order listed and the first
# match wins, so keep the word-like entries ahead of the bare "bot" rules.
# Every entry needs a key of exactly the same spelling in %RobotsHashIDLib.
@RobotsSearchIDOrder_listgen = (
    # Generic robot
    'robot',
    'blog',
    'checker',
    'crawl',
    'discover',
    'feed',
    'fetcher',
    'hunter',
    'link',
    'scanner',
    'seek',
    'sitemap',
    'spider',
    'sucker',
    'validator',
    # "bot" directly followed / preceded by whitespace or one of _ + : , . ; / \ -
    'bot[\s_+:,\.\;\/\\\-]',
    '[\s_+:,\.\;\/\\\-]bot',
    # Script and library clients
    'curl',
    'php',
    'ruby/',
    # Labelled "empty user agent string" in %RobotsHashIDLib
    'no_user_agent'
);

# RobotsHashIDLib
# List of robots names ('robot id','robot clear text')
#-------------------------------------------------------
%RobotsHashIDLib = (
# Common robots (In robot file)
'bingbot/','bingbot', 'bingpreview','BingPreview', 'MSIECrawler','MSIECrawler', 'msnbot/','msnbot', 'msnbot\-media/','msnbot-media', 'AdIdxBot/','AdIdxBot Microsoft Ad Quality control', 'NOT[\x20]Googlebot/','NOT Googlebot', 'Googlebot/','Googlebot', 'Google[\x20]Web[\x20]Preview','Google Web Preview', 'Googlebot\-Image/','Googlebot-Image', 'Googlebot\-Mobile/','Googlebot-Mobile', 'google\-sitemaps','google-sitemaps', 'Googlebot\-News','Googlebot-News', 'Googlebot\-Video/','Googlebot-Video', 'AdsBot\-Google[\x20]\(','AdsBot-Google', 'AdsBot\-Google\-Mobile\-Apps','AdsBot-Google-Mobile-Apps', 'Mediapartners\-Google','Mediapartners-Google', 'Feedfetcher\-Google','Feedfetcher-Google', 'Google\-Adwords\-Instant','Google-Adwords-Instant', 'Firefox/1\.5','Nautic Expo using Firefox/1.5', 'Yahoo![\x20]Slurp[\x20]China','Yahoo! Slurp China', 'Yahoo![\x20]Slurp','Yahoo! Slurp', 'Baiduspider/','Baiduspider', 'Baiduspider\-image','Baiduspider-image', 'baidu','Baidu ( catchall )', 'YandexBot/','YandexBot', 'YandexImages/','YandexImages', 'YandexMetrika/','YandexMetrika', 'YandexMobileBot/','YandexMobileBot', 'yandex','Yandex ( catchall )', 'electricmonk/','electricmonk', 'spbot/','spbot', 'SeznamBot/','SeznamBot', 'msie8','msie8 - ( Rogue Robot )', 'AhrefsBot/','AhrefsBot', '007ac9[\x20]Crawler','007ac9 Crawler', '2345Explorer/','2345Explorer', '360Spider','360Spider', 'A[\x20]Simple[\x20]Crawler','A Simple Crawler', 'Abrave','Abrave', 'acapbot/','acapbot', 'Accoona\-AI\-Agent/','Accoona-AI-Agent', 'AdnormCrawlerCatchBot/','AdnormCrawlerCatchBot', 'aiHitBot/','aiHitBot', 'aipbot/','aipbot', 'Apache\-HttpClient/','Apache-HttpClient', 'Apexoo[\x20]Spider','Apexoo Spider', 'Applebot/','Applebot', 'archive\.org_bot','archive.org_bot', 'Babya[\x20]Discoverer','Babya Discoverer', 'BDCbot/','BDCbot', 'BinGet/','BinGet', 'bl\.uk_lddc_bot/','bl.uk_lddc_bot', 'BLEXBot/','BLEXBot', 'boitho\.com\-dc/','boitho.com-dc', 'BusinessBot:','BusinessBot:', 'CatchBot/','CatchBot', 
'CB/Nutch','CB/Nutch', 'CCBot/','CCBot', 'Cliqzbot/','Cliqzbot', 'CMS[\x20]Crawler','CMS Crawler', 'ConveraCrawler/','ConveraCrawler', 'cosmos/','cosmos', 'crawl/Nutch','crawl/Nutch', 'crawler4j','crawler4j', 'CRAZYWEBCRAWLER','CRAZYWEBCRAWLER', 'CSE[\x20]HTML[\x20]Validator','CSE HTML Validator', 'C\-T[\x20]bot','C-T bot', 'Curl/PHP','Curl/PHP', 'Dalvik/','Dalvik', 'DataCrawler/','DataCrawler', 'Deepnet[\x20]Explorer','Deepnet Explorer', 'DeuSu/','DeuSu', 'Digincore','Digincore', 'Discordbot/','Discordbot', 'DoCoMo/','DoCoMo', 'Domain[\x20]Re\-Animator[\x20]Bot','Domain Re-Animator Bot', 'DomainCrawler/','DomainCrawler', 'DomainMacroCrawler/','DomainMacroCrawler', 'DomainSONOCrawler/','DomainSONOCrawler', 'DomainStatsBot/','DomainStatsBot', 'DotBot/','DotBot', 'DuckDuckGo\-Favicons\-Bot/','DuckDuckGo-Favicons-Bot', 'ELinks/','ELinks', 'ELinks[\x20]\(','ELinks (', 'EmailMarketingRobot/','EmailMarketingRobot', 'EmeraldShield\.com[\x20]WebBot','EmeraldShield.com WebBot', 'envolk\[ITS\]spider/','envolk ITS spider', 'EsperanzaBot','EsperanzaBot', 'Exabot/','Exabot', 'facebookexternalhit/','facebookexternalhit', 'fast_enterprise_crawler.*scrawleradmin\.t\-info@telekom\.de','FAST Enterprise crawleradmin.t-info@telekom.de', 'fast_enterprise_crawler.*t\-info_bi_cluster_crawleradmin\.t\-info@telekom\.de','FAST Enterprise T-Info_BI_cluster crawleradmin.t-info@telekom.de', 'FAST\-WebCrawler/','FAST-WebCrawler', 'Feosey[\x20]Mohk[\x20]Crawler','Feosey Mohk Crawler', 'findlinks/','findlinks', 'Findxbot/','Findxbot', 'FirePHP/','FirePHP', 'FlippyBearBot/','FlippyBearBot', 'FreeWebMonitoring[\x20]SiteChecker/','FreeWebMonitoring SiteChecker', 'fujilabol','fujilabol', 'FurlBot/','FurlBot', 'Gaisbot/','Gaisbot', 'Gallent[\x20]Spider','Gallent Spider', 'GarlikCrawler/','GarlikCrawler', 'GetintentCrawler[\x20]getintent\.com','GetintentCrawler getintent.com', 'Gigabot/','Gigabot', 'gipo\-crawler/Nutch','gipo-crawler/Nutch', 'Girafabot','Girafabot', 
'Gluten[\x20]Free[\x20]Crawler/','Gluten Free Crawler', 'gocrawl','gocrawl', 'GrapeshotCrawler/','GrapeshotCrawler', 'GSiteCrawler/','GSiteCrawler', 'GurujiBot/','GurujiBot', 'HaosouSpider','HaosouSpider', 'holmes/','holmes', 'HTTP_Request2/','HTTP_Request2', 'HubSpot[\x20]Webcrawler','HubSpot Webcrawler', 'HyperCrawl/','HyperCrawl', 'ICC\-Crawler/','ICC-Crawler', 'iconoclast','iconoclast', 'IDGCrawler/Nutch','IDGCrawler/Nutch', 'idmarch[\x20]Automatic\.beta/','idmarch Automatic.beta', 'Incutio[\x20]XML','Incutio XML', 'InfluenceBot','InfluenceBot', 'IRLbot/','IRLbot', 'IssueCrawler','IssueCrawler', 'istellabot/','istellabot', 'James[\x20]BOT','James BOT', 'Jigsaw/','Jigsaw', 'JobFeed','JobFeed', 'KomodiaBot/','KomodiaBot', 'Konqueror/','Konqueror', 'linkapediabot','linkapediabot', 'metager\-linkchecker','metager-linkchecker', 'linkchecker','linkchecker', 'LinkCheck','LinkCheck', 'linkdexbot/','linkdexbot', 'LinkedInBot/','LinkedInBot', 'LinkpadBot/','LinkpadBot', 'Links[\x20]\(','Links (', 'LinksManager\.com_bot','LinksManager.com_bot', 'LWP::Simple/','LWP::Simple', 'Mail\.RU_Bot/','Mail.RU Bot', 'meanpathbot/','meanpathbot', 'Mediatoolkitbot','Mediatoolkitbot', 'MegaIndex\.ru/','MegaIndex.ru', 'merzscope','merzscope', 'mfibot/','mfibot', 'microsoft.*discovery','Microsoft Office Protocol Discovery', 'missigua_locator','missigua_locator', 'MixrankBot','MixrankBot', 'MJ12bot/','MJ12bot', 'MojeekBot/','MojeekBot', 'Mojolicious','Mojolicious', 'MXT/Nutch','MXT/Nutch', 'My[\x20]Nutch[\x20]Spider/','My Nutch Spider', 'myse/Nutch','myse/Nutch', 'NerdyBot','NerdyBot', 'netEstate[\x20]NE[\x20]Crawler','netEstate NE Crawler', 'NetResearchServer/','NetResearchServer', 'NRLCorpusBuilder/Nutch','NRLCorpusBuilder/Nutch', 'nutch\-1\.4/','nutch-1.4', 'nutch\-1\.8/','nutch-1.8', 'NutchCVS/','NutchCVS',
# The label spells out what the key matches: "\." is the dot, "[\x20]" the space.
'o\.uk[\x20]robot','o.uk robot',
'oBot/','oBot', 'ocrawler;','ocrawler;', 'ODP[\x20]link[\x20]checker','ODP link checker', 'Offline[\x20]Explorer/','Offline Explorer',
'OmniExplorer_Bot/','OmniExplorer_Bot', 'OrangeBot/','OrangeBot', 'PageBitesHyperBot/','PageBitesHyperBot', 'pdffillerbot/','pdffillerbot', 'PhantomJS','PhantomJS', 'PHP/5\.2\.8','PHP/5.2.8', 'Ploetz[\x20]\+[\x20]Zeller','Ploetz + Zeller', 'Plukkie/','Plukkie', 'Princetonbot/','Princetonbot', 'PrivacyAwareBot/','PrivacyAwareBot', 'proximic','proximic', 'psbot/','psbot', 'psbot\-image','psbot-image', 'python_wk_crawler','python_wk_crawler', 'Python\-urllib/','Python-urllib', 'QCrawl','QCrawl', 'ResearchBot','ResearchBot', 'roboto','roboto', 'rogerbot/','rogerbot', 'RSSingBot','RSSingBot', 'RukiCrawler/','RukiCrawler', 'SafeDNS[\x20]search[\x20]bot/','SafeDNS search bot', 'SafeDNSBot','SafeDNSBot', 'SafeSearch[\x20]microdata[\x20]crawler','SafeSearch microdata crawler', 'safesearch','safesearch ( catchall )', 'SBL\-BOT','SBL-BOT', 'Screaming[\x20]Frog[\x20]SEO[\x20]Spider/','Screaming Frog SEO Spider', 'ScreenerBot[\x20]Crawler[\x20]Beta','ScreenerBot Crawler Beta', 'Searchie/','Searchie', 'Seekmo','Seekmo', 'semanticbot','semanticbot', 'SemrushBot/','SemrushBot', 'seo\-audit\-check\-bot/','seo-audit-check-bot', 'Seobility','Seobility', 'SEOkicks\-Robot','SEOkicks-Robot', 'SEOlyticsCrawler/','SEOlyticsCrawler', 'SEOstats','SEOstats', 'Seosys/Nutch','Seosys/Nutch',
# The label spells out what the key matches: "[\x20]" is a space, not a dot.
'Seoterritory\.com[\x20]bot','Seoterritory.com bot',
'Shim\-Crawler','Shim-Crawler', 'SiteExplorer/','SiteExplorer', 'siteexplorer\.info','siteexplorer.info', 'Slackbot\-LinkExpanding','Slackbot-LinkExpanding', 'SmabblerBot/','SmabblerBot', 'Sogou[\x20]web[\x20]spider/','Sogou web spider', 'special_archiver/','special_archiver', 'Spiderbot/','Spiderbot', 'ssearch_bot','ssearch_bot', 'SurdotlyBot/','SurdotlyBot', 'SurveyBot/','SurveyBot', 'taiil/Nutch','taiil/Nutch', 'tbot\-nutch','tbot-nutch', 'TeeRaidBot','TeeRaidBot', 'TelegramBot','TelegramBot', 'Test/Nutch','Test/Nutch', 'Test[\x20]Spider','Test Spider', 'TestCrawler','TestCrawler', 'TurnitinBot/','TurnitinBot', 'TurnitinBot','TurnitinBot',
'TweetmemeBot/','TweetmemeBot', 'UCY/Nutch','UCY/Nutch', 'Uptimebot/','Uptimebot', 'URL[\x20]Checker','URL Checker', 'UXCrawlerBot','UXCrawlerBot', 'Validator\.nu/','Validator.nu', 'vBSEO','vBSEO', 'vBulletin[\x20]via[\x20]PHP','vBulletin via PHP', 'vebidoobot','vebidoobot', 'viz/Nutch','viz/Nutch', 'VoilaBot','VoilaBot', 'VORTEX/','VORTEX', 'voyager/','voyager', 'W3C_Validator/','W3C_Validator', 'W3C\-checklink/','W3C-checklink', 'WBSearchBot/','WBSearchBot', 'WeSEE:Ads/PageBot','WeSEE:Ads/PageBot', 'WeSEE:Ads/PictureBot','WeSEE:Ads/PictureBot', 'WeSEE_Bot','WeSEE_Bot', 'Wget/','Wget',
# The label spells out what the key matches: "[\x20]" is a space, not a dot.
'Who\.is[\x20]Bot','Who.is Bot',
'wonderbot/','wonderbot', 'woobot/','woobot', 'Wotbox/','Wotbox', 'Xaldon[\x20]WebSpider','Xaldon WebSpider', 'Xenu[\x20]Link[\x20]Sleuth','Xenu Link Sleuth', 'xenu_link_sleuth','xenu_link_sleuth', 'XML[\x20]Sitemaps[\x20]Generator','XML Sitemaps Generator', 'XoviBot/','XoviBot', 'yacybot','yacybot', 'Yahoo[\x20]Link[\x20]Preview','Yahoo Link Preview', 'YisouSpider','YisouSpider', 'yoozBot','yoozBot', 'zspider/','zspider', 'ZumBot/','ZumBot',
# below placed at end to catch some generics
'nbot','nbot', 'ng/1\.','ng/1.', 'ng/2\.','ng/2.', 'libwww\-perl','libwww-perl', 'CFNetwork','CFNetwork', 'urllib','urllib', 'javabee','javabee', 'projectwf\-java\-test\-crawler','projectwf-java-test-crawler', 'java','Java ( catchall )', 'loocalcrawler/nutch','loocalcrawler/nutch', 'nutchosu\-vlib','nutchosu-vlib', 'nutch','nutch ( catchall )', 'perlcrawler','perlcrawler', 'perl','perl', '(firefox/)([0-9]\.|[0-1][0]\.)','Firefox version 10 and lower - various robots',
# Less common robots (In robot file)
# Truncated or obsolete browser signatures; each label repeats the matched
# text followed by "( Rogue Robot )". Labels are kept on one line so the
# report text contains no embedded line break.
'^Mozilla$','Mozilla ( Rogue Robot )', '^mozilla\/3\.0\s\(compatible$', 'mozilla/3.0 (compatible - ( Rogue Robot )', '^mozilla\/4\.0$', 'mozilla/4.0 - ( Rogue Robot )', '^mozilla\/4\.0\s\(compatible;\)$', 'mozilla/4.0 (compatible;) - ( Rogue Robot )', '^mozilla\/5\.0$', 'mozilla/5.0 - ( Rogue Robot )', '^mozilla\/5\.0\s\(compatible;$', 'mozilla/5.0 (compatible; - ( Rogue Robot )', '^mozilla\/5\.0\s\(en\-us\)$', 'mozilla/5.0 (en-us) - ( Rogue Robot )', '^mozilla\/5\.0\sfirefox\/3\.0\.5$', 'mozilla/5.0 firefox/3.0.5 - ( Rogue Robot )', '^Mozilla/6\.0[\x20]\(compatible\)$','Mozilla/6.0 (compatible) - ( Rogue Robot )', '^Mozilla/(.*)Beta[\x20]\(Windows\)','Mozilla Beta (Windows) - ( Rogue Robot )', 'MSIE[\x20]2','MSIE 2 - ( Rogue Robot )', 'MSIE[\x20]3','MSIE 3 - ( Rogue Robot )', 'MSIE[\x20]4','MSIE 4 - ( Rogue Robot )', 'MSIE[\x20]5','MSIE 5 - ( Rogue Robot )', 'MSIE[\x20]6','MSIE 6 - ( Rogue Robot )', 'Windows[\x20]95','Windows 95 - ( Rogue Robot )',
# The label names the same Windows release the key matches.
'Windows[\x20]98','Windows 98 - ( Rogue Robot )',
# these could be removed to speed up processing as they are rarely seen
'a6\-indexer','a6-indexer', 'abcdatos','abcdatos', 'abonti\.com','abonti.com', 'acme\.spider','acme.spider', 'activebookmark','activebookmark', 'adamm_bot','adamm_bot', 'advbot','advbot', 'affectv\.co\.uk','affectv.co.uk', 'ahoythehomepagefinder','ahoythehomepagefinder', 'aleadsoftbot','aleadsoftbot', 'alkaline','alkaline', 'allrati','allrati', 'alltop','alltop', 'almaden','almaden', 'alpha_search_agent','alpha_search_agent', 'anthill','anthill', 'antibot','antibot', 'aport','aport', 'appie','appie', 'applesyndication','applesyndication', 'arachnophilia','arachnophilia', 'arale','arale', 'araneo','araneo', 'architext','architext', 'archive\-de\.com','archive-de.com', 'aretha','aretha', 'argus','argus', 'ariadne','ariadne', 'arianna\.libero\.it','arianna.libero.it', 'arks','arks', 'aspider','aspider', 'aspseek','aspseek', 'asterias','asterias', 'asynchttpclient','asynchttpclient', 'atn\.txt','atn.txt', 'atomz','atomz', 'auresys','auresys', 'awbot','awbot', 'backlinktest\.com','backlinktest.com', 'backrub','backrub', 'bbot','bbot', 'becomebot','becomebot', 'bender','bender', 'betabot','betabot', 'bigbrother','bigbrother', 'biglotron','biglotron', 'BingLocalSearch','BingLocalSearch', 'bittorrent_bot','bittorrent_bot', 'biz360[_+\s]spider','biz360 spider',
'bjaaland','bjaaland', 'blackwidow','blackwidow', 'blindekuh','blindekuh', 'blogbridge[_+\s]service','blogbridge service', 'blogged_crawl','blogged_crawl', 'bloglines','bloglines', 'bloglovin','bloglovin', 'blogpulse','blogpulse', 'blogsearch','blogsearch', 'blogshares','blogshares', 'blogslive','blogslive', 'blogssay','blogssay', 'bloodhound','bloodhound',
# The label is display text, not a regex: inside single quotes "\." would be
# shown with its backslash, so the dots are written plain.
'bncf\.firenze\.sbn\.it/raccolta\.txt','bncf.firenze.sbn.it/raccolta.txt',
'bobby','bobby', 'bookmark\-manager','bookmark-manager', 'borg\-bot','borg-bot', 'boris','boris', 'brightnet','brightnet', 'bruinbot','bruinbot', 'bspider','bspider', 'bubing','bubing', 'bumblebee','bumblebee', 'butterfly','butterfly', 'buzztracker','buzztracker', 'cactvschemistryspider','cactvschemistryspider', 'calif[^r]','calif[^r]', 'candlelight[_+\s]favorites[_+\s]inspector','candlelight favorites inspector', 'careerbot','careerbot', 'carpathia','carpathia', 'cassandra','cassandra', 'catbot','catbot', 'cbn00glebot','cbn00glebot', 'cerberian_drtrs','cerberian_drtrs', 'cfetch','cfetch', 'cgireader','cgireader', 'chattertrap','chattertrap', 'check_http','check_http', 'checkbot','checkbot', 'checkweb_link_validator','checkweb_link_validator', 'christcrawler','christcrawler', 'churl','churl', 'cienciaficcion','cienciaficcion', 'cipinetbot','cipinetbot', 'imagecoccoc','imagecoccoc', 'coccoc','coccoc', 'coldfusion','coldfusion', 'collective','collective', 'combine','combine', 'commons\-httpclient','commons-httpclient', 'computer_and_automation_research_institute_crawler','computer_and_automation_research_institute_crawler', 'conceptbot','conceptbot', 'contentmatch','contentmatch', 'converamultimediacrawler','converamultimediacrawler', 'coolbot','coolbot', 'copubbot','copubbot', 'core','core', 'covario','covario', 'cruiser','cruiser', 'cscrawler','cscrawler', 'cuasarbot','cuasarbot', 'cursor','cursor', 'cusco','cusco', 'custo','custo', 'cyberspyder','cyberspyder', 'datafountains/dmoz_downloader','datafountains/dmoz_downloader',
'dataprovider\.com','dataprovider.com', 'daumoa','daumoa', 'daviesbot','daviesbot', 'daylifefeedfetcher','daylifefeedfetcher', 'daypopbot','daypopbot', 'deepindex','deepindex', 'desertrealm','desertrealm', 'deweb','deweb', 'dienstspider','dienstspider', 'digger','digger', 'digout4u','digout4u', 'diibot','diibot', 'dipsie\.bot','dipsie.bot', 'direct_hit','direct_hit', 'discobot','discobot', 'dlvr\.it','dlvr.it', 'dnabot','dnabot', 'dnsgroup','dnsgroup', 'doccheckbot','doccheckbot', 'domainappender','domainappender', 'domainchecker','domainchecker', 'domainsdb\.net','domainsdb.net', 'download_express','download_express', 'dragonbot','dragonbot', 'dreamwidth','dreamwidth', 'drupal','drupal', 'dulance','dulance', 'dumbot','dumbot', 'dumm\.de\-bot','dumm.de-bot', 'dwcp','dwcp', 'e\-collector','e-collector', 'earthcom\.info','earthcom.info', 'easydl','easydl', 'ebiness','ebiness', 'eccp','eccp', 'echo!','echo!', 'edgeio\-retriever','edgeio-retriever', 'elfinbot','elfinbot', 'emacs','emacs', 'emcspider','emcspider', 'enteprise','enteprise',
# The label is display text, so it shows the name with a plain space.
# NOTE(review): outside a bracketed class Perl reads "[:blank:]" as the
# ordinary character set : b l a n k, so this key most likely never matches
# "ernst 2.0"; "[[:blank:]]" or "[\x20]" is presumably what was meant. The key
# is left untouched here because it has to stay identical to the matching
# entry in the RobotsSearchIDOrder list; change both together.
'ernst[:blank:]2\.0','ernst 2.0',
'esther','esther', 'ets_v','ets_v', 'eventax','eventax', 'everbeecrawler','everbeecrawler', 'everest\-vulcan','everest-vulcan', 'evliyacelebi','evliyacelebi', 'exactseek','exactseek', 'extreme[_+\s]picture[_+\s]finder','extreme picture finder', 'ezoom','ezoom', 'ezresult','ezresult', 'facebook','facebook', 'facebot','facebot', 'fast\-search\-engine','fast-search-engine', 'matrix_s\.p\.a\._\-_fast_enterprise_crawler','matrix_s.p.a._-_fast_enterprise_crawler', 'fast_enterprise_crawler','fast_enterprise_crawler', 'fastbot','fastbot', 'fastcrawler','fastcrawler', 'favicon','favicon', 'favorg','favorg', 'favorites_sweeper','favorites_sweeper', 'fdse','fdse', 'feedburner','feedburner', 'feedcrawl','feedcrawl', 'feedflow','feedflow', 'feedmyinbox','feedmyinbox', 'feedroll\.com','feedroll.com', 'feedsky','feedsky', 'feedster','feedster', 'feedvalidator','feedvalidator', 'feedzira','feedzira',
'felix','felix', 'ferret','ferret', 'fetchbot','fetchbot', 'fetchrover','fetchrover', 'fever/','fever', 'fido','fido', 'filmkamerabot','filmkamerabot', 'filterdb\.iss\.net','filterdb.iss.net', 'finderlein[_+\s]research[_+\s]crawler','finderlein research crawler', 'findexa_crawler','findexa_crawler', 'finnish','finnish', 'fireball','fireball', 'firmilybot','firmilybot', 'flexum','flexum', 'foaf\-search\.net','foaf-search.net', 'fooky\.com/ScorpionBot','fooky.com/ScorpionBot', 'fouineur','fouineur', 'francoroute','francoroute', 'freecrawl','freecrawl', 'freenews','freenews', 'funnelweb','funnelweb', 'g2crawler','g2crawler', 'gama','gama', 'gazz','gazz', 'gcreep','gcreep', 'geniebot','geniebot', 'genieo','genieo', 'geohasher','geohasher', 'getbot','getbot', 'geturl','geturl', 'gigablastopensource','gigablastopensource', 'global_fetch','global_fetch', 'gnodspider','gnodspider', 'goforit\.com','goforit.com', 'goforitbot','goforitbot', 'golem','golem', 'gonzo','gonzo', 'gougou','gougou', 'gpu_p2p_crawler','gpu_p2p_crawler', 'grabber','grabber', 'grapeshot','grapeshot', 'grapnel','grapnel', 'griffon','griffon', 'gromit','gromit', 'grub','grub', 'gulliver','gulliver', 'gulperbot','gulperbot', 'hambot','hambot', 'hanrss','hanrss', 'harvest','harvest', 'havindex','havindex', 'henrythemiragorobot','henrythemiragorobot', 'heritrix','heritrix', 'hl_ftien_spider','hl_ftien_spider', 'hometown','hometown', 'hoowwwer','hoowwwer', 'hpprint','hpprint', 'htdig','htdig', 'html[_+\s]link[_+\s]validator','html link validator', 'htmlgobble','htmlgobble', 'htmlparser','htmlparser', 'httrack','httrack', 'hundesuche\.com\-bot','hundesuche.com-bot', 'hyperdecontextualizer','hyperdecontextualizer', 'ia_archiver\-web\.archive\.org','ia_archiver-web.archive.org', 'ia_archiver','ia_archiver', 'iajabot','iajabot', 'iaskspider','iaskspider', 'i\-bot','i-bot', 'icarus6j','icarus6j', 'ichiro','ichiro', 'icjobs\.de','icjobs.de', 'ilse','ilse', 'iltrovatore\-setaccio','iltrovatore-setaccio', 
'imagelock','imagelock', 'implisensebot','implisensebot', 'inagist','inagist', 'incywincy','incywincy', 'infobot','infobot', 'infociousbot','infociousbot', 'infohelfer','infohelfer', 'infomine','infomine', 'informant','informant', 'infoseeksidewinder','infoseeksidewinder', 'infoseek','infoseek', 'infospider','infospider', 'inspectorwww','inspectorwww', 'insurancobot','insurancobot', 'integromedb\.org','integromedb.org', 'intelliagent','intelliagent', 'internet[_+\s]ninja','internet ninja', 'internetarchive','internetarchive', 'internetseer','internetseer', 'internetsupervision','internetsupervision', 'ips\-agent','ips-agent', 'irobot','irobot', 'iron33','iron33', 'isearch2006','isearch2006', 'israelisearch','israelisearch', 'iupui_research_bot','iupui_research_bot', 'izsearch','izsearch', 'jacobin[\x20]club','jacobin club', 'jakarta','jakarta', 'jbot','jbot', 'jcrawler','jcrawler', 'jeeves','jeeves', 'jennybot','jennybot', 'jobboerse','jobboerse', 'jobot','jobot', 'jobo','jobo', 'joebot','joebot', 'jrtwine[_+\s]software[_+\s]check[_+\s]favorites[_+\s]utility','jrtwine software check favorites utility', 'js\-kit','js-kit', 'jubii','jubii', 'jumpstation','jumpstation', 'justview','justview', 'kalambot','kalambot', 'kamano\.de_newsfeedverzeichnis','kamano.de_newsfeedverzeichnis', 'kapsi','kapsi', 'katipo','katipo', 'kazoombot','kazoombot', 'kevin','kevin', 'keyoshid','keyoshid', 'kilroy','kilroy', 'kinja\-imagebot','kinja-imagebot', 'kinjabot','kinjabot', 'knowitall','knowitall', 'knowledge\.com','knowledge.com', 'ko[_+\s]yappo[_+\s]robot','ko yappo robot', 'kouaa_krawler','kouaa_krawler', 'krugle','krugle', 'ksibot','ksibot', 'kummhttp','kummhttp', 'kurzor','kurzor', 'labelgrabber\.txt','labelgrabber.txt', 'lanshanbot','lanshanbot', 'larbin','larbin', 'largesmall[\x20]crawler','largesmall crawler', 'legs','legs', 'letscrawl\.com','letscrawl.com', 'libcrawl','libcrawl', 'lilina','lilina', 'link_valet_online','link_valet_online', 'linkbot','linkbot', 
'linkdex\.com','linkdex.com', 'linkidator','linkidator', 'linkscan','linkscan', 'linkstats[\x20]bot','linkstats bot', 'linkwalker','linkwalker', 'lipperhey','lipperhey', 'livejournal\.com','livejournal.com', 'lmspider','lmspider', 'loadtimebot','loadtimebot', 'lockon','lockon', 'logo_gif','logo_gif', 'longurl','longurl', 'lssrocketcrawler','lssrocketcrawler', 'ltbot','ltbot', 'ltx71','ltx71', 'lwp\-request','lwp-request', 'lwp\-trivial','lwp-trivial', 'lycos[_+\s]','lycos ', 'macworm','macworm', 'madaali\.de','madaali.de', 'magpierss','magpierss', 'magpie','magpie', 'mapoftheinternet\.com','mapoftheinternet.com', 'marvin','marvin', 'mattie','mattie', 'mediabot','mediabot', 'mediafox','mediafox', 'megaindex','megaindex', 'megite','megite', 'memorybot','memorybot', 'mercator','mercator', 'meshexplorer','meshexplorer', 'metager2\-verification\-bot','metager2-verification-bot', 'metajobbot','metajobbot', 'metaspinner','metaspinner', 'metauri','metauri', 'miadev','miadev', 'microsoft[_+\s]url[_+\s]control','microsoft url control', 'microsoft[\x20]bits','microsoft bits', 'microsoft\-webdav\-miniredir','microsoft-webdav-miniredir', 'mindcrawler','mindcrawler', 'mindupbot','mindupbot', 'mini\-reptile','mini-reptile', 'minirank','minirank', 'misterbot','misterbot', 'miva','miva', 'mizzu_labs','mizzu_labs', 'mnogosearch','mnogosearch', 'moget','moget', 'momspider','momspider', 'monster','monster', 'motor','motor', 'movabletype','movabletype', 'ms[_+\s]search[_+\s]6\.0[_+\s]robot','ms search 6.0 robot', 'ms_search_4\.0_robot','ms_search_4.0_robot', 'msnbot\-udiscovery','msnbot-udiscovery', 'msrabot','msrabot', 'msrbot','msrbot', 'mt::telegraph::agent','mt::telegraph::agent', 'muncher','muncher', 'muscatferret','muscatferret', 'mwdsearch','mwdsearch', 'mydoyouhike','mydoyouhike', 'myweb','myweb', 'nagios','nagios', 'nasa_search','nasa_search', 'ndspider','ndspider', 'nederland\.zoek','nederland.zoek', 'netcarta','netcarta', 'netcraft','netcraft', 'netluchs','netluchs', 
'netmechanic','netmechanic', 'netnewswire','netnewswire', 'netscoop','netscoop', 'netsprint','netsprint', 'netvibes','netvibes', 'newrelicpinger','newrelicpinger', 'newscan\-online','newscan-online', 'newsfox','newsfox', 'newsgatoronline','newsgatoronline', 'nextgensearchbot','nextgensearchbot', 'nhse','nhse', 'nicebot','nicebot', 'nimblecrawler','nimblecrawler', 'ning','ning', 'nomad','nomad', 'northstar','northstar', 'noxtrumbot','noxtrumbot', 'npbot','npbot', 'nzexplorer','nzexplorer', 'objectssearch','objectssearch', 'occam','occam', 'ocelli','ocelli', 'octopus','octopus', 'octora_beta_bot','octora_beta_bot', 'onet\.pl[_+\s]sa','onet.pl sa', 'onfolio','onfolio', 'openfind','openfind', 'opentaggerbot','opentaggerbot', 'openwebspider','openwebspider', 'optimizer','optimizer', 'oracle_ultra_search','oracle_ultra_search', 'orb_search','orb_search', 'orbiter','orbiter', 'packrat','packrat', 'pageboy','pageboy', 'panscient','panscient', 'parasite','parasite', 'passwordmaker\.org','passwordmaker.org', 'patric','patric', 'pear_http_request_class','pear_http_request_class', 'peerbot','peerbot', 'pegasus','pegasus', 'perignator','perignator', 'perman','perman', 'petersnews','petersnews', 'phantom','phantom', 'php[_+\s]version[_+\s]tracker','php version tracker', 'phpcrawl','phpcrawl', 'phpdig','phpdig', 'picmole','picmole', 'pictureofinternet','pictureofinternet', 'piltdownman','piltdownman', 'pimptrain','pimptrain', 'ping\.blo\.gs','ping.blo.gs', 'pingdom','pingdom', 'pioneer','pioneer', 'pita','pita', 'pitkow','pitkow', 'pjspider','pjspider', 'plinki','plinki', 'pluckfeedcrawler','pluckfeedcrawler', 'plumtreewebaccessor','plumtreewebaccessor', 'pogodak','pogodak', 'pompos','pompos', 'popdexter','popdexter', 'poppi','poppi', 'port_huron_labs','port_huron_labs', 'portalb','portalb', 'postfavorites','postfavorites', 'postpost','postpost', 'postrank','postrank', 'powermarks','powermarks', 'printfulbot','printfulbot', 'proodlebot','proodlebot', 'protopage','protopage', 
'publiclibraryarchive','publiclibraryarchive', 'pyquery','pyquery', 'python','python', 'qihoobot','qihoobot', 'quipply','quipply', 'qwantify','qwantify',
# The label is display text, not a regex: inside single quotes "\_" would be
# shown with its backslash, so the underscore is written plain.
'r6\_','r6_',
'rambler','rambler', 'ratingburner','ratingburner', 'raven','raven', 'rbse','rbse', 'redalert','redalert', 'regator','regator', 'relevantnoise\.com','relevantnoise.com', 'resumerobot','resumerobot', 'rhcs','rhcs', 'riddler','riddler', 'road_runner','road_runner', 'robbie','robbie', 'robi','robi', 'robocrawl','robocrawl', 'robofox','robofox', 'robozilla','robozilla', 'rojo','rojo', 'rome[\x20]client','rome client', 'roverbot','roverbot', 'rpt\-httpclient','rpt-httpclient', 'rssgraffiti','rssgraffiti', 'rssimagesbot','rssimagesbot', 'ruffle','ruffle', 'rufusbot','rufusbot', 'rules','rules', 'safeads\.xyz','safeads.xyz', 'safetynetrobot','safetynetrobot', 'sage\+\+','sage++', 'sandcrawler','sandcrawler', 'savetheworldheritage','savetheworldheritage', 'sbider','sbider', 'schizozilla','schizozilla', 'scooter','scooter', 'scoutjet','scoutjet', 'scumbot','scumbot', 'search\-info','search-info', 'search_au','search_au', 'searchguild[_+\s]dmoz[_+\s]experiment','searchguild dmoz experiment', 'searchmetricsbot','searchmetricsbot', 'searchprocess','searchprocess', 'seekbot','seekbot', 'semalt','semalt', 'senrigan','senrigan', 'sensis_web_crawler','sensis_web_crawler', 'seodiver','seodiver', 'seokicks\.de','seokicks.de', 'seoscanners','seoscanners', 'sgscout','sgscout', 'shaggy','shaggy', 'shaihulud','shaihulud', 'shareaholicbot','shareaholicbot', 'shoutcast','shoutcast', 'sift','sift', 'simbot','simbot', 'simplepie','simplepie', 'sistrix','sistrix', 'site\-valet','site-valet', 'sitebot','sitebot', 'sitedomain\-bot','sitedomain-bot', 'sitetech','sitetech', 'skimbot','skimbot', 'skymob','skymob', 'slcrawler','slcrawler', 'slurp','slurp', 'slysearch','slysearch', 'smartspider','smartspider', 'smtbot','smtbot', 'snap\.com_beta_crawler','snap.com_beta_crawler', 'snappy','snappy', 'snooper','snooper',
'sohu\-search','sohu-search', 'sohu','sohu ( catchall )', 'solbot','solbot', 'speedy','speedy', 'sphere_scout','sphere_scout', 'spider[_+\s]monkey','spider monkey', 'spiderline','spiderline', 'spiderlytics','spiderlytics', 'spiderman','spiderman', 'spiderview','spiderview', 'spip','spip', 'sproose_crawler','sproose_crawler', 'spry','spry', 'sqworm','sqworm', 'ssearcher','ssearcher', 'steeler','steeler', 'steroid__download','steroid__download', 'stq_bot','stq_bot', 'Stratagems[\x20]Kumo','Stratagems Kumo', 'suchfin\-bot','suchfin-bot', 'suke','suke', 'summify\.com','summify.com', 'sunrise','sunrise', 'suntek','suntek', 'superbot','superbot', 'superfeedr','superfeedr', 'susie','susie', 'sven','sven', 'syndic8','syndic8', 'syndicapi','syndicapi', 'synoobot','synoobot', 'synthesio','synthesio', 't\-h\-u\-n\-d\-e\-r\-s\-t\-o\-n\-e','t-h-u-n-d-e-r-s-t-o-n-e', 'tach_bw','tach_bw', 'tagyu_agent','tagyu_agent', 'tailrank','tailrank', 'tarantula','tarantula', 'tarspider','tarspider', 'tcl_http_client_package','tcl_http_client_package', 'techbot','techbot', 'technoratibot','technoratibot', 'templeton','templeton', 'teoma','teoma', 'teragramcrawlersurf','teragramcrawlersurf', 'test_crawler','test_crawler', 'testbot','testbot', 'thumbsniper','thumbsniper', 'titan','titan', 'titin','titin', 'tkwww','tkwww', 'tlspider','tlspider', 'topblogsinfo','topblogsinfo', 'topicblogs','topicblogs', 'topix\.net','topix.net', 'trapit','trapit', 'trileet','trileet', 'turtlescanner','turtlescanner', 'turtle','turtle', 'tutorgigbot','tutorgigbot', 'tweetedtimes','tweetedtimes', 'twiceler','twiceler', 'twisted[\x20]pagegetter','twisted pagegetter', 'twitterbot','twitterbot', 'twitterfeed','twitterfeed', 'ubicrawler','ubicrawler', 'ucsd','ucsd', 'udmsearch','udmsearch', 'ultraseek','ultraseek', 'unchaos_bot_hybrid_web_search_engine','unchaos_bot_hybrid_web_search_engine', 'unido\-bot','unido-bot', 'unisterbot','unisterbot', 'universalfeedparser','universalfeedparser', 
'unlost_web_crawler','unlost_web_crawler', 'unwindfetchor','unwindfetchor', 'updated','updated', 'urlck','urlck', 'ustc\-semantic\-group','ustc-semantic-group', 'vagabondo\-wap','vagabondo-wap', 'vagabondo','vagabondo', 'valkyrie','valkyrie', 'vermut','vermut', 'versus_crawler_from_eda\.baykan@epfl\.ch','versus_crawler_from_eda.baykan@epfl.ch', 'verticrawl','verticrawl', 'vespa_crawler','vespa_crawler', 'victoria','victoria', 'virus[_+\s]detector','virus_detector', 'visionsearch','visionsearch', 'voidbot','voidbot', 'voltron','voltron', 'vse/','vse', 'vwbot','vwbot', 'w3c[_+\s]css[_+\s]validator[_+\s]jfouffa','w3c_css_validator_jfouffa', 'w3index','w3index', 'w3m2','w3m2', 'wallpaper','wallpaper', 'wanderer','wanderer', 'wapspider','wapspider', 'wapspIRLider','wapspIRLider', 'watchmouse','watchmouse', 'wavefire','wavefire', 'waybackarchive\.org','waybackarchive.org', 'wazzup','wazzup', 'web_downloader','web_downloader', 'webbandit','webbandit', 'webbase','webbase', 'webcatcher','webcatcher', 'webclipping\.com','webclipping.com', 'webcollage','webcollage', 'webcompass','webcompass', 'webcopy','webcopy', 'webcrawl\.net','webcrawl.net', 'webdup','webdup', 'webfetcher','webfetcher', 'webfilter','webfilter', 'webfoot','webfoot', 'webinator','webinator', 'webindexer','webindexer', 'weblayers','weblayers', 'weblinker','weblinker', 'webminer','webminer', 'webmirror','webmirror', 'webmoose','webmoose', 'webquest','webquest', 'webreader','webreader', 'webreaper','webreaper', 'website[_+\s]monitoring[_+\s]bot','website monitoring bot', 'websnarf','websnarf', 'webspider','webspider', 'webvac','webvac', 'webvulncrawl','webvulncrawl', 'webwalker','webwalker', 'webwalk','webwalk', 'webwatch','webwatch', 'wells_search','wells_search', 'wer\-liefert\-was','wer-liefert-was', 'wesee:search','wesee:search', 'wevikabot','wevikabot', 'whatuseek','whatuseek', 'whowhere','whowhere', 'windows\-rss\-platform','windows-rss-platform', 'wired\-digital','wired-digital', 'zyborg','zyborg', 
'wisenutbot','wisenutbot', 'wiumi','wiumi', 'wmir','wmir', 'wolp','wolp', 'wombat','wombat', 'wonderer','wonderer', 'woozweb','woozweb', 'wordpress','wordpress', 'worm','worm', 'wume_crawler','wume_crawler', 'wwwc','wwwc', 'wwweasel','wwweasel', 'wz101','wz101', 'xget','xget', 'xirq','xirq', 'xydo','xydo', 'y!j','y!j', 'yahoo![\x20]searchmonkey','yahoo! searchmonkey', 'yahoo!_mindset','yahoo!_mindset', 'yahoo\-blogs','yahoo-blogs', 'yahoo\-mmcrawler','yahoo-mmcrawler', 'yahoo\-newscrawler','yahoo-newscrawler', 'yahoo[\x20]pipes','yahoo pipes', 'yahoo\-verticalcrawler','yahoo-verticalcrawler', 'yahoocachesystem','yahoocachesystem', 'yahooexternalcache','yahooexternalcache', 'yahoofeedseeker','yahoofeedseeker', 'yahooseeker\-testing','yahooseeker-testing', 'yahooseeker','yahooseeker', 'yahooysmcm','yahooysmcm', 'yammer','yammer', 'yanga','yanga', 'yet\-another\-spider','yet-another-spider', 'yeti','yeti', 'yie8','yie8', 'yodaobot','yodaobot', 'yooglifetchagent','yooglifetchagent', 'youdao','youdao', 'yourls','yourls', 'z\-add_link_checker','z-add_link_checker', 'zealbot','zealbot', 'zemanta','zemanta', 'zend_http_client','zend_http_client', 'zeus','zeus', 'zhuaxia','zhuaxia', '[^a]fish','[^a]fish', '[\x20]netseer[\x20]',' netseer ', '^[1-3]$','^[1-3]$', '^finbot','^finbot', '^motorola$','^motorola$', '^msie','^msie', '^voyager/','^voyager', '^webindex$','webindex', '1\-more_scanner','1-more_scanner', # Generic robot 'robot','robot', 'blog','blog', 'checker','checker', 'crawl','crawl', 'discover','discover', 'feed','feed', 'fetcher','fetcher', 'hunter','hunter', 'link','link', 'scanner','scanner', 'seek','seek', 'sitemap','sitemap', 'spider','spider', 'sucker','sucker', 'validator','validator', 'bot[\s_+:,\.\;\/\\\-]','Unknown robot identified by bot\*', '[\s_+:,\.\;\/\\\-]bot','Unknown robot identified by \*bot', 'curl','Curl', 'php','A PHP script', 'ruby/','Ruby script', 'no_user_agent','empty user agent string', # Unknown robots identified by hit on robots.txt 
'unknown','Unknown robot (identified by hit on robots.txt)'
);


# RobotsAffiliateLib
# This list tries to tell by which Search Engine a robot is used.
# Each entry has the regular expression specifying the bot as its first
# part and a string naming the company or product managing the bot as its
# second part. The list is currently empty; this information is not used yet.
#-------------------------------------------------------------
# NOTE: the assignment and the final "1;" must each start on their own line.
# A '#' comment extends to the end of the line, so code placed after a
# comment on the same line is silently ignored by Perl.
%RobotsAffiliateLib = (
);

# Final true value, needed when this file is loaded with require.
1;