### Common/Large search engines, research
###

# Microsoft's crawler (MSNBOT) is denied the whole site. Rationale:
# Also see http://www.idlewords.com/boycott.pl
# The Internet doesn't want Microsoft to control search content.
# The story at http://www.theinquirer.net/index.html?article=12603 seems
# overly biased.
# Biased results are undesirable: http://www.intern.de/news/5876.html
User-agent: MSNBOT
Disallow: /

# These guys currently power Microsoft's search.msn.com
# NOTE(review): this rationale looks dated (it refers to search.msn.com);
# confirm that denying Slurp the whole site is still intended.
User-agent: Slurp
Disallow: /

# Microsoft Bing - allowed (group deliberately left commented out) now that
# Google has turned monopolist.
#User-agent: bingbot
#Disallow: /

# Google. History: Googlebot used to be blocked outright because it caused
# 500MB of traffic per day on a single 9MB file. The search crawlers are
# currently allowed again, with a Crawl-delay as a politeness hint; swap the
# empty "Disallow:" for the commented-out "Disallow: /" to block them again.
#
# Every group below carries an explicit rule line and groups are separated by
# blank lines. This matters: parsers that ignore Crawl-delay (Google documents
# that it does) treat consecutive User-agent lines with no Allow/Disallow rule
# between them as ONE group, so without the empty "Disallow:" (= allow
# everything) the search crawlers would be folded into the "Disallow: /" of
# the AdsBot-Google group further down. Likewise, comment-only lines are not
# record separators in the original robots.txt standard; only blank lines are.
User-agent: Googlebot
Disallow:
#Disallow: /
Crawl-delay: 10

User-agent: Googlebot-Image
Disallow:
#Disallow: /
Crawl-delay: 15

User-agent: Googlebot-Video
Disallow:
Crawl-delay: 60

User-agent: Googlebot-News
Disallow:
Crawl-delay: 15

# Google's advertising crawlers are denied the whole site.
User-agent: AdsBot-Google
Disallow: /

User-agent: AdsBot-Google-Mobile
Disallow: /

User-agent: Mediapartners-Google
Disallow: /

# Natural language research but go easy on .rpm/.deb.
# The empty "Disallow:" (= allow everything) gives this group an explicit rule
# line, so parsers that ignore Crawl-delay cannot merge it into whichever
# "Disallow: /" group follows it in the file.
User-agent: LCC
Disallow:
Crawl-delay: 90


### Troublemaker / Undesirable
###

# Every crawler in this section is denied the whole site ("Disallow: /").

# Why does a whois registry need to collect web server info?
User-Agent: SurveyBot
Disallow: /

# No use for this
User-Agent: SBIder
Disallow: /

# Ignores robots.txt!!
# (Listed anyway so the intent is on record.)
User-agent: MegaIndex.ru
Disallow: /

# Saturates the server. Goodbye.
User-agent: YandexBot
Disallow: /


### No only-for-pay services (exceptions available for a fee).
###

# Every crawler in this section is denied the whole site ("Disallow: /").

# We don't need another copyright police; server smashing?
User-agent: NPBot
Disallow: /

# Plagiarism-detection service: http://www.turnitin.com/
User-agent: TurnitinBot
Disallow: /

# Get rid of pay-only search engines, esp. those with 404 info pages
User-agent: Evaal
Disallow: /

# As of yet unknown whether it's pay-only
User-Agent: RufusBot
Disallow: /

# NOTE(review): no rationale was recorded for LinkWalker; it may share the
# RufusBot comment above - confirm.
User-Agent: LinkWalker
Disallow: /

# Marketroids
User-agent: linkdexbot
Disallow: /

# Marketroids
User-agent: AhrefsBot
Disallow: /

# Marketroids
User-agent: SEOkicks-Robot
Disallow: /

# Marketroids (own website data is free)
# Wikipedia spamming.
User-agent: MJ12bot
Disallow: /

# Selling browser search add-ons.
User-agent: Cliqzbot
Disallow: /

# Marketroidal services
User-agent: BLEXBot
Disallow: /

# Marketroidal services
User-agent: SemrushBot
Disallow: /

# Pay services/don't say.
User-agent: Streamline3Bot
Disallow: /


### Single-platform bots are not permitted. These serve only a fraction of the
### Internet community.

# Get rid of search engines which provide service via binary-only proprietary
# technology only, and exclusively to users of one particular browser.
# Each crawler listed here is denied the whole site.
User-agent: girafa
Disallow: /

User-agent: boitho.com-dc
Disallow: /

User-agent: findlinks
Disallow: /


### Robots must say who they are / work for, why and what for they're collecting
### data.
### No anonymous robots. (Info not in English may be treated as anonymous.)

# Every active (non-commented) group in this section denies the named crawler
# the whole site. Groups without a comment have no individual rationale
# recorded beyond the section heading.
User-agent: Baiduspider
Disallow: /

# This is the Internet archive at archive.org.
## These guys don't leave info about themselves, don't say how to avoid being
## crawled, and reek of amazon.com. Yuck.
# (The group is commented out, so ia_archiver is not blocked by this entry.)
#User-agent: ia_archiver
#Disallow: /

User-agent: IRLbot
Disallow: /

User-agent: hl_ftien_spider_v1.1
Disallow: /

# Their web page is empty.
User-agent: aipbot
Disallow: /

User-agent: dloader(NaverRobot)
Disallow: /

# Little info, probably pay-only, possibly ill-behaved.
# NOTE(review): this token ends in "/", which is outside the product-token
# character set of RFC 9309; whether a given parser still matches it is
# implementation-specific. The same applies to "voyager/" and "Java/" further
# down - confirm these match as intended.
User-Agent: Exabot/
Disallow: /

User-agent: EvilSpider
Disallow: /

# User-agent matching is case-insensitive in the robots.txt specifications, so
# this is the same token as the lowercase "fast" group in the Wikipedia list.
User-agent: FAST
Disallow: /

User-agent: Gigamega.bot
Disallow: /

User-agent: ichiro
Disallow: /

User-agent: icsbot-0.1
Disallow: /

# Only in Hungarian: http://robot.lapozz.com
# (Commented out, so LapozzBot is not blocked by this entry.)
#User-agent: LapozzBot
#Disallow: /

User-agent: NaverBot
Disallow: /

User-agent: noxtrumbot
Disallow: /

# Don't say who is behind them and why they crawl: http://www.omni-explorer.com/
User-Agent: OmniExplorer_Bot
Disallow: /

User-agent: OsO
Disallow: /

User-agent: psycheclone
Disallow: /

User-agent: ShablastBot
Disallow: /

User-agent: StackRambler
Disallow: /

User-agent: TestBot
Disallow: /

User-agent: thesubot
Disallow: /

User-agent: Twiceler
Disallow: /

# Only in French:
# VoilaBot http://www.voila.com/
User-agent: VoilaBot
Disallow: /

User-agent: voyager/
Disallow: /

# NOTE(review): presumably aimed at clients that send a "Java/<version>"
# User-Agent string - confirm.
User-agent: Java/
Disallow: /

User-agent: Sogou
Disallow: /


### Wikipedia's list
###

# Every crawler in this section is denied the whole site ("Disallow: /").

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Misbehaving: requests much too fast:
# (User-agent matching is case-insensitive in the robots.txt specifications,
# so this is the same token as the "FAST" group earlier in this file; the
# duplicate is harmless because both groups carry the same rule.)
User-agent: fast
Disallow: /

# The 'grub' distributed client has been *very* poorly behaved.
User-agent: grub-client
Disallow: /

# Doesn't follow robots.txt anyway, but...
User-agent: k2spider
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /


### Default
###

# Fallback for every crawler not named in a group of its own: the whole site
# may be crawled, at no more than one request per 30 seconds (for crawlers
# that honour Crawl-delay). The empty "Disallow:" means "nothing is
# disallowed"; it gives the group the rule line the original robots.txt
# standard requires and keeps it self-contained if groups are appended later.
User-agent: *
Disallow:
Crawl-delay: 30