# Source wiki page: 사용자:풀빵/스크립트작업실/replace-project.py
# ("보이기" = "show" — artifact of copying this script from a wiki page)
# -*- coding: utf-8 -*-
"""
replace project
"""
#
# (C) Daniel Herding & the Pywikipediabot Team, 2004-2008
#
# Distributed under the terms of the MIT license.
#
# 토론 페이지 없을 경우 새 토론 페이지가 자동으로 생성되는 것을 막으려면...
# (To keep the bot from automatically creating a talk page when none
# exists, ask first, like this:)
#
# choice = wikipedia.inputChoice(
#     u'Do you want to create a new one?',
#     ['Yes', 'No'], ['y', 'N'], 'N')
# if choice in ['n', 'N']:
#     continue
#
# 이런 식으로 하면 됩니다. (...that is how to do it.)
from __future__ import generators
import sys, re, time
import wikipedia, pagegenerators, catlib, config
# Imports predefined replacements tasks from fixes.py
import fixes
# This is required for the text that is shown when you run this script
# with the parameter -help.
# Placeholders expanded in the -help text.  The original file had the
# mojibake key '¶ms;' -- a mangled '&params;' (the '&par' prefix was
# decoded as the HTML entity '&para;' = '¶'), which would leave the
# generator-options help unexpanded.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
    '&fixes-help;': fixes.help,
}
# Global configuration -- must be set before running!
# When True, the bot removes the project template instead of adding it.
g_removal_tmpl = False
# Alternative project-template pairs (uncomment the pair you need):
#g_prj_tmpl = u'{{화학 프로젝트}}\r\n'
#g_prj_regex = u'\{\{화학 프로젝트\}\}'
#g_prj_tmpl = u'{{화폐 프로젝트}}\r\n'
#g_prj_regex = u'{{화폐 프로젝트}}'
# Template wikitext prepended to talk pages ("화학 프로젝트" = Chemistry WikiProject).
g_prj_tmpl = u'{{화학 프로젝트}}\r\n'
# Regex used to detect (and, in removal mode, remove) the template.
# NOTE: not a raw string; '\{' is not a recognized escape so the
# backslashes survive intact.
g_prj_regex = u'\{\{화학 프로젝트\}\}'
__version__='$Id: replace.py 5269 2008-04-24 15:19:15Z huji $'
# Summary messages in different languages
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# below.
# Per-language edit summaries.  Every entry must contain exactly one
# '%s' placeholder, because the summary is later built with
# wikipedia.translate(site, msg) % description.
# BUG FIX: the 'ru' entry was missing its '%s', which would make the
# '%' formatting raise for Russian wikis.
msg = {
    'ar':u'%s روبوت : استبدال تلقائي للنص',
    'de':u'Bot: Automatisierte Textersetzung %s',
    'el':u'Ρομπότ: Αυτόματη αντικατάσταση κειμένου %s',
    'en':u'Robot: Automated text replacement %s',
    'es':u'Robot: Reemplazo automático de texto %s',
    'fa':u'ربات: تغییر خودکار متن %s',
    'fr':u'Bot : Remplacement de texte automatisé %s',
    'he':u'בוט: החלפת טקסט אוטומטית %s',
    'hu':u'Robot: Automatikus szövegcsere %s',
    'ia':u'Robot: Reimplaciamento automatic de texto %s',
    'id':u'Bot: Penggantian teks otomatis %s',
    'is':u'Vélmenni: breyti texta %s',
    'it':u'Bot: Sostituzione automatica %s',
    'ja':u'ロボットによる: 文字置き換え %s',
    'ka':u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
    'kk':u'Бот: Мәтінді өздікті алмастырды: %s',
    'ksh':u'Bot: hät outomatesch Täx jetuusch: %s',
    'lt':u'robotas: Automatinis teksto keitimas %s',
    'nds':u'Bot: Text automaatsch utwesselt: %s',
    'nds-nl':u'Bot: autematisch tekse vervungen %s',
    'nl':u'Bot: automatisch tekst vervangen %s',
    'nn':u'robot: automatisk teksterstatting: %s',
    'no':u'bot: Automatisk teksterstatning: %s',
    'pl':u'Robot automatycznie zamienia tekst %s',
    'pt':u'Bot: Mudança automática %s',
    'ru':u'Робот: Автоматизированная замена текста %s',
    'sr':u'Бот: Аутоматска замена текста %s',
    'sv':u'Bot: Automatisk textersättning: %s',
    'zh': u'機器人:執行文字代換作業 %s',
}
class XmlDumpReplacePageGenerator:
    """
    Iterator that will yield Pages that might contain text to replace.

    These pages will be retrieved from a local XML dump file.

    Arguments:
        * xmlFilename  - The dump's path, either absolute or relative
        * xmlStart     - Skip all articles in the dump before this one
        * replacements - A list of 2-tuples of original text (as a
                         compiled regular expression) and replacement
                         text (as a string).
        * exceptions   - A dictionary which defines when to ignore an
                         occurrence.  See docu of the ReplaceRobot
                         constructor below.
    """

    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        # While True, dump entries are skipped until xmlStart is reached.
        self.skipping = bool(xmlStart)
        # Exceptions that suppress individual replacements inside a page.
        self.excsInside = []
        if 'inside-tags' in self.exceptions:
            self.excsInside += self.exceptions['inside-tags']
        if 'inside' in self.exceptions:
            self.excsInside += self.exceptions['inside']
        # Deferred import: xmlreader is only needed when a dump is used.
        import xmlreader
        self.site = wikipedia.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()

    def __iter__(self):
        try:
            for entry in self.parser:
                if self.skipping:
                    if entry.title != self.xmlStart:
                        continue
                    self.skipping = False
                if not self.isTitleExcepted(entry.title) \
                        and not self.isTextExcepted(entry.text):
                    new_text = entry.text
                    for old, new in self.replacements:
                        new_text = wikipedia.replaceExcept(
                            new_text, old, new, self.excsInside)
                    # Only yield pages where a replacement would change
                    # something.
                    if new_text != entry.text:
                        yield wikipedia.Page(self.site, entry.title)
        except KeyboardInterrupt:
            try:
                if not self.skipping:
                    wikipedia.output(
                        u'To resume, use "-xmlstart:%s" on the command line.'
                        % entry.title)
            except NameError:
                # Interrupted before the first entry was read.
                pass

    def isTitleExcepted(self, title):
        """Return True iff a 'title' exception regex matches the title."""
        if 'title' in self.exceptions:
            for exc in self.exceptions['title']:
                if exc.search(title):
                    return True
        return False

    def isTextExcepted(self, text):
        """Return True iff a 'text-contains' exception regex matches."""
        if 'text-contains' in self.exceptions:
            for exc in self.exceptions['text-contains']:
                if exc.search(text):
                    return True
        return False
class ReplaceRobot:
"""
A bot that can do text replacements.
"""
def __init__(self, generator, replacements, exceptions={},
acceptall=False, allowoverlap=False, recursive=False,
addedCat=None, sleep=None):
"""
Arguments:
* generator - A generator that yields Page objects.
* replacements - A list of 2-tuples of original text (as a
compiled regular expression) and replacement
text (as a string).
* exceptions - A dictionary which defines when not to change an
occurence. See below.
* acceptall - If True, the user won't be prompted before changes
are made.
* allowoverlap - If True, when matches overlap, all of them are
replaced.
* addedCat - If set to a value, add this category to every page
touched.
Structure of the exceptions dictionary:
This dictionary can have these keys:
title
A list of regular expressions. All pages with titles that
are matched by one of these regular expressions are skipped.
text-contains
A list of regular expressions. All pages with text that
contains a part which is matched by one of these regular
expressions are skipped.
inside
A list of regular expressions. All occurences are skipped which
lie within a text region which is matched by one of these
regular expressions.
inside-tags
A list of strings. These strings must be keys from the
exceptionRegexes dictionary in wikipedia.replaceExcept().
"""
self.generator = generator
self.replacements = replacements
self.exceptions = exceptions
self.acceptall = acceptall
self.allowoverlap = allowoverlap
self.recursive = recursive
if addedCat:
site = wikipedia.getSite()
cat_ns = site.category_namespaces()[0]
self.addedCat = wikipedia.Page(site,
cat_ns + ':' + addedCat)
self.sleep = sleep
def isTitleExcepted(self, title):
"""
Iff one of the exceptions applies for the given title, returns True.
"""
if 'title' in self.exceptions:
for exc in self.exceptions['title']:
if exc.search(title):
return True
return False
def isTextExcepted(self, original_text):
"""
Iff one of the exceptions applies for the given page contents,
returns True.
"""
if 'text-contains' in self.exceptions:
for exc in self.exceptions['text-contains']:
if exc.search(original_text):
return True
return False
def doReplacements(self, original_text):
"""
Returns the text which is generated by applying all replacements to
the given text.
"""
new_text = original_text
exceptions = []
if 'inside-tags' in self.exceptions:
exceptions += self.exceptions['inside-tags']
if 'inside' in self.exceptions:
exceptions += self.exceptions['inside']
for old, new in self.replacements:
if self.sleep != None:
time.sleep(self.sleep)
new_text = wikipedia.replaceExcept(new_text, old, new, exceptions,
allowoverlap=self.allowoverlap)
return new_text
def run(self):
"""
Starts the robot.
"""
# Run the generator which will yield Pages which might need to be
# changed.
new_text = u''
removal_tmpl_mode = False
try:
for page in self.generator:
if self.isTitleExcepted(page.title()):
wikipedia.output(
u'Skipping %s because the title is on the exceptions list.'
% page.aslink())
continue
try:
# Load the talkpage
if page.isTalkPage():
wikipedia.output(u'isTalkPage')
continue
#page.toggleTalkPage()
page = wikipedia.Page(page.site(),
page.site().namespace(page.namespace() + 1) + ':'
+ page.titleWithoutNamespace())
wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
% page.title())
if not page.isTalkPage():
wikipedia.output(u'not page.isTalkPage')
continue
if page.namespace() == 11 and g_removal_tmpl:
removal_tmpl_mode = True
if page.namespace() == 11 and not g_removal_tmpl:
continue
# Load the page's text from the wiki
if not page.exists() and not removal_tmpl_mode:
wikipedia.output(u'The talkpage is not exist... create a new one %s' % page.title())
new_text = g_prj_tmpl
try:
page.put_async(new_text, minorEdit = True)
except wikipedia.EditConflict:
wikipedia.output(u'Skipping %s because of edit conflict'
% (page.title(),))
except wikipedia.SpamfilterError, e:
wikipedia.output(
u'Cannot change %s because of blacklist entry %s'
% (page.title(), e.url))
except wikipedia.PageNotSaved, error:
wikipedia.output(u'Error putting page: %s'
% (error.args,))
except wikipedia.LockedPage:
wikipedia.output(u'Skipping %s (locked page)'
% (page.title(),))
continue
else:
wikipedia.output(u'The talkpage seems to be exist... edit %s' % page.title())
original_text = page.get(get_redirect=True)
if not removal_tmpl_mode :
#wikipedia.output(u'search %s in ..... ' % g_prj_regex)
pt = re.search(g_prj_regex, original_text)
if pt != None:
wikipedia.output(u'The template is found... ')
continue
if not page.canBeEdited():
wikipedia.output(u"You can't edit page %s"
% page.aslink())
continue
new_text = g_prj_tmpl + original_text
else: # removal_tmpl_mode
wikipedia.output(u'remove %s in ..... ' % g_prj_regex)
new_text = self.doReplacements(original_text)
except wikipedia.NoPage:
wikipedia.output(u'Page %s not found' % page.aslink())
continue
# Show the title of the page we're working on.
# Highlight the title in purple.
wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
% page.title())
wikipedia.showDiff(original_text, new_text)
if not self.acceptall:
choice = wikipedia.inputChoice(
u'Do you want to accept these changes?',
['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
if choice in ['a', 'A']:
self.acceptall = True
if choice in ['y', 'Y']:
page.put_async(new_text)
if self.acceptall:
try:
page.put_async(new_text)
except wikipedia.EditConflict:
wikipedia.output(u'Skipping %s because of edit conflict'
% (page.title(),))
except wikipedia.SpamfilterError, e:
wikipedia.output(
u'Cannot change %s because of blacklist entry %s'
% (page.title(), e.url))
except wikipedia.PageNotSaved, error:
wikipedia.output(u'Error putting page: %s'
% (error.args,))
except wikipedia.LockedPage:
wikipedia.output(u'Skipping %s (locked page)'
% (page.title(),))
finally:
wikipedia.output(u'... finally:!')
pass
def prepareRegexForMySQL(pattern):
    """
    Translate a Python regular expression into a form usable inside a
    MySQL RLIKE string literal.

    Python shorthand classes are mapped onto POSIX character classes and
    single quotes are escaped for SQL embedding.  The translation is a
    plain textual substitution, so it is best-effort only.

    NOTE(review): MySQL normally requires POSIX classes inside a bracket
    expression, e.g. [[:space:]] -- verify against the target server.
    """
    # Order matters: translate the shorthand classes first, then escape
    # single quotes so the result can sit inside a '...' SQL literal.
    for py_token, sql_token in (('\s', '[:space:]'),
                                ('\d', '[:digit:]'),
                                ('\w', '[:alnum:]'),
                                ("'", "\\" + "'")):
        pattern = pattern.replace(py_token, sql_token)
    #pattern = pattern.replace('\\', '\\\\')
    #for char in ['[', ']', "'"]:
    #    pattern = pattern.replace(char, '\%s' % char)
    return pattern
def main():
add_cat = None
gen = None
# summary message
summary_commandline = None
# Array which will collect commandline parameters.
# First element is original text, second element is replacement text.
commandline_replacements = []
# A list of 2-tuples of original text and replacement text.
replacements = []
# Don't edit pages which contain certain texts.
exceptions = {
'title': [],
'text-contains': [],
'inside': [],
'inside-tags': [],
}
# Should the elements of 'replacements' and 'exceptions' be interpreted
# as regular expressions?
regex = False
# Predefined fixes from dictionary 'fixes' (see above).
fix = None
# the dump's path, either absolute or relative, which will be used
# if -xml flag is present
xmlFilename = None
useSql = False
PageTitles = []
# will become True when the user presses a ('yes to all') or uses the
# -always flag.
acceptall = False
# Will become True if the user inputs the commandline parameter -nocase
caseInsensitive = False
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
# Do all hits when they overlap
allowoverlap = False
# Do not recurse replacement
recursive = False
# This factory is responsible for processing command line arguments
# that are also used by other scripts and that determine on which pages
# to work on.
genFactory = pagegenerators.GeneratorFactory()
# Load default summary message.
# BUG WARNING: This is probably incompatible with the -lang parameter.
wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))
# Between a regex and another (using -fix) sleep some time (not to waste
# too much CPU
sleep = None
# Read commandline parameters.
for arg in wikipedia.handleArgs():
if arg == '-regex':
regex = True
elif arg.startswith('-xmlstart'):
if len(arg) == 9:
xmlStart = wikipedia.input(
u'Please enter the dumped article to start with:')
else:
xmlStart = arg[10:]
elif arg.startswith('-xml'):
if len(arg) == 4:
xmlFilename = wikipedia.input(
u'Please enter the XML dump\'s filename:')
else:
xmlFilename = arg[5:]
elif arg =='-sql':
useSql = True
elif arg.startswith('-page'):
if len(arg) == 5:
PageTitles.append(wikipedia.input(
u'Which page do you want to change?'))
else:
PageTitles.append(arg[6:])
elif arg.startswith('-excepttitle:'):
exceptions['title'].append(arg[13:])
elif arg.startswith('-excepttext:'):
exceptions['text-contains'].append(arg[12:])
elif arg.startswith('-exceptinside:'):
exceptions['inside'].append(arg[14:])
elif arg.startswith('-exceptinsidetag:'):
exceptions['inside-tags'].append(arg[17:])
elif arg.startswith('-fix:'):
fix = arg[5:]
elif arg.startswith('-sleep:'):
sleep = float(arg[7:])
elif arg == '-always':
acceptall = True
elif arg == '-recursive':
recursive = True
elif arg == '-nocase':
caseInsensitive = True
elif arg.startswith('-addcat:'):
add_cat = arg[8:]
elif arg.startswith('-namespace:'):
try:
namespaces.append(int(arg[11:]))
except ValueError:
namespaces.append(arg[11:])
elif arg.startswith('-summary:'):
wikipedia.setAction(arg[9:])
summary_commandline = True
elif arg.startswith('-allowoverlap'):
allowoverlap = True
else:
generator = genFactory.handleArg(arg)
if generator:
gen = generator
else:
commandline_replacements.append(arg)
if len(commandline_replacements) % 2:
raise wikipedia.Error, 'require even number of replacements.'
elif len(commandline_replacements) == 2 and fix == None:
replacements.append((commandline_replacements[0],
commandline_replacements[1]))
if summary_commandline == None:
wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg )
% (' (-' + commandline_replacements[0] + ' +'
+ commandline_replacements[1] + ')'))
elif len(commandline_replacements) > 1:
if fix == None:
for i in xrange (0, len(commandline_replacements), 2):
replacements.append((commandline_replacements[i],
commandline_replacements[i + 1]))
if summary_commandline == None:
pairs = [( commandline_replacements[i],
commandline_replacements[i + 1] )
for i in range(0, len(commandline_replacements), 2)]
replacementsDescription = '(%s)' % ', '.join(
[('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
wikipedia.setAction(
wikipedia.translate(wikipedia.getSite(), msg )
% replacementsDescription)
else:
raise wikipedia.Error(
'Specifying -fix with replacements is undefined')
elif fix == None:
old = wikipedia.input(u'Please enter the text that should be replaced:')
new = wikipedia.input(u'Please enter the new text:')
change = '(-' + old + ' +' + new
replacements.append((old, new))
while True:
old = wikipedia.input(
u'Please enter another text that should be replaced, or press Enter to start:')
if old == '':
change += ')'
break
new = wikipedia.input(u'Please enter the new text:')
change = change + ' & -' + old + ' +' + new
replacements.append((old, new))
if summary_commandline != True:
default_summary_message = wikipedia.translate(wikipedia.getSite(), msg) % change
wikipedia.output(u'The summary message will default to: %s'
% default_summary_message)
summary_message = wikipedia.input(
u'Press Enter to use this default message, or enter a description of the\nchanges your bot will make:')
if summary_message == '':
summary_message = default_summary_message
wikipedia.setAction(summary_message)
else:
# Perform one of the predefined actions.
try:
fix = fixes.fixes[fix]
except KeyError:
wikipedia.output(u'Available predefined fixes are: %s'
% fixes.fixes.keys())
wikipedia.stopme()
sys.exit()
if 'regex' in fix:
regex = fix['regex']
if 'msg' in fix:
wikipedia.setAction(
wikipedia.translate(wikipedia.getSite(), fix['msg']))
if 'exceptions' in fix:
exceptions = fix['exceptions']
replacements = fix['replacements']
# already compile all regular expressions here to save time later
for i in range(len(replacements)):
old, new = replacements[i]
if not regex:
old = re.escape(old)
if caseInsensitive:
oldR = re.compile(old, re.UNICODE | re.IGNORECASE)
else:
oldR = re.compile(old, re.UNICODE)
replacements[i] = oldR, new
for exceptionCategory in ['title', 'text-contains', 'inside']:
if exceptionCategory in exceptions:
patterns = exceptions[exceptionCategory]
if not regex:
patterns = [re.escape(pattern) for pattern in patterns]
if caseInsensitive:
patterns = [re.compile(pattern, re.UNICODE | re.IGNORECASE)
for pattern in patterns]
else:
patterns = [re.compile(pattern, re.UNICODE)
for pattern in patterns]
exceptions[exceptionCategory] = patterns
if xmlFilename:
try:
xmlStart
except NameError:
xmlStart = None
gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart,
replacements, exceptions)
elif useSql:
whereClause = 'WHERE (%s)' % ' OR '.join(
["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
for (old, new) in replacements])
if exceptions:
exceptClause = 'AND NOT (%s)' % ' OR '.join(
["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern)
for exc in exceptions])
else:
exceptClause = ''
query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
gen = pagegenerators.MySQLPageGenerator(query)
elif PageTitles:
pages = [wikipedia.Page(wikipedia.getSite(), PageTitle)
for PageTitle in PageTitles]
gen = iter(pages)
if not gen:
# syntax error, show help text from the top of this file
wikipedia.showHelp('replace')
wikipedia.stopme()
sys.exit()
if namespaces != []:
gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
if xmlFilename:
# XML parsing can be quite slow, so use smaller batches and
# longer lookahead.
preloadingGen = pagegenerators.PreloadingGenerator(gen,
pageNumber=20, lookahead=100)
else:
preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall, allowoverlap, recursive, add_cat, sleep)
bot.run()
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always release the framework's throttle/lock state on exit.
        wikipedia.stopme()