# Utente:GiacoBot/itwiki (Da Wikipedia, l'enciclopedia libera.)
# -*- coding: utf-8  -*-
#
# (C) Daniel Herding, 2004
#
# Distributed under the terms of the MIT license.
#
__version__='$Id: replace.py,v 1.102 2006/03/12 16:35:54 wikipedian Exp $'

from __future__ import generators
import sys, re
import wikipedia, pagegenerators, catlib, config

# Edit summary templates, keyed by language code.
# Each template contains one %s placeholder; main() fills it with a
# description of the replacements, e.g. '(-old +new)'.
# NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes'
# below.
msg = {
       'de':u'Bot: Automatisierte Textersetzung %s',
       'en':u'Robot: Automated text replacement %s',
       'es':u'Robot: Reemplazo automático de texto %s',
       'fr':u'Bot : Remplacement de texte automatisé %s',
       'hu':u'Robot: Automatikus szövegcsere %s',
       'ia':u'Robot: Reimplaciamento automatic de texto %s',
       'is':u'Vélmenni: breyti texta %s',
       'it':u'Bot: Sostituzione automatica del testo %s',
       'ka':u'რობოტი: ტექსტის ავტომატური შეცვლა %s',
       'lt':u'Botas: Automatinis teksto keitimas %s',
       'pt':u'Bot: Mudança automática %s',
       'sr':u'Бот: Аутоматска замена текста %s',
       }

# Predefined replacements tasks.

def _finalAccentRules(words, wrong, right):
    """
    Builds the regular expression rules which repair the final accent of the
    given words.

    Arguments:
        * words - Correctly spelled words. Only the last letter of each word
                  is ignored; it is listed for readability and is expected
                  to be equal to 'right'.
        * wrong - The misspelled final letter that is searched for.
        * right - The final letter that replaces 'wrong'.

    Returns a list of 2-tuples (pattern, replacement). Each pattern matches
    the whole misspelled word, with either a lower or an upper case initial;
    the initial is captured as group 1 so that the replacement keeps the
    original capitalization.
    """
    rules = []
    for word in words:
        initial = word[0]
        stem = word[1:-1]
        pattern = u'\\b([%s%s])%s%s\\b' % (initial.lower(), initial.upper(),
                                           stem, wrong)
        rules.append((pattern, u'\\1%s%s' % (stem, right)))
    return rules

# Conjunctions (and other words) ending in -ché, often misspelled with -chè.
_cheWords = [
    u'abbenché', u'acciocché', u'affinché', u'alcunché', u'allorché',
    u'altroché', u'ancorché', u'anziché', u'attesoché', u'avvegnaché',
    u'avvegnadioché', u'avvengaché', u'avvengadioché', u'benché',
    u'checché', u'ciocché', u'comecché', u'conciofosseché', u'contuttoché',
    u'cosicché', u'cotalché', u'dacché', u'dappoiché', u'dimodoché',
    u'dopoché', u'dopodiché', u'essendoché', u'finattantoché', u'finché',
    u'fintantoché', u'finacché', u'fuorché', u'giacché', u'granché',
    u'giafosseché', u'giafossecosaché', u'infinattantoché', u'lorché',
    u'inquantoché', u'macché', u'nonché', u'nonsoché', u'oltreché',
    u'ondeché', u'perché', u'perciocché', u'perloché', u'perocché',
    u'poiché', u'posciaché', u'pressoché', u'purché', u'quantoché',
    u'quasiché', u'secondoché', u'sennonché', u'senonché', u'sicché',
    u'sinattantoché', u'sinché', u'sintantoché', u'stanteché', u'talché',
    u'talmenteché', u'tantoché', u'tranneché', u'tuttoché',
]

# Past historic verb forms ending in -é, often misspelled with -è.
_pastHistoricWords = [
    u'abbatté', u'accedé', u'addissé', u'adempié', u'annetté',
    u'antiprendé', u'appartené', u'appendé', u'apprendé', u'arrendé',
    u'arrompé', u'assisté', u'assolvé', u'astené', u'attené', u'avvedé',
    u'batté', u'cedé', u'cerné', u'chiedé', u'combatté', u'competé',
    u'compié', u'comprendé', u'concedé', u'concerné', u'connetté',
    u'consisté', u'contené', u'controbatté', u'convedé', u'convergé',
    u'convivé', u'corrompé', u'credé', u'decedé', u'defletté', u'delinqué',
    u'desisté', u'detené', u'devolvé', u'dibatté', u'dipendé', u'diprendé',
    u'dirimé', u'discerné', u'disottené', u'disperdé', u'displendé',
    u'dissolvé', u'dissovvenné', u'distemé', u'disvolvé', u'eccedé',
    u'elidé', u'eludé', u'erompé', u'esigé', u'esimé', u'esisté',
    u'espandé', u'estroqué', u'evolvé', u'fendé', u'fervé', u'fletté',
    u'frapprendé', u'fremé', u'genufletté', u'imbatté', u'imbevé', u'impié',
    u'imprendé', u'incedé', u'incombé', u'infremé', u'insisté',
    u'intercedé', u'interprendé', u'interrompé', u'intessé', u'intraprendé',
    u'intrarompé', u'intratessé', u'intrattené', u'intravedé',
    u'introfletté', u'irrompé', u'mantené', u'mieté', u'ottené', u'pendé',
    u'perdé', u'persisté', u'piové', u'possedé', u'poté', u'precedé',
    u'preesisté', u'premé', u'prendé', u'prescindé', u'presiedé',
    u'prevedé', u'procedé', u'propendé', u'prorompé', u'provolvé',
    u'rapprendé', u'rattené', u'ravvedé', u'recedé', u'redigé', u'rendé',
    u'resisté', u'retrocedé', u'riannetté', u'ribatté', u'ricedé',
    u'ricevé', u'richiedé', u'riconnetté', u'riconverré', u'ricredé',
    u'riedé', u'riempié', u'rifletté', u'ringodé', u'ripenté', u'ripeté',
    u'riprendé', u'risedé', u'risiedé', u'risolvé', u'risplendé', u'ritené',
    u'rivedé', u'rivendé', u'rivivé', u'rompé', u'sbatté', u'scandé',
    u'scerné', u'sconnetté', u'secerné', u'sfotté', u'soccombé',
    u'soprassedé', u'sopravvivé', u'sorprendé', u'sostené', u'spandé',
    u'sperdé', u'splendé', u'spremé', u'sterné', u'strafotté', u'stravedé',
    u'stridé', u'temé', u'tené', u'tessé', u'tralucé', u'transigé',
    u'trattené', u'travedé', u'vedé', u'vendé', u'verté',
]

# Words of French origin (and a few Italian ones) ending in -é, often
# misspelled with -è.
_acuteLoanWords = [
    u'antiré', u'autodafé', u'canneté', u'capitonné', u'cliché',
    u'cloisonné', u'consommé', u'cimpanzé', u'coupé', u'craquelé',
    u'decolleté', u'décolleté', u'defilé', u'défilé', u'degagé', u'dégagé',
    u'delavé', u'délavé', u'demodé', u'démodé', u'deraciné', u'déraciné',
    u'deshabillé', u'déshabillé', u'ecarté', u'enfant gâté', u'engagé',
    u'flambé', u'foncé', u'frisé', u'gaufré', u'glacé', u'granmercé',
    u'habitué', u'hôtel meublé', u'imprimé', u'interré', u'karité',
    u'marron glacé', u'matelassé', u'merzé', u'meublé', u'moiré',
    u'mouliné', u'negligé', u'négligé', u'nontiscordardimé', u'pancarré',
    u'pâté', u'saint-honoré', u'separé', u'soufflé', u'tamuré',
    u'trentatré', u'truffé', u'varieté', u'variété', u'viceré', u'ventitré',
]

# Words ending in -è (like caffè), often misspelled with -é.
_graveWords = [
    u'ahimè', u'ammazzacaffè', u'appiè', u'bignè', u'buffè', u'cabarè',
    u'cabriolè', u'canapè', u'carcadè', u'chedivè', u'cioè', u'coccodè',
    u'controbuffè', u'corvè', u'croscè', u'cupè', u'dappiè', u'dosacaffè',
    u'euhoè', u'forfè', u'karkadè', u'kedivè', u'lacchè', u'macinacaffè',
    u'macramè', u'mordorè', u'musmè', u'narghilè', u'parchè', u'purè',
    u'radiorelè', u'ramiè', u'sufflè', u'toppè', u'tostacaffè', u'tuppè',
    u'vahinè',
]

fixes = {
    # for it.wikipedia
    'accenti': {
        'regex': True,
        'msg': {
               'it':u'Correzione di alcuni errori comuni contenuti in questa [[Discussioni_Wikipedia:Bot/Sostituzioni/Espressioni_regolari|lista]]',
              },
        # All word rules are anchored with \b on both sides, so they cannot
        # overlap and their relative order does not influence the result.
        'replacements': (
            _finalAccentRules(_cheWords, u'è', u'é')
            + _finalAccentRules(_pastHistoricWords, u'è', u'é')
            + _finalAccentRules(_acuteLoanWords, u'è', u'é')
            + [
                # Rules which do not follow the "any initial case" scheme.
                (u'\\bmercè\\b', u'mercé'),
                (u'\\b([sS])cimpanz[eè]\\b', u'\\1cimpanzé'),
            ]
            + _finalAccentRules(_graveWords, u'é', u'è')
            + [
                # Proper names are only fixed when capitalized.
                (u'\\bGiosué\\b', u'Giosuè'),
                (u'\\bMosé\\b', u'Mosè'),
                # Other replacements: normalize standard section headings.
                (u"(?m)(== ?[Cc]ollegamenti Esterni ?==)", u"== Collegamenti esterni =="),
                (u"(?m)(== ?[Ll]ink [Ee]sterni ?==)", u"== Collegamenti esterni =="),
                (u"(?m)(== ?[Vv]edi [Aa]nche ?==)", u"== Voci correlate =="),
            ]
        ),
    },
}

class XmlDumpReplacePageGenerator:
    """
    Iterable over the Pages of a local XML dump file (cur table) whose text
    is a candidate for replacement: it matches at least one of the search
    patterns and none of the exception patterns.
    """
    def __init__(self, xmlFilename, replacements, exceptions):
        """
        Arguments:
            * xmlFilename  - Absolute or relative path of the dump.
            * replacements - List of 2-tuples: the text to look for (a
                             compiled regular expression) and the text to
                             put in its place (a string).
            * exceptions   - List of compiled regular expressions; a page
                             whose text matches any of them is left out.
        """
        self.exceptions = exceptions
        self.replacements = replacements
        self.xmlFilename = xmlFilename

    def __iter__(self):
        import xmlreader
        site = wikipedia.getSite()
        for entry in xmlreader.XmlDump(self.xmlFilename).parse():
            for excluded in self.exceptions:
                if excluded.search(entry.text):
                    # An exception applies: this dump entry is left out.
                    break
            else:
                # TODO: leave out pages that only have old inside nowiki, comments, math
                for pattern, replacement in self.replacements:
                    if pattern.search(entry.text):
                        # One hit is enough; each page is yielded only once.
                        yield wikipedia.Page(site, entry.title)
                        break
    

class ReplaceRobot:
    """
    A bot that can do text replacements.
    """
    def __init__(self, generator, replacements, exceptions=None, acceptall=False):
        """
        Arguments:
            * generator    - A generator that yields Page objects.
            * replacements - A list of 2-tuples of original text (as a compiled
                             regular expression) and replacement text (as a
                             string).
            * exceptions   - A list of compiled regular expression; pages which
                             contain text that matches one of these won't be
                             changed. Defaults to no exceptions.
            * acceptall    - If True, the user won't be prompted before changes
                             are made.
        """
        self.generator = generator
        self.replacements = replacements
        # A fresh list per instance: a literal [] default would be shared by
        # every robot created without explicit exceptions.
        if exceptions is None:
            exceptions = []
        self.exceptions = exceptions
        self.acceptall = acceptall

    def checkExceptions(self, original_text):
        """
        If one of the exceptions applies for the given text, returns the
        substring which matches the exception. Otherwise it returns None.
        """
        for exception in self.exceptions:
            hit = exception.search(original_text)
            if hit:
                return hit.group(0)
        return None

    def doReplacements(self, original_text):
        """
        Returns the text which is generated by applying all replacements, in
        list order, to the given text.
        """
        new_text = original_text
        for old, new in self.replacements:
            new_text = wikipedia.replaceExceptMathNowikiAndComments(new_text, old, new)
        return new_text

    def run(self):
        """
        Starts the robot: loads every page of the generator, applies the
        replacements and saves the page if the text changed and the user
        (or the acceptall setting) agrees.
        """
        # Run the generator which will yield Pages which might need to be
        # changed.
        for page in self.generator:
            try:
                # Load the page's text from the wiki
                original_text = page.get()
                if not page.canBeEdited():
                    wikipedia.output(u'Skipping locked page %s' % page.title())
                    continue
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' % page.title())
                continue
            except wikipedia.IsRedirectPage:
                # Redirect pages are processed too, using their own text.
                original_text = page.get(get_redirect=True)

            # skip all pages that contain certain texts
            match = self.checkExceptions(original_text)
            if match:
                wikipedia.output(u'Skipping %s because it contains %s' % (page.title(), match))
                continue

            new_text = self.doReplacements(original_text)
            if new_text == original_text:
                wikipedia.output(u'No changes were necessary in %s' % page.title())
                continue

            wikipedia.output(u'\n>>> %s <<<' % page.title())
            wikipedia.showDiff(original_text, new_text)
            if not self.acceptall:
                choice = wikipedia.inputChoice(u'Do you want to accept these changes?',  ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                if choice in ['a', 'A']:
                    # 'All' accepts this change and every following one.
                    self.acceptall = True
                elif choice not in ['y', 'Y']:
                    continue
            page.put(new_text)

def prepareRegexForMySQL(pattern):
    """
    Adapts a Python regular expression so that it can be embedded in a
    single-quoted MySQL string and used with RLIKE.

    The shorthand classes \\s, \\d and \\w are rewritten as POSIX character
    classes. A POSIX class name is only valid inside a bracket expression,
    so outside of one the class is wrapped in its own brackets
    ([[:space:]]), while inside an existing set only the class name is
    inserted ([:space:]). Single quotes are escaped with a backslash. Every
    other escape sequence, including an escaped backslash, is copied
    unchanged.

    Arguments:
        * pattern - The regular expression source, as a string.

    Returns the converted pattern as a string.
    """
    posixClasses = {'s': '[:space:]', 'd': '[:digit:]', 'w': '[:alnum:]_'}
    pieces = []
    inSet = False
    # Index at which a ']' is a literal member of the set instead of its end
    # (directly after '[' or '[^').
    literalBracketPos = -1
    i = 0
    length = len(pattern)
    while i < length:
        char = pattern[i]
        if char == '\\' and i + 1 < length:
            # Consume the backslash together with the escaped character, so
            # that the 's' of an escaped backslash ('\\s') is not mistaken
            # for a shorthand class.
            following = pattern[i + 1]
            if following in posixClasses:
                if inSet:
                    pieces.append(posixClasses[following])
                else:
                    pieces.append('[' + posixClasses[following] + ']')
            else:
                pieces.append(char + following)
            i += 2
            continue
        if char == "'":
            pieces.append("\\'")
        else:
            if inSet:
                if char == ']' and i != literalBracketPos:
                    inSet = False
            elif char == '[':
                inSet = True
                literalBracketPos = i + 1
                if pattern[i + 1:i + 2] == '^':
                    literalBracketPos = i + 2
            pieces.append(char)
        i += 1
    return ''.join(pieces)
    
                        
def main():
    """
    Reads the command line, collects the replacements and the pages to work
    on, and runs a ReplaceRobot on them.

    Recognized parameters: -regex, -file, -cat, -xml, -sql, -page, -ref,
    -links, -start, -google, -except:, -fix:, -always and -namespace:. Any
    other argument is collected as replacement text; exactly two of them
    (original and new text) are used, unless -fix: selects one of the
    predefined tasks from the 'fixes' dictionary. Without both, the texts
    are asked for interactively.

    Terminates the script via sys.exit() if the requested fix is unknown or
    if no page generator was selected.
    """
    gen = None
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = []
    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used when
    # -xml is given.
    xmlFilename = None
    useSql = False
    # the textfile's path, either absolute or relative, which will be used
    # when -file is given.
    textfilename = None
    # pages which will be processed when the -page parameter is used
    PageTitles = []
    # will become True when the user presses a ('yes to all') or uses the -always
    # commandline parameter.
    acceptall = False
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    # Google query
    googleQuery = None
    # Load default summary message.
    wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))

    # Read commandline parameters.
    # NOTE: wikipedia.getSite() is deliberately called again wherever a site
    # is needed instead of being cached before this loop -- presumably
    # handleArgs() can change the default site; verify against wikipedia.py.
    for arg in wikipedia.handleArgs():
        if arg == '-regex':
            regex = True
        elif arg.startswith('-file'):
            if len(arg) >= 6:
                textfilename = arg[6:]
            gen = pagegenerators.TextfilePageGenerator(textfilename)
        elif arg.startswith('-cat'):
            if len(arg) == 4:
                categoryname = wikipedia.input(u'Please enter the category name:')
            else:
                categoryname = arg[5:]
            cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % categoryname)
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg =='-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(wikipedia.input(u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-ref'):
            if len(arg) == 4:
                referredPageTitle = wikipedia.input(u'Links to which page should be processed?')
            else:
                referredPageTitle = arg[5:]
            referredPage = wikipedia.Page(wikipedia.getSite(), referredPageTitle)
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-links'):
            if len(arg) == 6:
                linkingPageTitle = wikipedia.input(u'Links from which page should be processed?')
            else:
                linkingPageTitle = arg[7:]
            linkingPage = wikipedia.Page(wikipedia.getSite(), linkingPageTitle)
            gen = pagegenerators.LinkedPageGenerator(linkingPage)
        elif arg.startswith('-start'):
            if len(arg) == 6:
                firstPageTitle = wikipedia.input(u'At which page do you want to start?')
            else:
                firstPageTitle = arg[7:]
            namespace = wikipedia.Page(wikipedia.getSite(), firstPageTitle).namespace()
            gen = pagegenerators.AllpagesPageGenerator(firstPageTitle, namespace)
        elif arg.startswith('-google'):
            if len(arg) >= 8:
                googleQuery = arg[8:]
            gen = pagegenerators.GoogleSearchPageGenerator(googleQuery)
        elif arg.startswith('-except:'):
            exceptions.append(arg[8:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg == '-always':
            acceptall = True
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        else:
            commandline_replacements.append(arg)

    if len(commandline_replacements) == 2 and fix is None:
        replacements.append((commandline_replacements[0], commandline_replacements[1]))
        # Build the whole description first: '%' binds tighter than '+', so
        # formatting inline would only substitute the first fragment.
        change = ' (-' + commandline_replacements[0] + ' +' + commandline_replacements[1] + ')'
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg) % change)
    elif fix is None:
        old = wikipedia.input(u'Please enter the text that should be replaced:')
        new = wikipedia.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = wikipedia.input(u'Please enter another text that should be replaced, or press Enter to start:')
            if old == '':
                change = change + ')'
                break
            new = wikipedia.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        default_summary_message =  wikipedia.translate(wikipedia.getSite(), msg) % change
        wikipedia.output(u'The summary message will default to: %s' % default_summary_message)
        summary_message = wikipedia.input(u'Press Enter to use this default message, or enter a description of the changes your bot will make:')
        if summary_message == '':
            summary_message = default_summary_message
        wikipedia.setAction(summary_message)
    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes[fix]
        except KeyError:
            wikipedia.output(u'Available predefined fixes are: %s' % fixes.keys())
            wikipedia.stopme()
            sys.exit()
        if 'regex' in fix:
            regex = fix['regex']
        if 'msg' in fix:
            wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), fix['msg']))
        # Work on copies, so that compiling the patterns below does not
        # modify the global 'fixes' table.
        if 'exceptions' in fix:
            exceptions = list(fix['exceptions'])
        replacements = list(fix['replacements'])

    # already compile all regular expressions here to save time later
    compiledReplacements = []
    for old, new in replacements:
        if not regex:
            old = re.escape(old)
        compiledReplacements.append((re.compile(old, re.UNICODE), new))
    replacements = compiledReplacements
    compiledExceptions = []
    for exception in exceptions:
        if not regex:
            exception = re.escape(exception)
        compiledExceptions.append(re.compile(exception, re.UNICODE))
    exceptions = compiledExceptions

    if xmlFilename:
        gen = XmlDumpReplacePageGenerator(xmlFilename, replacements, exceptions)
    elif useSql:
        # NOTE(review): the query is assembled by string formatting; the
        # patterns come from the bot operator's command line or from
        # 'fixes', and only prepareRegexForMySQL() sanitizes them.
        whereClause = 'WHERE (%s)' % ' OR '.join(["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern) for (old, new) in replacements])
        if exceptions:
            exceptClause = 'AND NOT (%s)' % ' OR '.join(["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern) for exc in exceptions])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)

    elif PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
        gen = iter(pages)

    if not gen:
        # syntax error, show help text from the top of this file. This file
        # has no module docstring (__doc__ is None), so fall back to a short
        # hint instead of passing None on.
        helpText = __doc__ or 'No pages to work on. Use one of -file, -cat, -xml, -sql, -page, -ref, -links, -start or -google.'
        wikipedia.output(helpText, 'utf-8')
        wikipedia.stopme()
        sys.exit()
    if namespaces != []:
        gen =  pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall)
    bot.run()


# Script entry point. wikipedia.stopme() sits in the finally clause, so it is
# called whether main() returns normally, raises, or exits via sys.exit().
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()