Utilisateur:FtiercelBot/fr-conj-ir.py
fr-conj-ir.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot goes over pages of the French Wiktionary conjugation annex
(titles of the form "Annexe:Conjugaison française:<verb>" where the verb
ends in -ir), looks at the {{fr-conj-2}} template on each of them, and
records every verb whose template call still uses one of the outdated
parameters v1, v2, c1, c2, p.v1, p.v2, ill, j or e.

The list of verbs to fix is written to a timestamped file named
./articlesToChange-<hour>-<minute>-<second>.txt in the working directory.

This script understands various command-line arguments:

-start:   used as -start:page_name, specifies that the robot should
          go alphabetically through all pages on the home wiki,
          starting at the named page.

-file:    used as -file:file_name, read a list of pages to treat
          from the named text file. Page titles should be enclosed
          in [[double-square brackets]].

-ref:     used as -ref:page_name, specifies that the robot should
          work on all pages referring to the named page.

-links:   used as -links:page_name, specifies that the robot should
          work on all pages linked from the named page.

-cat:     specifies that the robot should work on the pages of the
          categories listed in ./cat.txt (the category name given on
          the command line is currently ignored).

All other parameters will be regarded as a page title; in this case, the bot
will only work on that single page.
"""
import wikipedia, wiktionary, pagegenerators, catlib
import sys
import re
import time
#endings = [<ms>, <fs>, <mp>, <fp>]
#endings = [u'', u'e', u's', u'es']
#pronEnding = [u'', u'', u'', u'']
heading = u'Annexe:Conjugaison française:'
ending = u'ir'
templateName = u'fr-conj-2'
alphanum = [u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', u'k', u'l', u'm', u'n', u'o', u'p', u'q', u'r', u's', u't', u'u', u'v', u'w', u'x', u'y', u'z', u'0', u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9']
aList = [u'@', u'à', u'À', u'â', u'Â', u'ä', u'Ä']
cList = [u'ç']
eList = [u'€', u'é', u'É', u'è', u'È', u'ê', u'Ê', u'ë', u'Ë']
iList = [u'ï', u'Ï', u'î', u'Î']
nList = [u'ñ']
oList = [u'ô', u'Ô']
uList = [u'ù', u'Ù', u'û', u'Û', u'ü', u'Ü']
quoteList = [u'’', u'\'', u'\`', u'\"', u'\\', u'\/']
dotList = [u' ', u'.', u'&', u'~', u'{', u'(', u'[', u'-', u'|', u'_', u'^', u')', u']', u'=', u'°', u'+', u'=', u'}', u'£', u'$', u'¤', u'%', u'µ', u'*', u'?', u',', u';', u':', u'§', u'!', u'<', u'>']
commentCompiler = re.compile( u"\<\!\-\-(.*?)\-\-\>", re.DOTALL | re.MULTILINE)
templateCompiler = re.compile(u'\{\{' + templateName + u' *\r?\n?\| *.*?\}\}', re.DOTALL)
flexionCompiler = re.compile(u'\{\{' + templateName + u' *\r?\n?\| *(.*?)\}\}', re.DOTALL)
flexionParser = re.compile(u'\{\{' + templateName + u'.*?\}\}', re.DOTALL)
splitCompiler = re.compile(u'\|')
parameterCompiler = re.compile(u'^(.*?) *\= *(.*?)$')
spacesCompiler = re.compile(u'\s+')
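# Rough illustration of what the expressions above are meant to match; the
# sample {{fr-conj-2}} call below is made up and may not reflect the real
# template's full parameter list:
#
#   {{fr-conj-2|fin
#   |cat=finir
#   }}
#
# templateCompiler finds the whole call, flexionCompiler captures everything
# between the template name and the closing braces ("fin\n|cat=finir\n"),
# splitCompiler cuts that capture on "|" into the stem "fin" and the remaining
# parameters, and parameterCompiler separates each "name=value" pair such as
# "cat=finir".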
class KeyBot:
    def __init__(self, generator, acceptall = False):
        self.generator = generator
        self.acceptall = acceptall

    def run(self):
        for page in self.generator:
            try:
                hasBadParameters = False
                hasStrangeParameter = False
                # Strip the "Annexe:Conjugaison française:" heading from the
                # page title to get the verb, then strip the -ir ending to get
                # the stem (e.g. "finir" gives word = "finir", wordBase = "fin").
                word = page.title()
                while len(word) > 0 and heading + word != page.title():
                    word = word[1:]
                wordBase = word
                while len(wordBase) > 0 and wordBase + ending != word:
                    wordBase = wordBase[:-1]
                if wordBase + ending == word:
                    wikipedia.output(u'page: %s' % page.title())
                    thePage = page.get()
                    theChangedPage = thePage # working copy that keeps the HTML comments
                    # remove <!-- ... --> comments before looking for templates
                    oldText = commentCompiler.sub(u'', thePage)
                    newText = oldText
                    templateList = templateCompiler.findall(newText)
##                    maxIteration = int(100)
##                    while flexion and maxIteration > 0:
                    for oldTemplate in templateList:
                        flexion = flexionCompiler.search(oldTemplate)
                        parameterList = splitCompiler.split(flexion.group(1))
                        # the first positional parameter is the verb stem
                        wordStart = parameterList[0].strip(u'\r\n ')
                        parameterList.pop(0)
                        # skip an empty parameter left by a doubled "|"
                        if len(parameterList) > 0:
                            nothing = parameterList[0].strip(u'\r\n ')
                            if nothing == u'':
                                parameterList.pop(0)
##
##                        pron = parameterList[1].strip(u'\r\n ')
                        # collect the remaining name=value parameters
                        parameters = {}
                        parameterIndex = []
                        for parameter in parameterList:
                            parameter = parameter.strip(u'\r\n ')
                            parameterElmnt = parameterCompiler.search(parameter)
                            if parameterElmnt:
                                parameterIndex.append(parameterElmnt.group(1))
                                parameters[parameterElmnt.group(1)] = parameterElmnt.group(2)
                            else:
                                hasStrangeParameter = True
                        if u'cat' not in parameters:
                            parameters[u'cat'] = word
                            parameterIndex.append(u'cat')
                        if u'v1' in parameters or u'v2' in parameters or u'c1' in parameters or u'c2' in parameters or u'p.v1' in parameters or u'p.v2' in parameters or u'ill' in parameters or u'j' in parameters or u'e' in parameters:
                            hasBadParameters = True
                    # record the verb so its conjugation page can be fixed later
                    if hasBadParameters:
                        wikipedia.output(u'################### %s ####################' % word)
                        theTitle = word
                        encodedTitle = theTitle.encode('utf-8')
                        outputFile.write(encodedTitle)
                        outputFile.write("\r\n")
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s does not exist?!?!' % page.aslink())
            except wikipedia.IsRedirectPage:
                pass
            except wikipedia.LockedPage:
                pass
def main():
    # page generator
    gen = None
    pageTitle = []
    for arg in wikipedia.handleArgs():
        if arg:
            if arg.startswith('-start:'):
                gen = pagegenerators.AllpagesPageGenerator(arg[7:])
            elif arg.startswith('-ref:'):
                referredPage = wikipedia.Page(wikipedia.getSite(), arg[5:])
                gen = pagegenerators.ReferringPageGenerator(referredPage)
            elif arg.startswith('-links:'):
                linkingPage = wikipedia.Page(wikipedia.getSite(), arg[7:])
                gen = pagegenerators.LinkedPageGenerator(linkingPage)
            elif arg.startswith('-file:'):
                gen = pagegenerators.TextfilePageGenerator(arg[6:])
            elif arg.startswith('-cat:'):
                # category names are read from ./cat.txt; the name given on the
                # command line is ignored, and only the generator built for the
                # last listed category is kept
                catGen = pagegenerators.TextfilePageGenerator('./cat.txt')
                catPreloadingGen = pagegenerators.PreloadingGenerator(catGen)
                for catPage in catPreloadingGen:
                    cat = catlib.Category(wikipedia.getSite(), catPage.title())
                    gen = pagegenerators.CategorizedPageGenerator(cat)
            else:
                pageTitle.append(arg)
    if pageTitle:
        page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        wikipedia.showHelp('touch')
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = KeyBot(preloadingGen)
        bot.run()
if __name__ == "__main__":
now = time.localtime()
filename = './articlesToChange-' + str(now.tm_hour) + '-' + str(now.tm_min) + '-' + str(now.tm_sec) + '.txt'
outputFile =open(filename, 'w')
try:
main()
finally:
wikipedia.stopme()
outputFile.close()
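# Purely illustrative example of the expected result: a run started at
# 14:03:07 would create ./articlesToChange-14-3-7.txt containing one verb
# per line (for instance "finir"), each one being a verb whose {{fr-conj-2}}
# call still uses the outdated parameters listed in the docstring above.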