#! /usr/local/bin/python
# -*- coding: iso-8859-1 -*-
# Written by Ulf Göransson, ug@algonet.se
#
# Created when Wizards of the Coast decided to use their Gatherer
# card database to generate Oracle files. I think it does a poor job...
#
# Usage: oraclean.py [<file>]
#
# If no file is specified, the current "Vintage" list is retrieved
# from ww2.wizards.com.
#
# History:
# 2005-06-06 Fixes related to Saviors of Kamigawa
# 2005-11-08 Now handles hybrid mana costs from Ravnica correctly
# 2006-02-03 Guildpact related changes
# 2006-04-24 Dissension related changes
# 2006-05-08 Added HTTP and primitive HTML parsing
# 2006-07-11 Treat XMP the same as PRE, remove leading blank lines
# Coldsnap: Recover, snow mana and alternative CU
# New bugs: T without colon.
# 2006-07-16 Worked through the mana filter again.
# New URL
# 2006-08-19 Output file name fix
# 2006-09-26 Time Spiral: Suspend, Echo with cost and a new land pattern.
# 2007-01-27 Planar Chaos: "Suspend X", "has echo X".
# 2007-03-11 Now downloads "Vintage" list; downloading all cards doesn't work anymore.
# 2007-05-05 Future Sight: Aura swap, Fortify and Transfigure.
# 2007-10-01 Lorwyn: Evoke, Planeswalker's casting cost like value.
# 2008-01-22 Morningtide: Prowl, Reinforce.
# New HTML format (DIV & BR instead of XMP)
# 2008-07-06 Shadowmoor: Hybrid costs with digit, untap symbol
# 2008-09-30 Shards of Alara: Unearth
# 2009-01-31 Conflux: Worldheart Phoenix activation cost fix
#
from __future__ import generators # Needed because I run an old version
import os, sys
import string, re
import time
import urllib
import sgmllib
# A very primitive SGML parser that locates and extracts the oracle text
# from an HTML file written by Gatherer.
# It will also work on a pure text file.
class PreTextExtractor(sgmllib.SGMLParser):
def __init__(self, fp):
sgmllib.SGMLParser.__init__(self)
self.fp = fp
self.echo = 1 # Start with echo on, works fine with pure text files
self.started = 0 # Don't echo just yet, though...
self.lines = [""]
self.entitydefs = {}
self.verbose = 0
def start_pre(self, attrs):
# We found a <pre> tag, start echoing
if self.verbose:
print "start_pre", attrs
self.echo = 1
def end_pre(self):
# End of pre
if self.verbose:
print "end_pre"
self.echo = 0
def start_xmp(self, attrs):
# We found an <xmp> tag, start echoing
if self.verbose:
print "start_xmp", attrs
self.echo = 1
def end_xmp(self):
# End of xmp
if self.verbose:
print "end_xmp"
self.echo = 0
def start_div(self, attrs):
# We found a <div> tag, look further
if self.verbose:
print "start_div", attrs
for attr, val in attrs:
if attr == "style" and val.find("Courier") >= 0:
if self.verbose:
print "Found matching div, enable echo"
# This ought to be the plain text part, start echoing
self.echo = 1
def end_div(self):
# End of div
if self.verbose:
print "end_div"
self.echo = 0
def start_html(self, attrs):
# We're not in a text file, turn echo off
if self.verbose:
print "start_html", attrs
self.echo = 0
def do_br(self, attrs):
if self.echo:
# We found a break in our "plain" text, start a new line
self.lines[-1] += '\n'
self.lines.append("")
# Take care of entity references
def unknown_entityref(self, name):
self.handle_data('&'+name)
def handle_data(self, data):
if self.echo:
self.lines[-1] += data
def __iter__(self):
# Iterate over all complete, echoable lines
lineno = 0
for line in self.fp.readlines():
# De-evolve a lonely break tag
line = line.replace('<br/>', '<br>')
self.feed(line)
self.goahead(0)
while self.lines[0] and self.lines[0][-1] in "\r\n":
# Complete line, give it back
line = self.lines[0]
del self.lines[0]
if line.strip() or self.started:
# ...provided that it isn't a leading blank
yield line
self.started = 1
if len(self.lines) == 0:
self.lines = [""]
# Names of the two regular expression groups that get special treatment
# when costs are converted.
CASTCOST = "cc"
HYBRID = "hy"

# Pick the input (a file named on the command line, or else the Gatherer
# URL) and open the matching "-cleaned.txt" output file.
if len(sys.argv) != 1:
    # A file was given, either HTML or text
    ora = sys.argv[1]
    fp = open(ora)
    out_name = os.path.splitext(ora)[0]+"-cleaned.txt"
    out = open(out_name, "w")
else:
    # Nothing on the command line, go for the URL
    fp = urllib.urlopen("http://ww2.wizards.com/gatherer/index.aspx"+
                        "?setfilter=Vintage&output=Oracle+Spoiler")
    out_name = time.strftime("gatherer-vin-%Y-%m-%d-cleaned.txt")
    out = open(out_name, "w")
# Reg-exp excesses and other filters
#
# Every alternative below describes one context in which a mana cost (or
# a tap/untap symbol) shows up in the text.  The capturing groups mark
# the part that is to be rewritten in {}-notation; the surrounding words
# are only there to avoid false hits.  Alternatives are tried in the
# order they are written, so the order matters.

# A run of mana symbols such as "2RR" or "XG"
mana_pattern = "[\dXYZWUBRGS]+"
# One hybrid symbol such as "(w/u)" or "(2/R)"
hybrid_pattern = "\([\dwubrgWUBRG] ?/ ?[\dwubrgWUBRG]\)"
mana_group = "("+mana_pattern+")"

# Costs that appear anywhere in the rules text.  The CASTCOST group
# matches a line made up of nothing but mana symbols, the HYBRID group
# a single hybrid symbol.
pattern = ("(?:(?P<"+CASTCOST+">^"+mana_group+"$))|"+ # Casting cost
           "(?:^"+mana_group+" // "+mana_group+"$)|"+
           "(?:mana cost is "+mana_group+",)|"+
           "(?:with mana cost "+mana_group+",)|"+
           "(?:with "+mana_group+" in their)|"+
           "(?: "+mana_group+" was spent)|"+
           "(?:paid with either "+mana_group+" or "+mana_group+")|"+
           "(?:"+mana_group+", (T)[,:])|"+ # Activation cost
           "(?:^"+mana_group+"[,:])|"+
           "(?:[\"\(]"+mana_group+"[,:])|"+
           "(?:-- ?"+mana_group+"[,:])|"+
           "(?:[Aa]dds? "+mana_group+" to)|"+ # Mana abilities
           "(?:Add "+mana_group+" or "+mana_group+")|"+
           "(?:Add "+mana_group+", "+mana_group+", or "+mana_group+")|"+
           "(?:add up to "+mana_group+" to)|"+
           "(?:add that much "+mana_group+" to)|"+
           "(?:either "+mana_group+" or "+mana_group+" to)|"+
           "(?:amount of "+mana_group+" )|"+
           "(?:of "+mana_group+" and/or "+mana_group+" )|"+
           "(?:pay(?:ing)? "+mana_group+" rather)|"+ # Alternative
           "(?:pay "+mana_group+" or "+mana_group+" to)|"+
           "(?:pays? "+mana_group+" or "+mana_group+"\.)|"+
           "(?:pays? "+mana_group+" or)|"+
           "(?:produce "+mana_group+")|"+
           "(?:"+mana_group+" can be paid)|"+
           "(?:any two mana or with "+mana_group+")|"+
           "(?:additional "+mana_group+" to)|"+ # Additions & penalties
           "(?:additional "+mana_group+" as)|"+
           "(?:additional "+mana_group+" and/or "+mana_group+" )|"+
           "(?:additional "+mana_group+" for)|"+
           "(?:additional "+mana_group+" you)|"+
           "(?:additional "+mana_group+"\.)|"+
           "(?:pay "+mana_group+" and/or "+mana_group+" )|"+
           # obsolete "(?:pay "+mana_group+" in addition)|"+
           "(?:pay "+mana_group+" more)|"+
           "(?:pay up to "+mana_group+"\.)|"+
           "(?:[Pp]ays? "+mana_group+" an)|"+
           "(?:pays? "+mana_group+"[\.,])|"+
           "(?:[Pp]ays? "+mana_group+" for each)|"+
           "(?:pays only "+mana_group+" for)|"+
           "(?:pays "+mana_group+" before)|"+
           "(?:each "+mana_group+" in)|"+
           "(?:for each "+mana_group+" or "+mana_group+" spent)|"+
           "(?:costs? "+mana_group+" more)|"+
           "(?:mana less than "+mana_group+")|"+
           "(?:costs? "+mana_group+" less)|"+ # Reductions
           "(?:up to "+mana_group+" less)|"+
           "(?:costs "+mana_group+",)|"+
           "(?:costs? "+mana_group+" to play)|"+
           "(?:reduced by "+mana_group+")|"+
           "(?:reduced by up to "+mana_group+"\.)|"+
           "(?:reduces its cost by "+mana_group+")|"+
           "(?:produces "+mana_group+" instead)|"+ # Changes
           "(?:([\dX]+)o?"+hybrid_pattern+")|"+ # Hybrid
           "(?:(?P<"+HYBRID+">o?"+hybrid_pattern+"))|"+
           "(?:^(T)[:,])|"+ # Tap
           "(?:[-\",] ?(T)[:,])|"+
           "(?:[\-\",] ?(T)[:,])|"[0:0]+
           "(?: (T) in their costs)|"+
           "(?: (T) in its activation cost)|"+
           "(?:,? ?(o?Q)[:,])|"+ # Untap
           "(?:(Q) is the untap)")

# Costs that follow a keyword.
#
# The first two alternatives only exist so that the CASTCOST and HYBRID
# groups are defined in this pattern as well; the conversion loop asks
# every match for both of them.  NOTE: "\b" in a non-raw string is a
# backspace character, not the regex word boundary, so these two can
# only match a line that starts with a backspace.  Do not turn it into
# r"\b": an empty match at the start of the line would then win over
# all the "^Keyword" alternatives below.
pattern2 = ("(?P<"+CASTCOST+">^(\b))|"+
            "(?P<"+HYBRID+">^(\b))|"+
            "(?:^Buyback "+mana_group+")|"+ # Extras
            "(?:Cumulative upkeep "+mana_group+" or "+mana_group+")|"+
            "(?:Cumulative upkeep "+mana_group+")|"+
            "(?:Splice onto Arcane "+mana_group+")|"+
            "(?:[Cc]ycling "+mana_group+")|"+
            "(?:^Entwine "+mana_group+")|"+
            "(?:^Equip "+mana_group+")|"+
            # The cost must be followed by a blank or the end of the line,
            # which keeps e.g. the "S" of "Flashback--Sacrifice" from being
            # taken for mana.  The "blank or end" choice needs a group of
            # its own; a bare " |$" splits the whole alternative instead and
            # a cost at the very end of a line is never converted.
            "(?:^Flashback[- ]{1,2}"+mana_group+"(?: |$))|"+
            "(?:^Kicker "+mana_group+" and/or "+mana_group+")|"+
            "(?:^Kicker "+mana_group+")|"+
            "(?:the "+mana_group+" kicker cost)|"+
            "(?:^Madness "+mana_group+")|"+
            "(?:^Morph "+mana_group+")|"+
            "(?:2/2 creature for (3))|"+
            "(?:^Ninjutsu "+mana_group+")|"+
            "(?:^Transmute "+mana_group+")|"+
            "(?:^Replicate "+mana_group+")|"+
            "(?:^Recover "+mana_group+")|"+
            "(?:^Echo "+mana_group+")|"+
            "(?:has echo "+mana_group+")|"+
            "(?:^Suspend [\dX]+-"+mana_group+")|"+
            "(?:^Aura swap "+mana_group+")|"+
            "(?:^Fortify "+mana_group+")|"+
            "(?:^Transfigure "+mana_group+")|"+
            "(?:^Evoke "+mana_group+")|"+
            "(?:^Prowl "+mana_group+")|"+
            "(?:^Reinforce [\dX]+\-"+mana_group+")|"+
            "(?:^Unearth "+mana_group+")|"+
            "(?:has unearth "+mana_group+")")

# The keyword pattern is applied to a line first, the general one second
cost_matchers = [re.compile(pattern2), re.compile(pattern)]
# Splits a mana string into its symbols: a whole number or a single letter
cost_splitter = re.compile("(\d+)|(\w)")
# Plain (old, new) string replacements.  The conversion loop applies them
# to every line one after the other, in the order listed here, so a later
# entry sees the result of the earlier ones.
#
# NOTE(review): several of the non-ASCII literals below depend on the
# encoding of this source file (declared as iso-8859-1 at the top).  Check
# the raw bytes before editing them.
swaps = [(' - ', ' -- '),
('upkeep-', 'upkeep -- '),
('Buyback-', 'Buyback--'),
('Entwine-', 'Entwine--'),
('Equip-', 'Equip--'),
('Flashback-', 'Flashback--'),
('Kicker-', 'Kicker--'),
('Morph-', 'Morph--'),
('oG', 'G:'),
('oR', 'R:'),
('ocT', 'T'),
("T in","{T} in"),
("T or","{T} or"),
# Adds the colon to a T that lacks one (see "T without colon" in the history)
('T ', 'T: '),
('oSi', 'S'),
('oQ', 'Q'),
('Æ', 'AE'),
('/ /','//'),
('\x97','--'),
# NOTE(review): '\x8a0' is the byte \x8a followed by the digit 0 -- possibly
# a typo for '\xa0'; confirm against real input before changing it.
('\x8a0',' '),
('\xc2\xae','(R)'),
('\xc2 ',' '),
('\xae','(R)'),
('\xc3 ','à'),
('\xc3\xa9','é'),
('—','--'),
('‘',"'"),
('’',"'"),
('“','"'),
('”','"'),
('<I>',''),
('</I>',''),
# NOTE(review): as written this entry replaces '&' by itself and has no
# effect; presumably the first string was an entity name once -- verify.
('&','&')]
# Single character translation table for str.translate(): accented vowels
# become plain ones, \x92 an apostrophe, \x93/\x94 a double quote and \xa0
# a blank (presumably the Windows-1252 quotes and the no-break space).
# maketrans() needs both strings to have the same length, which only holds
# if every accented letter is stored as a single byte.
trans = string.maketrans('áàâäéèêëíìîïóòôöúùûü\x92\x93\x94\xa0',
'aaaaeeeeiiiioooouuuu\'"" ')
# Characters that str.translate() removes altogether
delchars = '\x81'
# Set once the casting cost line of the current entry has been converted;
# cleared again at every blank line.
saw_casting_cost = 0
# Go through all lines and change whatever necessary.
# The try/finally makes sure both files get closed even if the download,
# the parsing or the writing fails half-way.
try:
    for line in PreTextExtractor(fp):
        # Single character translation
        line = line.translate(trans, delchars).strip()
        # Simple string translation
        for old, new in swaps:
            line = line.replace(old, new)
        # Cost translation
        if not line:
            saw_casting_cost = 0
        for rex in cost_matchers:
            # Rebuild the line piece by piece: text between the matched
            # groups is copied, the groups themselves are rewritten.
            start = 0
            new_line = ""
            while 1:
                mat = rex.search(line, start)
                start0 = start
                if mat:
                    if mat.group(CASTCOST):
                        # Casting cost. Or is it?
                        if saw_casting_cost:
                            # No, we've seen casting cost already.
                            # This must be this Planeswalker value thing...
                            new_line += line
                        else:
                            # Yep, it is a casting cost alright. Convert and save!
                            cost = '{'+'}{'.join(filter(None, cost_splitter.split(line)))+'}'
                            new_line += cost
                            saw_casting_cost = 1
                        # The whole line has been taken care of
                        start = len(line)
                    elif mat.group(HYBRID):
                        # Special case for hybrid spell mana costs:
                        # drop blanks and a leading "o", keep the three
                        # characters between the parentheses, upper-cased
                        new_line += line[start:mat.start(HYBRID)]
                        cost = '{'+mat.group(HYBRID).replace(' ', '')[-4:-1].upper()+'}'
                        new_line += cost
                        start = mat.end(HYBRID)
                    else:
                        # Convert every group that took part in the match
                        for grix in [i for i in range(1, len(mat.groups())+1) if mat.group(i)]:
                            new_line += line[start:mat.start(grix)]
                            cost = '{'+'}{'.join(filter(None, cost_splitter.split(mat.group(grix))))+'}'
                            new_line += cost
                            start = mat.end(grix)
                if start == start0:
                    # No progress: nothing (more) to convert, copy the rest
                    new_line += line[start:]
                    break
            line = new_line
        out.write(line+'\n')
finally:
    try:
        fp.close()
    finally:
        out.close()
# Automatically generated 2014-10-26 by show.py