ug's Python CGI scripts: oraclean.py


Last modified: 2009-01-31 21.11.34
#! /usr/local/bin/python
# -*- coding: iso-8859-1 -*-
# Written by Ulf Göransson, ug@algonet.se
#
# Created when Wizards of the Coast decided to use their Gatherer
# card database to generate Oracle files. I think it does a poor job...
#
# Usage: oraclean.py [<file>]
#
# If no file is specified, the current "Vintage" list is retrieved
# from ww2.wizards.com.
#
# History:
# 2005-06-06 Fixes related to Saviors of Kamigawa
# 2005-11-08 Now handles hybrid mana costs from Ravnica correctly
# 2006-02-03 Guildpact related changes
# 2006-04-24 Dissension related changes
# 2006-05-08 Added HTTP and primitive HTML parsing
# 2006-07-11 Treat XMP the same as PRE, remove leading blank lines
#	     Coldsnap: Recover, snow mana and alternative CU
#	     New bugs: T without colon.
# 2006-07-16 Worked through the mana filter again.
#	     New URL
# 2006-08-19 Output file name fix
# 2006-09-26 Time Spiral: Suspend, Echo with cost and a new land pattern.
# 2007-01-27 Planar Chaos: "Suspend X", "has echo X".
# 2007-03-11 Now downloads "Vintage" list; all cards doesn't work anymore.
# 2007-05-05 Future Sight: Aura swap, Fortify and Transfigure.
# 2007-10-01 Lorwyn: Evoke, Planeswalker's casting cost like value.
# 2008-01-22 Morningtide: Prowl, Reinforce.
#	     New HTML format (DIV & BR instead of XMP)
# 2008-07-06 Shadowmoor: Hybrid costs with digit, untap symbol
# 2008-09-30 Shards of Alara: Unearth
# 2009-01-31 Conflux: Worldheart Phoenix activation cost fix
#

from __future__ import generators	# Needed because I run an old version

import os, sys
import string, re
import time
import urllib
import sgmllib

# A very primitive SGML parser that locates and extracts the oracle text
# from an HTML file written by Gatherer.
# It will also work on a pure text file.
class PreTextExtractor(sgmllib.SGMLParser):
    def __init__(self, fp):
        sgmllib.SGMLParser.__init__(self)
        self.fp = fp
        self.echo = 1	# Start with echo on, works fine with pure text files
        self.started = 0	# Don't echo just yet, though...
        self.lines = [""]
        self.entitydefs = {}
        self.verbose = 0

    def start_pre(self, attrs):
        # We found a <pre> tag, start echoing
        if self.verbose:
            print "start_pre", attrs
        self.echo = 1

    def end_pre(self):
        # End of pre
        if self.verbose:
            print "end_pre"
        self.echo = 0

    def start_xmp(self, attrs):
        # We found an <xmp> tag, start echoing
        if self.verbose:
            print "start_xmp", attrs
        self.echo = 1

    def end_xmp(self):
        # End of xmp
        if self.verbose:
            print "end_xmp"
        self.echo = 0

    def start_div(self, attrs):
        # We found a <div> tag, look further
        if self.verbose:
            print "start_div", attrs
        for attr, val in attrs:
            if attr == "style" and val.find("Courier") >= 0:
                if self.verbose:
                    print "Found matching div, enable echo"
                # This ought to be the plain text part, start echoing
                self.echo = 1

    def end_div(self):
        # End of div
        if self.verbose:
            print "end_div"
        self.echo = 0

    def start_html(self, attrs):
        # We're not in a text file, turn echo off
        if self.verbose:
            print "start_html", attrs
        self.echo = 0

    def do_br(self, attrs):
        if self.echo:
            # We found a break in our "plain" text, start a new line
            self.lines[-1] += '\n'
            self.lines.append("")

    # Take care of entity references
    def unknown_entityref(self, name):
	self.handle_data('&'+name)

    def handle_data(self, data):
        if self.echo:
            self.lines[-1] += data

    def __iter__(self):
        # Iterate over all complete, echoable lines
        lineno = 0
        for line in self.fp.readlines():
            # De-evolve a lonely break tag
            line = line.replace('<br/>', '<br>')
            self.feed(line)
            self.goahead(0)
            while self.lines[0] and self.lines[0][-1] in "\r\n":
                # Complete line, give it back
                line = self.lines[0]
                del self.lines[0]
                if line.strip() or self.started:
                    # ...provided that it isn't a leading blank
                    yield line
                    self.started = 1
                if len(self.lines) == 0:
                    self.lines = [""]

# Names of the two regular expression groups that need special treatment
# when costs are rewritten: a line that is a casting cost, and a hybrid
# mana symbol.
CASTCOST, HYBRID = "cc", "hy"

# Pick the input (fp) and the output file (out)
if len(sys.argv) != 1:
    # A file was named on the command line, either HTML or text.
    # The result ends up next to it as <name>-cleaned.txt
    ora = sys.argv[1]
    fp = open(ora)
    out = open("%s-cleaned.txt" % os.path.splitext(ora)[0], "w")
else:
    # No file, fetch the Vintage list from the URL and
    # name the output file after today's date
    fp = urllib.urlopen("http://ww2.wizards.com/gatherer/index.aspx"
                        "?setfilter=Vintage&output=Oracle+Spoiler")
    out = open(time.strftime("gatherer-vin-%Y-%m-%d-cleaned.txt"), "w")

# Reg-exp excesses and other filters
#
# "pattern" is one long alternation. In each alternative the capturing
# group(s) mark the text that is to be rewritten with braces; whatever
# surrounds a group is only there as context. The alternatives are tried
# in the order listed, so do not reorder them.
# In the templates below %(m)s stands for a captured run of mana symbols,
# %(h)s for a hybrid symbol, %(cc)s and %(hy)s for the special group names.
mana_pattern = r"[\dXYZWUBRGS]+"
hybrid_pattern = r"\([\dwubrgWUBRG] ?/ ?[\dwubrgWUBRG]\)"
mana_group = "(%s)" % mana_pattern
pattern = "|".join([
    # Casting cost
    "(?:(?P<%(cc)s>^%(m)s$))",
    "(?:^%(m)s // %(m)s$)",
    "(?:mana cost is %(m)s,)",
    "(?:with mana cost %(m)s,)",
    "(?:with %(m)s in their)",
    "(?: %(m)s was spent)",
    "(?:paid with either %(m)s or %(m)s)",
    # Activation cost
    "(?:%(m)s, (T)[,:])",
    "(?:^%(m)s[,:])",
    r'(?:["\(]%(m)s[,:])',
    "(?:-- ?%(m)s[,:])",
    # Mana abilities
    "(?:[Aa]dds? %(m)s to)",
    "(?:Add %(m)s or %(m)s)",
    "(?:Add %(m)s, %(m)s, or %(m)s)",
    "(?:add up to %(m)s to)",
    "(?:add that much %(m)s to)",
    "(?:either %(m)s or %(m)s to)",
    "(?:amount of %(m)s )",
    "(?:of %(m)s and/or %(m)s )",
    # Alternative
    "(?:pay(?:ing)? %(m)s rather)",
    "(?:pay %(m)s or %(m)s to)",
    r"(?:pays? %(m)s or %(m)s\.)",
    "(?:pays? %(m)s or)",
    "(?:produce %(m)s)",
    "(?:%(m)s can be paid)",
    "(?:any two mana or with %(m)s)",
    # Additions & penalties
    "(?:additional %(m)s to)",
    "(?:additional %(m)s as)",
    "(?:additional %(m)s and/or %(m)s )",
    "(?:additional %(m)s for)",
    "(?:additional %(m)s you)",
    r"(?:additional %(m)s\.)",
    "(?:pay %(m)s and/or %(m)s )",
    # obsolete: "(?:pay %(m)s in addition)",
    "(?:pay %(m)s more)",
    r"(?:pay up to %(m)s\.)",
    "(?:[Pp]ays? %(m)s an)",
    r"(?:pays? %(m)s[\.,])",
    "(?:[Pp]ays? %(m)s for each)",
    "(?:pays only %(m)s for)",
    "(?:pays %(m)s before)",
    "(?:each %(m)s in)",
    "(?:for each %(m)s or %(m)s spent)",
    "(?:costs? %(m)s more)",
    "(?:mana less than %(m)s)",
    # Reductions
    "(?:costs? %(m)s less)",
    "(?:up to %(m)s less)",
    "(?:costs %(m)s,)",
    "(?:costs? %(m)s to play)",
    "(?:reduced by %(m)s)",
    r"(?:reduced by up to %(m)s\.)",
    "(?:reduces its cost by %(m)s)",
    # Changes
    "(?:produces %(m)s instead)",
    # Hybrid
    r"(?:([\dX]+)o?%(h)s)",
    "(?:(?P<%(hy)s>o?%(h)s))",
    # Tap
    "(?:^(T)[:,])",
    '(?:[-",] ?(T)[:,])',
    "(?: (T) in their costs)",
    "(?: (T) in its activation cost)",
    # Untap
    "(?:,? ?(o?Q)[:,])",
    "(?:(Q) is the untap)",
]) % {"cc": CASTCOST, "hy": HYBRID, "m": mana_group, "h": hybrid_pattern}
           
# "pattern2" handles the keyword abilities that are followed by a cost.
# As in "pattern", the capturing groups mark the text to put in braces and
# %(m)s stands for a captured run of mana symbols.
#
# The first two alternatives are dummies. The main loop asks every matcher
# for the CASTCOST and HYBRID groups, so the names must exist here as well,
# but they must never match: "\x08" is a literal backspace character (what
# "\b" means in a non-raw string), NOT the regular expression word boundary.
#
# Flashback: the cost must be followed by a space or the end of the line,
# which keeps e.g. the S in "Flashback--Sacrifice" from being taken for
# mana. The "(?: |$)" has to be a group of its own; written as a bare
# " |$" it made end-of-line an alternative to the whole Flashback rule,
# so a Flashback cost ending the line was never converted.
pattern2 = "|".join([
    "(?P<%(cc)s>^(\x08))",
    "(?P<%(hy)s>^(\x08))",
    # Extras
    "(?:^Buyback %(m)s)",
    "(?:Cumulative upkeep %(m)s or %(m)s)",
    "(?:Cumulative upkeep %(m)s)",
    "(?:Splice onto Arcane %(m)s)",
    "(?:[Cc]ycling %(m)s)",
    "(?:^Entwine %(m)s)",
    "(?:^Equip %(m)s)",
    "(?:^Flashback[- ]{1,2}%(m)s(?: |$))",
    "(?:^Kicker %(m)s and/or %(m)s)",
    "(?:^Kicker %(m)s)",
    "(?:the %(m)s kicker cost)",
    "(?:^Madness %(m)s)",
    "(?:^Morph %(m)s)",
    "(?:2/2 creature for (3))",
    "(?:^Ninjutsu %(m)s)",
    "(?:^Transmute %(m)s)",
    "(?:^Replicate %(m)s)",
    "(?:^Recover %(m)s)",
    "(?:^Echo %(m)s)",
    "(?:has echo %(m)s)",
    r"(?:^Suspend [\dX]+-%(m)s)",
    "(?:^Aura swap %(m)s)",
    "(?:^Fortify %(m)s)",
    "(?:^Transfigure %(m)s)",
    "(?:^Evoke %(m)s)",
    "(?:^Prowl %(m)s)",
    r"(?:^Reinforce [\dX]+\-%(m)s)",
    "(?:^Unearth %(m)s)",
    "(?:has unearth %(m)s)",
]) % {"cc": CASTCOST, "hy": HYBRID, "m": mana_group}

# The compiled matchers, in the order they are applied to each line:
# keyword abilities (pattern2) first, then everything else (pattern).
cost_matchers = [re.compile(pattern2), re.compile(pattern)]
# Splits a cost into its symbols: a run of digits is one symbol, any other
# word character is a symbol of its own.
cost_splitter = re.compile("(\d+)|(\w)")
# Plain (old, new) string replacements. They are applied one after the
# other in list order, so a later entry sees the result of the earlier ones.
swaps = [(' - ', ' -- '),
         ('upkeep-', 'upkeep -- '),
         ('Buyback-', 'Buyback--'),
         ('Entwine-', 'Entwine--'),
         ('Equip-', 'Equip--'),
         ('Flashback-', 'Flashback--'),
         ('Kicker-', 'Kicker--'),
         ('Morph-', 'Morph--'),
         ('oG', 'G:'),
         ('oR', 'R:'),
         ('ocT', 'T'),
         ("T in","{T} in"),
         ("T or","{T} or"),
         ('T ', 'T: '),
         ('oSi', 'S'),
         ('oQ', 'Q'),
         ('Æ', 'AE'),
         ('/  /','//'),
         # Stray high bytes; presumably Windows-1252 or UTF-8 leftovers
         ('\x97','--'),
         # NOTE(review): this is the byte \x8a followed by the character '0',
         # not a single byte -- confirm that is what was intended.
         ('\x8a0',' '),
         ('\xc2\xae','(R)'),
         # trans has already turned \xa0 into a space when the swaps run,
         # so this entry and the '\xc3 ' one also match former \xc2\xa0
         # and \xc3\xa0 pairs.
         ('\xc2 ',' '),
         ('\xae','(R)'),
         ('\xc3 ','à'),
         ('\xc3\xa9','é'),
         # Character and entity references left in the text
         ('&#8212;','--'),
         ('&#8216;',"'"),
         ('&#8217;',"'"),
         ('&#8220;','"'),
         ('&#8221;','"'),
         ('&lt;I&gt;',''),
         ('&lt;/I&gt;',''),
         ('&amp;','&')]
# Single character translation table: accented vowels lose their accent,
# \x92 becomes ', \x93 and \x94 become " and \xa0 becomes a space.
# maketrans needs two strings of equal length; that holds as long as every
# accented letter is a single byte, as it is in the iso-8859-1 coding this
# file declares.
trans = string.maketrans('áàâäéèêëíìîïóòôöúùûü\x92\x93\x94\xa0',
                         'aaaaeeeeiiiioooouuuu\'"" ')
# Characters that are removed altogether by the translate() call
delchars = '\x81'

def _braces(symbols):
    """Return symbols with every number or letter put in braces.

    A run of digits counts as one symbol, e.g. "12WW" gives "{12}{W}{W}".
    """
    return '{'+'}{'.join(filter(None, cost_splitter.split(symbols)))+'}'

# Set when a casting cost line has been converted, cleared by an empty line
# (presumably the separator between two cards). A second line that looks
# like a casting cost before the next empty line is left alone.
saw_casting_cost = 0
# Go through all lines and change whatever necessary.
# The try/finally makes sure both files get closed even if something
# goes wrong halfway through.
try:
    for line in PreTextExtractor(fp):
        # Single character translation
        line = line.translate(trans, delchars).strip()
        # Simple string translation
        for old, new in swaps:
            line = line.replace(old, new)
        # Cost translation
        if not line:
            saw_casting_cost = 0
        for rex in cost_matchers:
            # Rebuild the line piece by piece: text up to a matched group is
            # copied, the group itself is replaced, and the search goes on
            # from the end of the group.
            start = 0
            new_line = ""
            while 1:
                mat = rex.search(line, start)
                start0 = start
                if mat:
                    if mat.group(CASTCOST):
                        # Casting cost. Or is it?
                        # (This group only matches a whole line.)
                        if saw_casting_cost:
                            # No, we've seen casting cost already.
                            # This must be this Planeswalker value thing...
                            new_line += line
                        else:
                            # Yep, it is a casting cost alright. Convert and save!
                            new_line += _braces(line)
                            saw_casting_cost = 1
                        start = len(line)
                    elif mat.group(HYBRID):
                        # Special case for hybrid spell mana costs: keep the
                        # three characters inside the parentheses, e.g.
                        # "o(w/u)" becomes "{W/U}"
                        new_line += line[start:mat.start(HYBRID)]
                        cost = '{'+mat.group(HYBRID).replace(' ', '')[-4:-1].upper()+'}'
                        new_line += cost
                        start = mat.end(HYBRID)
                    else:
                        # Convert every group that matched something
                        for grix in [i for i in range(1, len(mat.groups())+1) if mat.group(i)]:
                            new_line += line[start:mat.start(grix)]
                            new_line += _braces(mat.group(grix))
                            start = mat.end(grix)
                if start == start0:
                    # No progress, so nothing more to convert.
                    # Copy the rest of the line and leave.
                    new_line += line[start:]
                    break
            line = new_line
        out.write(line+'\n')
finally:
    try:
        fp.close()
    finally:
        out.close()

/ug Home Up Python Powered
Ulf Göransson <ug@algonet.se>

Automatically generated 2014-10-26 by show.py