fpdb/pyfpdb/fpdb_import.py

533 lines
23 KiB
Python
Raw Normal View History

2010-07-08 20:01:03 +02:00
#!/usr/bin/python
# -*- coding: utf-8 -*-
#Copyright 2008-2010 Steffen Schaumburg
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU Affero General Public License as published by
#the Free Software Foundation, version 3 of the License.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU Affero General Public License
#along with this program. If not, see <http://www.gnu.org/licenses/>.
#In the "official" distribution you can find the license in agpl-3.0.txt.
# Standard Library modules
import os # todo: remove this once import_dir is in fpdb_import
import sys
2009-11-30 15:08:30 +01:00
from time import time, strftime, sleep, clock
import traceback
import math
import datetime
import re
2009-07-31 22:24:21 +02:00
import Queue
from collections import deque # using Queue for now
import threading
import logging
# logging has been set up in fpdb.py or HUD_main.py, use their settings:
log = logging.getLogger("importer")
import pygtk
import gtk
2010-08-16 02:57:03 +02:00
import locale

# Install the translation function `_` used for every user-facing string.
#
# locale.getdefaultlocale() returns (None, None) when the locale cannot be
# determined (for example when LANG is unset or "C").  Slicing None raised a
# TypeError at import time, so fall back to English (untranslated) instead.
lang = (locale.getdefaultlocale()[0] or "en")[0:2]
if lang == "en":
    def _(string):
        """Identity translation: return the message unchanged."""
        return string
else:
    import gettext
    try:
        # install() binds the translated `_` into the builtins namespace.
        trans = gettext.translation("fpdb", localedir="locale", languages=[lang])
        trans.install()
    except IOError:
        # No catalogue for this language: keep messages untranslated.
        def _(string):
            """Identity translation: return the message unchanged."""
            return string
2010-08-16 02:57:03 +02:00
# fpdb/FreePokerTools modules
import Database
import Configuration
import Exceptions
2009-08-12 02:46:39 +02:00
# database interface modules
try:
import MySQLdb
2009-09-16 04:07:31 +02:00
except ImportError:
2010-08-16 02:57:03 +02:00
log.debug(_("Import database module: MySQLdb not found"))
else:
mysqlLibFound = True
try:
import psycopg2
2009-09-16 04:07:31 +02:00
except ImportError:
2010-08-16 02:57:03 +02:00
log.debug(_("Import database module: psycopg2 not found"))
else:
2009-07-31 22:24:21 +02:00
import psycopg2.extensions
2009-06-06 17:17:49 +02:00
psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
class Importer:
    """Import hand-history files into the fpdb database.

    Files and directories are registered with the ``add*`` methods and then
    processed either in one go (``runImport``, called from GuiBulkImport) or
    incrementally (``runUpdated``, called from GuiAutoImport).

    This module targets Python 2 (``Queue``, ``xrange``, ``unicode``).
    """

    def __init__(self, caller, settings, config, sql = None):
        """Store the collaborators, fill in default settings and open the
        database connections.

        caller   -- object that created the importer; ``runUpdated`` calls
                    ``caller.addText`` and ``import_file_dict`` writes to
                    ``caller.pipe_to_hud``.
        settings -- dict of import options; missing keys get defaults here.
        config   -- configuration object (``get_import_parameters()``,
                    ``hhcs`` are read from it).
        sql      -- passed through to ``Database.Database``.
        """
        self.settings = settings
        self.caller = caller
        self.config = config
        self.sql = sql

        self.filelist = {}            # filename -> [site, filter]
        self.dirlist = {}             # site -> [directory, filter]
        self.siteIds = {}             # site -> id from the Sites table
        self.addToDirList = {}        # directories found while importing
        self.removeFromFileList = {}  # to remove deleted files
        self.monitor = False
        self.updatedsize = {}
        self.updatedtime = {}
        self.lines = None
        self.faobs = None             # File as one big string
        self.pos_in_file = {}         # dict to remember how far we have read in the file
        # Initialised so getCachedHHC() returns None instead of raising
        # AttributeError when nothing has been cached yet.
        self.handhistoryconverter = None

        # Set defaults
        self.callHud = self.config.get_import_parameters().get("callFpdbHud")

        # CONFIGURATION OPTIONS
        self.settings.setdefault("minPrint", 30)
        self.settings.setdefault("handCount", 0)
        self.settings.setdefault("writeQSize", 1000)    # no need to change
        self.settings.setdefault("writeQMaxWait", 10)   # not used
        self.settings.setdefault("dropIndexes", "don't drop")
        self.settings.setdefault("dropHudCache", "don't drop")
        self.settings.setdefault("starsArchive", False)
        self.settings.setdefault("testData", False)
        self.settings.setdefault("cacheHHC", False)

        self.writeq = None
        self.database = Database.Database(self.config, sql = self.sql)
        self.writerdbs = []
        self.settings.setdefault("threads", 1)          # value set by GuiBulkImport
        for i in xrange(self.settings['threads']):
            self.writerdbs.append(Database.Database(self.config, sql = self.sql))

        clock()  # init clock in windows

    # ---- Set functions -------------------------------------------------

    def setCallHud(self, value):
        """Enable or disable forwarding of imported hands to the HUD."""
        self.callHud = value

    def setMinPrint(self, value):
        """Store ``value`` (converted to int) as the 'minPrint' setting."""
        self.settings['minPrint'] = int(value)

    def setHandCount(self, value):
        """Store ``value`` (converted to int) as the 'handCount' setting."""
        self.settings['handCount'] = int(value)

    def setQuiet(self, value):
        """Store the 'quiet' setting."""
        self.settings['quiet'] = value

    def setFailOnError(self, value):
        """Store the 'failOnError' setting."""
        self.settings['failOnError'] = value

    def setHandsInDB(self, value):
        """Store the number of hands in the database, so the auto-drop
        heuristics do not have to query for it."""
        self.settings['handsInDB'] = value

    def setThreads(self, value):
        """Set the number of writer threads and open any extra database
        connections needed so that there is one per thread."""
        self.settings['threads'] = value
        if self.settings["threads"] > len(self.writerdbs):
            for i in xrange(self.settings['threads'] - len(self.writerdbs)):
                self.writerdbs.append(Database.Database(self.config, sql = self.sql))

    def setDropIndexes(self, value):
        """Store the 'dropIndexes' setting ("drop", "don't drop" or "auto")."""
        self.settings['dropIndexes'] = value

    def setDropHudCache(self, value):
        """Store the 'dropHudCache' setting ("drop", "don't drop" or "auto")."""
        self.settings['dropHudCache'] = value

    def setStarsArchive(self, value):
        """Store the 'starsArchive' setting passed on to the converter."""
        self.settings['starsArchive'] = value

    def setPrintTestData(self, value):
        """Store the 'testData' setting passed to ``hand.insert`` as printtest."""
        self.settings['testData'] = value

    def setFakeCacheHHC(self, value):
        """Store the 'cacheHHC' setting; when true the last converter used
        is kept and can be fetched with ``getCachedHHC``."""
        self.settings['cacheHHC'] = value

    def getCachedHHC(self):
        """Return the converter cached by ``import_file_dict`` (None if no
        converter has been cached)."""
        return self.handhistoryconverter

    def clearFileList(self):
        """Forget every registered file and its per-file bookkeeping."""
        self.updatedsize = {}
        # Was "self.updatetime", which left the real updatedtime dict intact.
        self.updatedtime = {}
        self.pos_in_file = {}
        self.filelist = {}

    def closeDBs(self):
        """Disconnect the main database connection and every writer connection."""
        self.database.disconnect()
        for writerdb in self.writerdbs:
            writerdb.disconnect()

    # ---- Registering files and directories -----------------------------

    @staticmethod
    def _as_unicode(path):
        """Return ``path`` as a unicode object, decoding byte strings as UTF-8.

        ``unicode(x, 'utf-8')`` raises TypeError when ``x`` is already
        unicode, so such values are passed through untouched.
        """
        if isinstance(path, unicode):
            return path
        return unicode(path, 'utf-8')

    def addImportFile(self, filename, site = "default", filter = "passthrough"):
        """Add an individual file to filelist.

        Files that are already registered or that do not exist are ignored.
        The first time a site is seen its id is looked up with
        ``self.database.get_site_id`` and cached in ``self.siteIds``.
        """
        #TODO: test it is a valid file -> put that in config!!
        # filename now comes in as unicode
        if filename in self.filelist or not os.path.exists(filename):
            return
        self.filelist[filename] = [site] + [filter]

        if site in self.siteIds:
            return
        # Get id from Sites table in DB
        result = self.database.get_site_id(site)
        if len(result) == 1:
            self.siteIds[site] = result[0][0]
        elif len(result) == 0:
            log.error(_("Database ID for %s not found") % site)
        else:
            log.error(_("[ERROR] More than 1 Database ID found for %s - Multiple currencies not implemented yet") % site)

    def addBulkImportImportFileOrDir(self, inputPath, site = "PokerStars"):
        """Add a file or directory for bulk import (called from GuiBulkImport).

        A directory is walked recursively and every file in it is added;
        anything else is added as a single file.  Bulk import never monitors.
        """
        filter = self.config.hhcs[site].converter
        # TODO: only add sane files?
        if os.path.isdir(inputPath):
            for dirpath, dirnames, filenames in os.walk(inputPath):
                for name in filenames:
                    self.addImportFile(self._as_unicode(os.path.join(dirpath, name)),
                                       site=site, filter=filter)
        else:
            self.addImportFile(self._as_unicode(inputPath), site=site, filter=filter)

    def addImportDirectory(self,dir,monitor=False, site="default", filter="passthrough"):
        """Add the files directly inside ``dir`` to filelist (called by
        GuiAutoImport).

        Only one import directory per site is supported.  dirlist is a dict
        of lists: { 'PokerStars': ["/path/to/import/", "filtername"] } and is
        only updated when ``monitor`` is true.  A non-directory is logged
        and ignored.
        """
        #This should really be using os.walk
        #http://docs.python.org/library/os.html
        if not os.path.isdir(dir):
            log.warning(_("Attempted to add non-directory '%s' as an import directory") % str(dir))
            return

        if monitor == True:
            self.monitor = True
            self.dirlist[site] = [dir] + [filter]

        for entry in os.listdir(dir):
            self.addImportFile(os.path.join(dir, entry), site, filter)

    # ---- Bulk import ---------------------------------------------------

    def runImport(self):
        """Run full import on self.filelist. This is called from GuiBulkImport.py

        Returns (stored, duplicates, partial, errors, elapsed_seconds).
        """
        # Initial setup
        start = datetime.datetime.now()
        starttime = time()
        log.info(_("Started at %s -- %d files to import. indexes: %s") % (start, len(self.filelist), self.settings['dropIndexes']))

        # Resolve "auto" into "drop" / "don't drop".
        if self.settings['dropIndexes'] == 'auto':
            self.settings['dropIndexes'] = self.calculate_auto2(self.database, 12.0, 500.0)
        if 'dropHudCache' in self.settings and self.settings['dropHudCache'] == 'auto':
            self.settings['dropHudCache'] = self.calculate_auto2(self.database, 25.0, 500.0)

        if self.settings['dropIndexes'] == 'drop':
            self.database.prepareBulkImport()
        else:
            log.debug(_("No need to drop indexes."))

        if self.settings['threads'] <= 0:
            (totstored, totdups, totpartial, toterrors) = self.importFiles(self.database, None)
        else:
            # create queue (will probably change to deque at some point):
            self.writeq = Queue.Queue(self.settings['writeQSize'])

            # start separate thread(s) to read hands from queue and write to db:
            for i in xrange(self.settings['threads']):
                t = threading.Thread(target=self.writerdbs[i].insert_queue_hands,
                                     args=(self.writeq, self.settings["writeQMaxWait"]),
                                     name="dbwriter-"+str(i))
                t.setDaemon(True)
                t.start()

            # read hands and write to q:
            (totstored, totdups, totpartial, toterrors) = self.importFiles(self.database, self.writeq)

            if self.writeq.empty():
                print(_("writers finished already"))
            else:
                print(_("waiting for writers to finish ..."))
                # using empty() might be more reliable than writeq.join():
                while not self.writeq.empty() and len(threading.enumerate()) > 1:
                    # TODO: Do we need to actually tell the progress indicator to move, or is it already moving, and we just need to process events...
                    # see http://faq.pygtk.org/index.py?req=index for more hints (3.7)
                    while gtk.events_pending():
                        gtk.main_iteration(False)
                    sleep(0.5)
                print(_(" ... writers finished"))

        # Tidying up after import
        if self.settings['dropIndexes'] == 'drop':
            self.database.afterBulkImport()
        else:
            print(_("No need to rebuild indexes."))
        if 'dropHudCache' in self.settings and self.settings['dropHudCache'] == 'drop':
            self.database.rebuild_hudcache()
        else:
            print(_("No need to rebuild hudcache."))
        self.database.analyzeDB()

        endtime = time()
        return (totstored, totdups, totpartial, toterrors, endtime-starttime)
    # end def runImport

    def importFiles(self, db, q):
        """Read filenames in self.filelist and pass to import_file_dict().

        After all files are processed, one finish message per configured
        thread is sent through ``db.send_finish_msg(q)``.

        Returns (stored, duplicates, partial, errors) summed over all files.
        """
        totstored = 0
        totdups = 0
        totpartial = 0
        toterrors = 0

        for file in self.filelist:
            site, conv_filter = self.filelist[file][0], self.filelist[file][1]
            (stored, duplicates, partial, errors, ttime) = self.import_file_dict(
                db, file, site, conv_filter, q)
            totstored += stored
            totdups += duplicates
            totpartial += partial
            toterrors += errors

        for i in xrange(self.settings['threads']):
            print("%s %s" % (_("sending finish message queue length ="), q.qsize()))
            db.send_finish_msg(q)

        return (totstored, totdups, totpartial, toterrors)
    # end def importFiles

    # ---- Drop / don't drop heuristics ------------------------------------

    def _hands_in_db(self, db):
        """Return the number of hands in the database, or None if unknown.

        The count is taken from ``self.settings['handsInDB']`` when present;
        otherwise it is queried through ``db`` and cached there.  A failed
        query is logged and reported as None rather than silently swallowed
        (the old code then crashed with a KeyError on the missing setting).
        """
        if 'handsInDB' not in self.settings:
            try:
                tmpcursor = db.get_cursor()
                tmpcursor.execute("Select count(1) from Hands;")
                self.settings['handsInDB'] = tmpcursor.fetchone()[0]
            except Exception:
                log.exception(_("Unable to count the hands in the database"))
                return None
        return self.settings['handsInDB']

    # not used currently
    def calculate_auto(self, db):
        """An heuristic to determine a reasonable value of drop/don't drop.

        Returns "drop" or "don't drop".  When the number of hands in the
        database cannot be determined the answer is "don't drop".
        """
        if len(self.filelist) == 1:
            return "don't drop"

        hands_in_db = self._hands_in_db(db)
        if hands_in_db is None:
            return "don't drop"

        if hands_in_db < 5000:
            return "drop"
        if len(self.filelist) < 50:
            return "don't drop"
        if hands_in_db > 50000:
            return "don't drop"
        return "drop"

    def calculate_auto2(self, db, scale, increment):
        """A second heuristic to determine a reasonable value of drop/don't drop

        This one adds up size of files to import to guess number of hands in them
        Example values of scale and increment params might be 10 and 500 meaning
        roughly: drop if importing more than 10% (100/scale) of hands in db or if
        less than 500 hands in db

        Returns "drop" or "don't drop".  When the number of hands in the
        database cannot be determined the answer is "don't drop".
        """
        # wag based on a PS 6-up FLHE file. Actual value not hugely important
        # as values of scale and increment compensate for it anyway.
        # A float is used to force float arithmetic.
        size_per_hand = 1300.0

        # get number of hands in db
        hands_in_db = self._hands_in_db(db)
        if hands_in_db is None:
            return "don't drop"

        # add up size of import files (files that have vanished are skipped)
        total_size = 0.0
        for file in self.filelist:
            if os.path.exists(file):
                total_size += os.stat(file).st_size

        # if hands_in_db is zero or very low, we want to drop indexes, otherwise
        # compare import size with db size somehow:
        if hands_in_db < scale * (total_size/size_per_hand) + increment:
            return "drop"
        return "don't drop"

    # ---- Auto import ----------------------------------------------------

    def runUpdated(self):
        """Run import on updated files, then store latest update time.
        Called from GuiAutoImport.py

        A file is imported when it has grown or its mtime is newer than the
        time recorded for it.  Files seen for the first time are only
        recorded: with size/time 0 (so they import on the next pass) if they
        are directories or were modified within the last 60 seconds,
        otherwise with their current size and the current time.
        """
        #Check for new files in monitored directories
        #todo: make efficient - always checks for new file, should be able to use mtime of directory
        # ^^ May not work on windows
        for site in self.dirlist:
            self.addImportDirectory(self.dirlist[site][0], False, site, self.dirlist[site][1])

        for file in self.filelist:
            if not os.path.exists(file):
                self.removeFromFileList[file] = True
                continue

            stat_info = os.stat(file)
            if file not in self.updatedsize:
                if os.path.isdir(file) or (time() - stat_info.st_mtime) < 60:
                    self.updatedsize[file] = 0
                    self.updatedtime[file] = 0
                else:
                    self.updatedsize[file] = stat_info.st_size
                    self.updatedtime[file] = time()
                continue

            # we should be able to assume that if we're in size, we're in time as well
            if not (stat_info.st_size > self.updatedsize[file]
                    or stat_info.st_mtime > self.updatedtime[file]):
                continue

            try:
                if not os.path.isdir(file):
                    self.caller.addText("\n"+os.path.basename(file))
            except KeyError: # TODO: What error happens here?
                pass

            (stored, duplicates, partial, errors, ttime) = self.import_file_dict(
                self.database, file, self.filelist[file][0], self.filelist[file][1], None)

            try:
                # Note: This assumes that whatever calls us has an "addText" func
                if not os.path.isdir(file):
                    self.caller.addText(" %d stored, %d duplicates, %d partial, %d errors (time = %f)" % (stored, duplicates, partial, errors, ttime))
            except KeyError: # TODO: Again, what error happens here? fix when we find out ..
                pass

            self.updatedsize[file] = stat_info.st_size
            self.updatedtime[file] = time()

        # Register the directories that import_file_dict() came across.
        # (Explicit loop instead of filter() called only for its side effects.)
        for path in list(self.addToDirList):
            dir_site, dir_filter = self.addToDirList[path]
            self.addImportDirectory(path, True, dir_site, dir_filter)

        for file in self.removeFromFileList:
            if file in self.filelist:
                del self.filelist[file]

        self.addToDirList = {}
        self.removeFromFileList = {}
        self.database.rollback()

    # This is now an internal function that should not be called directly.
    def import_file_dict(self, db, file, site, filter, q=None):
        """Convert one hand-history file and store its hands.

        db     -- accepted for interface compatibility; hands are written
                  through ``self.database``.
        file   -- path of the file; a directory is only remembered in
                  ``self.addToDirList``.
        site   -- site name, used for the archive path and the converter.
        filter -- name of the converter module; the converter class name is
                  the module name with "ToFpdb" removed.
        q      -- optional write queue; only its size is logged here.

        Returns (stored, duplicates, partial, errors, seconds).  A directory
        yields (0, 0, 0, 0, 0); an unknown filter or a failed conversion
        yields (0, 0, 0, 1, seconds).
        """
        if os.path.isdir(file):
            self.addToDirList[file] = [site] + [filter]
            return (0,0,0,0,0)

        (stored, duplicates, partial, errors) = (0, 0, 0, 0)
        started = time()

        # Load filter, process file, pass returned filename to import_fpdb_file
        # Test q itself: runUpdated() passes q=None even when self.writeq is
        # still set from an earlier threaded bulk import.
        if q is not None:
            log.info((_("Converting %s") % file) + " (" + str(q.qsize()) + ")")
        else:
            log.info(_("Converting %s") % file)

        hhbase = self.config.get_import_parameters().get("hhArchiveBase")
        hhbase = os.path.expanduser(hhbase)
        hhdir = os.path.join(hhbase,site)
        try:
            # prefix the archive name with the file's parent directory name
            out_path = os.path.join(hhdir, file.split(os.path.sep)[-2]+"-"+os.path.basename(file))
        except Exception:
            out_path = os.path.join(hhdir, "x"+strftime("%d-%m-%y")+os.path.basename(file))

        filter_name = filter.replace("ToFpdb", "")
        mod = __import__(filter)
        obj = getattr(mod, filter_name, None)
        if not callable(obj):
            log.warning(_("Unknown filter filter_name:'%s' in filter:'%s'") %(filter_name, filter))
            return (0, 0, 0, 1, time() - started)

        # Resume from where the previous pass over this file stopped.
        idx = self.pos_in_file.setdefault(file, 0)

        hhc = obj(self.config, in_path = file, out_path = out_path, index = idx,
                  starsArchive = self.settings['starsArchive'], sitename = site)
        if not hhc.getStatus():
            # conversion didn't work
            # TODO: appropriate response?
            return (0, 0, 0, 1, time() - started)

        handlist = hhc.getProcessedHands()
        self.pos_in_file[file] = hhc.getLastCharacterRead()
        to_hud = []

        for hand in handlist:
            if hand is None:
                # TODO: Treat empty as an error, or just ignore?
                log.error(_("Hand processed but empty"))
                continue
            hand.prepInsert(self.database)
            try:
                hand.insert(self.database, printtest = self.settings['testData'])
            except Exceptions.FpdbHandDuplicate:
                duplicates += 1
            else:
                if self.callHud and hand.dbid_hands != 0:
                    to_hud.append(hand.dbid_hands)

        # Call hudcache update if not in bulk import mode
        # FIXME: Need to test for bulk import that isn't rebuilding the cache
        if self.callHud:
            for hand in handlist:
                if hand is not None and not hand.is_duplicate:
                    hand.updateHudCache(self.database)
        self.database.commit()

        #pipe the Hands.id out to the HUD
        for hid in to_hud:
            try:
                # Report the id actually being sent (was hand.dbid_hands,
                # i.e. whatever the loop variable above was left as).
                print("%s %s pipe = %s" % (_("fpdb_import: sending hand to hud"), hid, self.caller.pipe_to_hud))
                self.caller.pipe_to_hud.stdin.write("%s" % (hid) + os.linesep)
            except IOError as e:
                log.error(_("Failed to send hand to HUD: %s") % e)

        errors = getattr(hhc, 'numErrors')
        stored = getattr(hhc, 'numHands')
        stored -= duplicates
        stored -= errors
        # Really ugly hack to allow testing Hands within the HHC from someone
        # with only an Importer object
        if self.settings['cacheHHC']:
            self.handhistoryconverter = hhc

        return (stored, duplicates, partial, errors, time() - started)

    def printEmailErrorMessage(self, errors, filename, line):
        """Print the current exception and a request to report the failing
        hand, then append the hand (``self.hand``) to hand-errors.txt.

        ``line`` is accepted but not used.
        """
        traceback.print_exc(file=sys.stderr)
        print(_("Error No.%s please send the hand causing this to fpdb-main@lists.sourceforge.net so we can fix the problem.") % errors)
        print("%s %s" % (_("Filename:"), filename))
        print(_("Here is the first line of the hand so you can identify it. Please mention that the error was a ValueError:"))
        print(self.hand[0])
        print(_("Hand logged to hand-errors.txt"))
        # "with" guarantees the log file is closed even if a write fails.
        with open('hand-errors.txt', 'a') as logfile:
            for s in self.hand:
                logfile.write(str(s) + "\n")
            logfile.write("\n")
# Running this module directly only points the user at the real command-line tool.
if __name__ == "__main__":
    print(_("CLI for fpdb_import is now available as CliFpdb.py"))