mariadb-columnstore-engine/writeengine/bulk/bulkload.py

#!/usr/bin/python
##
## Bulkloader script by Martin Thomas
##

import os, sys, glob, shutil, xml.dom.minidom
import getopt
import logging
import time

## Root logger setup: every record is written both to the console stream
## and to bulkload.log (created in the current working directory), using
## one shared "time:level: message" layout.
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s:%(levelname)s: %(message)s')
shdlr = logging.StreamHandler()
fhdlr = logging.FileHandler(filename='bulkload.log' )
for _hdlr in (shdlr, fhdlr):
    _hdlr.setFormatter(formatter)
    logger.addHandler(_hdlr)

## INFO and above are emitted - raise to WARNING to drop the informational
## progress messages
logger.setLevel(logging.INFO)


def usage():
    """Print the command-line help text for bulkload.py to stdout."""
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("""

    Bulkload.py is intended to automate the manual steps required to load the database and build indexes from scratch.

    - ipcs-pat will be built if missing
    - cpimport will be removed and rebuilt
    - PrimProc will be stopped and started
    - shared memory segments will be removed using ipcs-pat
    - database files will be removed
    - dbbuilder will be run with option 5
    - oid files and job files will be copied to correct locations
    - column data will be parsed and loaded using Job 299
    - index data will be exported, sorted and loaded using Job 300

    Options:
      -w or --wedir=   : Specify the write engine branch to use instead of the default trunk
      -n or --nocache= : Specify either col or idx and the -c flag will NOT be sent to cpimport
      -u or --usage    : Usage message

    Example:
      bulkload.py -w/home/adevelop/genii/we1.1 --nocache=idx
      Load the database using the we1.1 branch for writeengine and do not use cache when building indexes

    THIS SPACE LEFT INTENTIONALLY BLANK
    """)

def find_paths():
    """Find BulkRoot and DBRoot1 in the Columnstore configuration file.

    The configuration file is taken from the CALPONT_CONFIG_FILE environment
    variable; when that is not set, the system-wide
    /usr/local/mariadb/columnstore/etc/Columnstore.xml is used instead.

    Returns:
        A (bulk_dir, data_dir) tuple holding the text of the first
        <BulkRoot> and first <DBRoot1> elements, with surrounding
        whitespace removed.

    Raises:
        SystemExit: if no configuration file is available or it cannot be
            parsed / lacks either element.
    """
    config_file = os.environ.get('CALPONT_CONFIG_FILE')
    if config_file is None:
        logger.info("Environment variable CALPONT_CONFIG_FILE not set, looking for system Columnstore.xml")
        config_file = '/usr/local/mariadb/columnstore/etc/Columnstore.xml'
        if not os.path.exists(config_file):
            logger.error('No config file available')
            sys.exit('No config file available')

    try:
        xmldoc = xml.dom.minidom.parse(config_file)
        bulk_node = xmldoc.getElementsByTagName('BulkRoot')[0]
        db_node = xmldoc.getElementsByTagName('DBRoot1')[0]
        # strip() so that a pretty-printed config (value on its own line)
        # does not leak newlines/indentation into the paths.
        bulk_dir = bulk_node.childNodes[0].nodeValue.strip()
        data_dir = db_node.childNodes[0].nodeValue.strip()
    except Exception as e:
        # Covers unreadable files, malformed XML and missing/empty elements;
        # SystemExit is not an Exception subclass, so it is never masked here.
        logger.error('Error parsing config file')
        logger.error(e)
        sys.exit('Error parsing config file')

    return (bulk_dir, data_dir)

def check_dirs(bulkroot, dbroot):
    """Verify that every directory the bulk load depends on exists.

    Checks the genii directory under the user's home, bulkroot and its job,
    data/import and log sub-directories, and dbroot. All missing directories
    are logged (in the order listed below) before the script gives up, so a
    single run reports every problem.

    Args:
        bulkroot: path of the bulk-load root directory.
        dbroot: path of the database root directory.

    Raises:
        SystemExit: with status 1 if any required directory is missing.
    """
    # expanduser falls back to the password database when HOME is unset,
    # where os.getenv('HOME') + '/genii' would raise TypeError.
    required = [
        (os.path.expanduser('~') + '/genii',
         "No genii directory found (contains tools required to continue) (%s)"),
        (bulkroot, "Bulkroot specified as %s but not found"),
        (bulkroot + '/job',
         "No job directory found - needed to store Job xml files (looked in %s)"),
        (bulkroot + '/data/import',
         "No data/import directory found - expected %s to hold data to be loaded"),
        (bulkroot + '/log',
         "No log directory found - expected %s to log into"),
        (dbroot, "DBroot specified as %s but not found"),
    ]

    problem = False
    for path, message in required:
        # isdir (rather than lstat) also rejects plain files and dangling
        # symlinks, neither of which can serve as a working directory.
        if not os.path.isdir(path):
            problem = True
            logger.error(message % path)

    if problem:
        sys.exit(1)

def fix_hwm(job_file):
    """Rewrite job_file in place, replacing every hwm="1" with hwm="0".

    The modified content is first written to job_file + '.tmp', which is
    then renamed over the original.

    Args:
        job_file: path of the job XML file to modify.
    """
    tmp_file = job_file + '.tmp'

    # Both files are closed (and the output flushed) before the rename so the
    # replacement file is complete on disk when it takes the original's name.
    with open(job_file, 'r') as src_file:
        with open(tmp_file, 'w') as dst_file:
            for line in src_file:
                dst_file.write(line.replace('hwm="1"', 'hwm="0"'))

    # The temp file lives next to the original, so a plain rename is enough
    # and replaces the original in one step.
    os.rename(tmp_file, job_file)

def find_indexes(job_file):
    """Return the mapName of every index defined in job_file.

    The file is parsed as XML and the mapName attribute of each <Index>
    element is collected. If XML parsing fails, the file is scanned as plain
    text for mapName="CPL_..." attributes instead.

    Args:
        job_file: path of the job XML file.

    Returns:
        A list of map names in document order (empty if none are found).
    """
    try:
        xmldoc = xml.dom.minidom.parse(job_file)
        return [index_node.getAttribute('mapName')
                for index_node in xmldoc.getElementsByTagName('Index')]
    except Exception:
        # Fallback for files the XML parser cannot handle. The whole text is
        # searched at once; iterating over f.read() would walk it character
        # by character and the pattern could never match.
        import re
        with open(job_file) as f:
            return re.findall(r'mapName="(CPL_[0-9A-Z_]+)"', f.read())

def exec_cmd(cmd, args):
    """Run cmd with args through the shell and abort the script on failure.

    The subprocess module is used when it can be imported; otherwise the
    command is run with os.system.

    Args:
        cmd: command (may itself contain shell syntax such as a pipe).
        args: argument string appended to cmd after a single space.

    Raises:
        SystemExit: if the command cannot be started, is terminated by a
            signal, or returns a non-zero status.
    """
    command = cmd + ' ' + args

    try:
        import subprocess
    except ImportError:
        # Only the import is guarded: a blanket except around the whole call
        # would also swallow the SystemExit raised below and run the command
        # a second time.
        logger.info ('Old version of Python - subprocess not available, falling back to os.system')
        logger.info ('Executing: '+cmd+' '+args)
        res = os.system(command)
        if res:
            logger.error('Bad return code %i from %s'%(res, cmd))
            # os.system returns a wait status; the exit code is its high
            # byte. Fall back to 1 so a failure never exits with status 0.
            sys.exit((res >> 8) or 1)
        return

    try:
        # shell=True is required: callers pass globs, pipes, redirections
        # and '&' as part of cmd/args.
        retcode = subprocess.call(command, shell=True)
    except OSError as e:
        sys.stderr.write("Execution failed: %s\n" % e)
        sys.exit(-1)

    if retcode < 0:
        sys.stderr.write("Child was terminated by signal %d\n" % -retcode)
        sys.exit(-1)

    sys.stderr.write("Child returned %d\n" % retcode)
    if retcode:
        # A failed step must stop the load, as in the os.system path above.
        sys.exit(retcode)


def build_tool(tool):
    """Build a required tool if its binary does not exist yet.

    Args:
        tool: dictionary with the keys
            'path'    - directory that holds (and builds) the tool,
            'tool'    - file name of the tool inside 'path',
            'builder' - build command to run from 'path',
            'args'    - argument string passed to the build command.

    Nothing is done when path/tool already exists. Otherwise the build
    command is run from inside 'path' via exec_cmd, and the previous
    working directory is restored afterwards.
    """
    # os.path.join works whether or not 'path' ends with a slash.
    if os.path.exists(os.path.join(tool['path'], tool['tool'])):
        return

    logger.warning("Building %s before continuing"%tool['tool'])
    curdir = os.getcwd()
    os.chdir(tool['path'])
    try:
        exec_cmd(tool['builder'], tool['args'])
    finally:
        # Restore the caller's working directory even if the build fails.
        os.chdir(curdir)

def main():
  """
  Bulk load the database from scratch.

  Steps, in order:
    - move to /tmp if the current directory is not writable
    - parse the command line (-w/--wedir, -n/--nocache, -u/--usage)
    - read BulkRoot/DBRoot from the configuration and check required dirs
    - build ipcs-pat if missing and rebuild cpimport
    - remove old database, temp and process files
    - stop PrimProc and dbrm, clear shared memory, start dbrm and PrimProc
    - run dbbuilder with option 5
    - move the generated Job*xml files into <BulkRoot>/job
    - run cpimport for job 299 and then job 300

  Exits via sys.exit on bad options, missing input files, or any failing
  external command run through exec_cmd.
  """
  start_dir = os.getcwd() # remember where we started

  if not os.access('.', os.W_OK):
    os.chdir('/tmp')
    logger.warning('Changing to /tmp to have permission to write files')

  ld_library_path = os.getenv('LD_LIBRARY_PATH')
  if ld_library_path is None:
      logger.info('No environment variable LD_LIBRARY_PATH')
  elif len(ld_library_path) < 5:
      logger.info('Suspicous LD_LIBRARY_PATH: %s'%ld_library_path)

  #-- figure out paths
  # expanduser copes with HOME being unset, where getenv would return None
  home = os.path.expanduser('~')
  genii = home+'/genii'
  # cpimport cache flag per load phase; --nocache blanks the entry
  cache = {}
  cache['idx'] = '-c'
  cache['col'] = '-c'

#-- allow us to specify a write engine branch
  try:
      opts, args = getopt.getopt(sys.argv[1:], 'w:n:u', ['wedir=', 'nocache=', 'usage'])
  except getopt.GetoptError as err:
      # unknown option or missing argument: explain instead of a traceback
      logger.error(err)
      usage()
      sys.exit(2)

  wedir = genii+'/writeengine'
  for opt, arg in opts:
      if opt in ('-w', '--wedir'):
          wedir = arg

      if opt in ('-n', '--nocache'):
          if arg in cache:
              cache[arg] = ''
              logger.info("No cache for %s"% arg)
          else:
              logger.warning("Ignoring unknown nocache value %s (expected col or idx)"% arg)

      if opt in ('-u', '--usage'):
          usage()
          sys.exit()

  logger.info("Using writengine at %s"%wedir)

  (bulkroot, dbroot) = find_paths()

  logger.info ("Bulkroot: %s \tDBRoot: %s\n"%(bulkroot, dbroot))

  check_dirs(bulkroot, dbroot)

  if not glob.glob(bulkroot+'/data/import/*tbl'):
    sys.exit("No files for import found in BulkRoot: %s"%(bulkroot))

  if not glob.glob(dbroot+'/000.dir'):
    logger.info("No files found in DBRoot: %s (not fatal)"%dbroot)

## force rebuild cpimport and build ipcs-pat if required

  build_tool({'path':genii+'/versioning/BRM/',
              'tool':'ipcs-pat',
              'builder':'make', 'args':'tools'})

  build_tool({'path':wedir+'/bulk/',
            'tool':'cpimport',
            'builder':'make', 'args':'clean'})

  # Best-effort removal of the old binary; it may simply not exist.
  try:
    os.remove(wedir+'/bulk/cpimport')
  except OSError:
    pass

  # NOTE(review): when a cpimport exists in start_dir the rebuild below is
  # skipped, yet the load at the end still runs wedir/bulk/cpimport, which
  # was just removed - confirm the intended behaviour.
  if not os.path.lexists(start_dir+'/cpimport'): # look in local directory first
    build_tool({'path':wedir+'/bulk/',
              'tool':'cpimport',
              'builder':'make', 'args':'cpimport'})


## clean up before starting
## remove old db files, removed old temp files, remove shared memory segments,
## kill old PrimProc and start new one

  logger.info ("Removing old DB files")
  exec_cmd('rm -fr ', dbroot+'/000.dir')

  logger.info ("Removing old temp files")
  exec_cmd('rm -fr ', bulkroot+'/data/import/*.idx.txt')

  logger.info ("Removing old process files")
  exec_cmd('rm -fr ', bulkroot+'/process/*.*')

  logger.info("Killing primProc")
  # os.system rather than exec_cmd: the return code is deliberately ignored
  os.system('killall -q -u $USER PrimProc')

  logger.info ("kill controllernode and workernode")
  exec_cmd(genii+'/export/bin/dbrm', "stop ")

  time.sleep(2)
  logger.info ("Removing shared memory segments")
  exec_cmd(genii+'/versioning/BRM/ipcs-pat', '-d')

  logger.info("Starting controllernode workernode")
  exec_cmd(genii+'/export/bin/dbrm', "start ")

  logger.info("Starting primProc")
  exec_cmd(genii+'/export/bin/PrimProc', "> primproc.log &")

## run dbbuilder - add yes command at front to automatically answer questions
  logger.info ("Building db and indexes (no data inserted)")
  exec_cmd('yes | '+genii+'/tools/dbbuilder/dbbuilder', ' 5')

  logger.info ("Relocating OID files")

  for xmlfile in glob.glob('./Job*xml'):
    logger.info ("Copying %s to %s\n"%(xmlfile,  bulkroot+'/job'))
    # shutil.move tries a rename first and falls back to copy+delete, so it
    # also works when the job directory is on another filesystem (e.g. after
    # the chdir to /tmp above), where os.rename alone would fail.
    shutil.move(xmlfile, os.path.join(bulkroot, 'job', os.path.basename(xmlfile)))

  logger.info("Using cpimport at %s"%(wedir+'/bulk/cpimport'))
  # Job 299 (column data) has never been passed -c, so --nocache=col has
  # nothing to remove here.
  exec_cmd('time '+wedir+'/bulk/cpimport', '-j 299 ')
  # Job 300 (index data): -c unless --nocache=idx was given.
  exec_cmd(wedir+'/bulk/cpimport', cache['idx']+' -j 300  ' )

## Run the bulk load only when executed as a script; importing this file
## as a module just makes its functions available.
if __name__ == "__main__":
    main()