Home>

Requirements

There is a table that holds a relatively large amount of data and is updated every day. Its fields are defined in an XML configuration file, so the fields may be different each time the table is created.

The upstream step extracts the data from the source file according to the configuration. The storage step then needs to create the table based on that same configuration.

Solution

Write a simple XML file that configures the required fields and their types.

Read the corresponding data upstream

In the storage step, first drop the original table, then create a new table based on the configuration.

xml file

<?xml version="1.0" encoding="utf-8"?>
<!-- Table name and database name: configure which database and table to insert into -->
<table name="top_query" db_name="evaluting_sys">
<!-- Non-business (surrogate) primary key: auto-increment, name configurable; otherwise integer unsigned auto_increment -->
<primary_key>
<name>id</name>
</primary_key>
<!-- Start of field configuration -->
<field>
<name>query</name>
<type>varchar (200)</type>
<is_index>false</is_index>
<description>query</description>
</field>
<field>
<name>pv</name>
<type>integer</type>
<is_index>false</is_index>
<description>pv</description>
</field>
<field>
<name>avg_money</name>
<type>integer</type>
<is_index>false</is_index>
<description></description>
</field>
<!-- End of field configuration -->
</table>

Processing script

#!/usr/bin/python
#-*-coding:utf-8-*-
#author:wklken
#desc:use to read db xml config.
#-----------------------
#2012-02-18 created
#----------------------
import os
import sys
from xml.dom import minidom, Node


def _element_text(parent, tag):
    """Return the stripped text of the first ``<tag>`` under *parent*, or ""."""
    matches = parent.getElementsByTagName(tag)
    if not matches:
        return ""
    first = matches[0].firstChild
    # An empty element has no child; a nested element has nodeValue None.
    if first is None or first.nodeValue is None:
        return ""
    return first.nodeValue.strip()


def read_dbconfig_xml(xml_file_path):
    """Build the SQL statements described by a table-definition XML file.

    The file must contain a ``<table name=... db_name=...>`` element holding
    one ``<primary_key>`` (with a ``<name>``) and any number of ``<field>``
    elements (each with a ``<name>`` and a ``<type>``).

    Args:
        xml_file_path: Path of the XML configuration file.

    Returns:
        A dict with three SQL strings: ``db_sql`` (create and select the
        database), ``table_sql`` (drop the old table) and
        ``table_create_sql`` (create the new table). The dict is also printed.

    Raises:
        SystemExit: exit status 1 when the table/database name, the primary
            key name, or a field's name/type is missing.
        IndexError: when the document has no ``<table>`` element.
    """
    root = minidom.parse(xml_file_path)
    table = root.getElementsByTagName("table")[0]

    # Read the database name and the table name.
    table_name = table.getAttribute("name")
    db_name = table.getAttribute("db_name")
    if not table_name or not db_name:
        print("error:attribute is not define well! db_name =" + db_name + ";table_name =" + table_name)
        sys.exit(1)

    # Read the auto-increment primary key column.
    primary_keys = table.getElementsByTagName("primary_key")
    primary_key_name = _element_text(primary_keys[0], "name") if primary_keys else ""
    if not primary_key_name:
        print("error:primary_key name is not defined for table " + table_name)
        sys.exit(1)

    # Collect every column definition, then join them once; this avoids the
    # trailing-comma bookkeeping and stays valid SQL when there are no fields.
    columns = [primary_key_name + " integer not null auto_increment primary key"]
    for field in table.getElementsByTagName("field"):
        column_name = _element_text(field, "name")
        column_type = _element_text(field, "type")
        if not column_name or not column_type:
            print("error:field needs both name and type in table " + table_name)
            sys.exit(1)
        columns.append(column_name + " " + column_type)
    # NOTE(review): <is_index> is present in the configuration but no index is
    # generated from it; confirm whether that is intended.

    # NOTE(review): names and types are interpolated into SQL verbatim, so the
    # configuration file must come from a trusted source.
    content = {
        "db_sql": "create database if not exists `" + db_name + "`; use `" + db_name + "`;",
        "table_sql": "drop table if exists " + table_name + ";",
        "table_create_sql": "create table " + table_name + " (" + ", ".join(columns) + ");",
    }
    print(content)
    return content
if __name__ == "__main__":
    # The only argument is the path of the table-definition XML file.
    if len(sys.argv) < 2:
        print("usage: " + sys.argv[0] + " <db_config.xml>")
        sys.exit(1)
    read_dbconfig_xml(sys.argv[1])

Parsing large XML files in Python [SAX]

Requirements

Read an XML data file. The file is large, and its records need to be inserted into the database as they are read.

xml document

<persons>
<person>
  <id>100000</id>
  <sex>male</sex>
  <address>Beijing, Haidian District</address>
  <fansnum>437</fansnum>
  <summary>1989</summary>
  <wbnum>333</wbnum>
  <gznum>242</gznum>
  <blog>null</blog>
  <edu>University</edu>
  <work></work>
  <renzh>1</renzh>
  <brithday>February 14</brithday>
</person>
</persons>

Processing

SAX does not load the document as a tree of nodes the way DOM does. It only reports events: a start tag, the text content, and an end tag.

The processing idea is to use a handler that provides one callback each for the start tag, the content, and the end tag.

Code and comments

person processing class

from xml.sax import handler, parseString


class personhandler(handler.ContentHandler):
    """SAX handler that turns each ``<person>`` element into one INSERT.

    Text of every child element is collected into ``self.person`` keyed by
    tag name. When ``</person>`` is reached, the values named in the
    module-level ``fields`` tuple are substituted into the module-level
    ``in_sql`` template and handed to ``db_ops.insert``.

    Args:
        db_ops: Object exposing ``insert(sql)``.
    """

    def __init__(self, db_ops):
        handler.ContentHandler.__init__(self)
        # Database operation object.
        self.db_ops = db_ops
        # Tag name -> text for the person currently being read.
        self.person = {}
        # Name of the most recently opened tag.
        self.current_tag = ""
        # 1 while between a start tag and the next end tag, so whitespace
        # that follows an end tag is not recorded.
        self.in_quote = 0

    def startElement(self, name, attr):
        """Record the opened tag; a new ``<person>`` resets the collected map."""
        if name == "person":
            self.person = {}
        self.current_tag = name
        self.in_quote = 1

    def endElement(self, name):
        """Insert the collected person when ``</person>`` is reached."""
        if name == "person":
            # NOTE(review): values are quoted but not escaped, so text that
            # contains a double quote yields broken or injectable SQL.
            in_fields = tuple('"' + self.person.get(i, "") + '"' for i in fields)
            sql = in_sql % in_fields
            print(sql)
            # Use the handler's own db object, not a same-named global.
            self.db_ops.insert(sql)
        self.in_quote = 0

    def characters(self, content):
        """Append *content* to the text of the current tag."""
        if self.in_quote:
            # A parser may deliver one text node in several chunks, so
            # accumulate instead of overwriting.
            self.person[self.current_tag] = self.person.get(self.current_tag, "") + content

    # Lowercase aliases keep the previous method names callable.
    startelement = startElement
    endelement = endElement

Complete code

#!/usr/bin/python
#-*-coding:utf-8-*-
#parse_person.py
#version:0.1
#author:[email protected]
#desc:parse person.xml and out sql
import os
import sys

import MySQLdb
# Python 2 only: switch the interpreter's default string encoding to UTF-8.
# Python 3 has neither the reload() builtin nor sys.setdefaultencoding().
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("utf-8")

# Columns of the person table, in insert order. Each name is also the XML tag
# the value is read from.
fields = ("id", "sex", "address", "fansnum", "summary", "wbnum", "gznum",
          "blog", "edu", "work", "renzh", "brithday")

# INSERT template with one %s placeholder per column; derived from ``fields``
# so the column list and the placeholder count cannot drift apart.
in_sql = ("insert into person (" + ", ".join(fields) + ") values ("
          + ",".join(["%s"] * len(fields)) + ")")
# Database access helper
class db_connect:
    """Small wrapper around one MySQLdb connection.

    Args:
        db_host: Database server host.
        user: Login user name.
        pwd: Login password.
        db_name: Database to use.
        charset: Connection character set.
        use_unicode: Passed through to the MySQLdb connection.
    """

    def __init__(self, db_host, user, pwd, db_name, charset="utf8", use_unicode=True):
        print("init begin")
        # The password is deliberately left out of the diagnostic output.
        print("%s %s %s %s %s" % (db_host, user, db_name, charset, use_unicode))
        self.conn = MySQLdb.connect(db_host, user, pwd, db_name,
                                    charset=charset, use_unicode=use_unicode)
        print("init end")

    def insert(self, sql):
        """Execute *sql* and return what ``cursor.execute`` returns.

        A ``MySQLdb.Warning`` is reported on stdout and ``None`` is returned;
        other exceptions propagate to the caller.
        """
        # NOTE(review): nothing here commits; confirm the target tables do
        # not need an explicit commit before close().
        cursor = self.conn.cursor()
        try:
            return cursor.execute(sql)
        except MySQLdb.Warning:
            print("error:execute sql '%s' failed" % sql)
            return None
        finally:
            # Release the cursor whether or not the statement succeeded.
            cursor.close()

    def close(self):
        """Close the underlying connection."""
        self.conn.close()
#person Processing Class
from xml.sax import handler, parseString


class personhandler(handler.ContentHandler):
    """SAX handler that turns each ``<person>`` element into one INSERT.

    Text of every child element is collected into ``self.person`` keyed by
    tag name. When ``</person>`` is reached, the values named in the
    module-level ``fields`` tuple are substituted into the module-level
    ``in_sql`` template and handed to ``db_ops.insert``.

    Args:
        db_ops: Object exposing ``insert(sql)``.
    """

    def __init__(self, db_ops):
        handler.ContentHandler.__init__(self)
        # Database operation object.
        self.db_ops = db_ops
        # Tag name -> text for the person currently being read.
        self.person = {}
        # Name of the most recently opened tag.
        self.current_tag = ""
        # 1 while between a start tag and the next end tag, so whitespace
        # that follows an end tag is not recorded.
        self.in_quote = 0

    def startElement(self, name, attr):
        """Record the opened tag; a new ``<person>`` resets the collected map."""
        if name == "person":
            self.person = {}
        self.current_tag = name
        self.in_quote = 1

    def endElement(self, name):
        """Insert the collected person when ``</person>`` is reached."""
        if name == "person":
            # NOTE(review): values are quoted but not escaped, so text that
            # contains a double quote yields broken or injectable SQL.
            in_fields = tuple('"' + self.person.get(i, "") + '"' for i in fields)
            sql = in_sql % in_fields
            print(sql)
            # Use the handler's own db object, not a same-named global.
            self.db_ops.insert(sql)
        self.in_quote = 0

    def characters(self, content):
        """Append *content* to the text of the current tag."""
        if self.in_quote:
            # A parser may deliver one text node in several chunks, so
            # accumulate instead of overwriting.
            self.person[self.current_tag] = self.person.get(self.current_tag, "") + content

    # Lowercase aliases keep the previous method names callable.
    startelement = startElement
    endelement = endElement
if __name__ == "__main__":
    # Local import keeps the entry point self-contained.
    from xml.sax import parseString

    # Read raw bytes so the explicit transcoding below works on both
    # Python 2 and Python 3.
    with open("./person.xml", "rb") as f:
        raw = f.read()
    # The source file is GBK, so transcode it; for a UTF-8 source drop the
    # decode/encode pair and pass ``raw`` directly.
    xml_bytes = raw.decode("gbk").encode("utf-8")

    db_ops = db_connect("127.0.0.1", "root", "root", "test")
    try:
        parseString(xml_bytes, personhandler(db_ops))
    finally:
        # Close the connection even when parsing or an insert fails.
        db_ops.close()
  • Previous Method for traversing object properties and values ​​in js
  • Next A brief introduction to the umask () and truncate () functions in C