Bulk uploading huge amounts of data using cx_Oracle can be quite a pain. A faster way to do this is to use SQLAlchemy to insert or update a large number of entries in the database. It has a function called bulk_insert_mappings which can be used for this purpose. First, you define a class that mimics the structure of the table you want to insert data into. The method accepts two parameters: the first is the class that was just defined; the second is a list of dictionaries of the values you want to upload.
import sqlalchemy
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import scoped_session, sessionmaker

# Declarative base shared by every mapped class in this module.
Base = declarative_base()
metadata = Base.metadata

# Thread-local session registry; bound to a real engine in init_db().
session = scoped_session(sessionmaker())
engine = None  # set by init_db()
class test_table(Base):
    """ORM mapping for the T_MATT_TEST_DATA staging table.

    Every column is stored as a short string; ``index`` is the primary key.
    """

    # BUG FIX: SQLAlchemy's declarative system requires the dunder name
    # __tablename__; a plain `tablename` attribute leaves the mapping
    # without a table name and raises InvalidRequestError at class
    # creation time.
    __tablename__ = "T_MATT_TEST_DATA"

    index = Column(String(40), primary_key=True)
    row_num = Column(String(40))
    load_date = Column(String(40))
    cyc_id = Column(String(40))
def init_db(db_name='sqlite:///test.db'):
    """Create the engine, bind the scoped session, and (re)create all tables.

    Args:
        db_name: SQLAlchemy database URL, e.g. ``'sqlite:///test.db'`` or
            ``'oracle+cx_oracle://user:password@dsn'``.
    """
    global engine
    # BUG FIX: the original ignored db_name entirely and interpolated the
    # undefined globals `db` and `dsn_tns` into a hard-coded Oracle URL,
    # which raised NameError. Build the engine from the URL parameter so
    # the default SQLite database (and any other backend) works.
    engine = create_engine(db_name)
    session.remove()
    session.configure(bind=engine, autoflush=False, expire_on_commit=False)
    # Drop and recreate so every run starts from an empty table.
    metadata.drop_all(engine)
    metadata.create_all(engine)

init_db()
# df is the dataframe which has the data you want to load.
# Build one mapping dict per dataframe row, then hand the whole list to
# bulk_insert_mappings in a single call.
mappings = [
    {
        "index": '{}'.format(df_index),
        "row_num": '{}'.format(df_row['row_num']),
        "load_date": '{}'.format(df_row['load_date']),
        "cyc_id": '{}'.format(df_row['cyc_id']),
    }
    for df_index, df_row in df.iterrows()
]
session.bulk_insert_mappings(test_table, mappings)
session.commit()
If the dataframe is too large (more than 10,000 rows), it is recommended to split the dataframe into smaller chunks and upload it one chunk at a time.
def chunker(seq, size):
    """Lazily yield consecutive slices of *seq* of length *size*.

    The final slice may be shorter when ``len(seq)`` is not a multiple of
    *size*. Works on anything sliceable (lists, strings, dataframes, ...).
    """
    return (seq[start:start + size] for start in range(0, len(seq), size))
batch_size = 10000  # test different batch sizes
# Upload the dataframe one chunk at a time, committing after each chunk so
# a very large frame never has to be flushed in a single transaction.
for chunk_df in chunker(df, batch_size):
    rows = []
    for df_index, df_row in chunk_df.iterrows():
        rows.append(
            dict(
                index='{}'.format(df_index),
                row_num='{}'.format(df_row['row_num']),
                load_date='{}'.format(df_row['load_date']),
                cyc_id='{}'.format(df_row['cyc_id']),
            )
        )
    session.bulk_insert_mappings(test_table, rows)
    session.commit()