Mi buen amigo Pakdi Decnud me dio una excelente idea mientras estabamos almorzando el mismo dia del evento SAP CodeJam Montreal.
Pakdi me dijo..."Porque no haces una comparacion entre SAP HANA y digamos...MongoDB"...yo pense por supuesto, que era una excelente idea...asi que ayer me puse a investigar MongoDB...asi que seguramente se preguntaran..."Porque entonces estas hablando de MySQL y PostgreSQL?"
Respuesta sencilla...y aqui esta lo que pienso...
- Realmente no entiendo MongoDB...todo el No-SQL es marciano para mi...
- MongoDB esta basado en "Documentos", lo cual significa que creas colecciones de documentos , no bases de datos o tablas...
- MongoDB no soporta Inner Joins y las agregaciones necesitan un framework que es mas raro que el mismo MongoDB...
- MongoDB no esta pensado para aplicaciones empresariales
Para este blog, queria tener bastante informacion...asi que como siempre, tome a mi amado Python y cree un script para generar 1 millon de registros para dos tablas. Un script por tabla.
La estructura de las tablas se muestra a continuacion...
DOC_HEADER
| Field Name | Data Type | Length |
|---|---|---|
| DOCUMENT_ID | VARCHAR | 8 |
| YEAR | VARCHAR | 4 |
| AREA | VARCHAR | 2 |
| NAME | VARCHAR | 20 |
| LAST_NAME | VARCHAR | 20 |
DOC_DETAIL
| Field Name | Data Type | Length |
|---|---|---|
| DOCUMENT_ID | VARCHAR | 8 |
| YEAR | VARCHAR | 4 |
| AREA | VARCHAR | 2 |
| AMOUNT | VARCHAR | |
Y este es el script para generar 1 millon de registros en un archivo .CSV
| Doc_Header_Generator.py |
|---|
import random
import csv

# Candidate values used to populate the generated DOC_HEADER rows.
names = ["Anne", "Gigi", "Juergen", "Ingo", "Inga", "Alvaro", "Mario",
         "Julien", "Mike", "Michael", "Karin", "Rui", "John", "Rocky", "Sebastian",
         "Kai-Yin", "Hester", "Katrin", "Uwe", "Vitaliy"]
last_names = ["Hardy", "Read", "Schmerder", "Sauerzapf", "Bereza", "Tejada",
              "Herger", "Vayssiere", "Flynn", "Byczkowski", "Schattka",
              "Nogueira", "Mayerhofer", "Ongkowidjojo", "Wieczorek", "Gau", "Hilbrecht",
              "Staehr", "Kylau", "Rudnytskiy"]
area = ["001", "002", "003", "004", "005", "006", "007", "008", "009", "010"]
year = ["2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007",
        "2008", "2009", "2010", "2011", "2012"]


def Generate_File(pSchema, pNumber):
    """Write pNumber random DOC_HEADER rows to Doc_Header.csv.

    Each row is (DOCUMENT_ID, YEAR, AREA, NAME, LAST_NAME).
    pSchema is kept for signature compatibility but is not used
    by the original script either.
    """
    # newline="" is the csv-module requirement on Python 3; the original
    # opened the file in "wb", which only worked on Python 2, and never
    # closed it -- the with-block fixes both.
    with open("Doc_Header.csv", "w", newline="") as f:
        writer = csv.writer(f)
        for _ in range(pNumber):
            writer.writerow([
                random.randrange(1, 999999),   # DOCUMENT_ID
                random.choice(year),           # YEAR  (uniform, same as randrange index)
                random.choice(area),           # AREA
                random.choice(names),          # NAME
                random.choice(last_names),     # LAST_NAME
            ])


if __name__ == "__main__":
    # int() is required on Python 3, where input() returns a string.
    num_files = int(input("How many records?: \n"))
    # BUG FIX: the original called Generate_File(num_files) with a single
    # argument although the function takes two -- a guaranteed TypeError.
    Generate_File("", num_files)
|
| Doc_Detail_Generator.py |
|---|
import random
import csv

# Candidate values used to populate the generated DOC_DETAIL rows.
area = ["001", "002", "003", "004", "005", "006", "007", "008", "009", "010"]
year = ["2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007",
        "2008", "2009", "2010", "2011", "2012"]


def Generate_File(pSchema, pNumber):
    """Write pNumber random DOC_DETAIL rows to Doc_Detail.csv.

    Each row is (DOCUMENT_ID, YEAR, AREA, AMOUNT).
    pSchema is kept for signature compatibility but is not used
    by the original script either.
    """
    # newline="" is the csv-module requirement on Python 3; the original
    # opened the file in "wb", which only worked on Python 2, and never
    # closed it -- the with-block fixes both.
    with open("Doc_Detail.csv", "w", newline="") as f:
        writer = csv.writer(f)
        for _ in range(pNumber):
            writer.writerow([
                random.randrange(1, 999999),    # DOCUMENT_ID
                random.choice(year),            # YEAR
                random.choice(area),            # AREA
                random.randrange(0, 10000),     # AMOUNT (step=1 was redundant)
            ])


if __name__ == "__main__":
    # int() is required on Python 3, where input() returns a string.
    num_files = int(input("How many records?: \n"))
    # BUG FIX: the original called Generate_File(num_files) with a single
    # argument although the function takes two -- a guaranteed TypeError.
    Generate_File("", num_files)
|
Con los dos archivos listos, los cargue en MySQL, PostgreSQL y SAP HANA.
Para medir la velocidad, cree tres scripts de Python utilizando...si...otra vez Bottle...
La idea basicamente es hacer join a las dos tablas, seleccionar el Document_Id, Year, Area y la suma del Amount.
Empecemos con el script de MySQL...
| MySQL_Bottle_Documents.py |
|---|
from bottle import get, run
import mysql.connector
import time


@get('/show_query')
def show_form():
    """Join DOC_HEADER/DOC_DETAIL in MySQL, aggregate AMOUNT per document,
    and render the result as an HTML table, timing the whole round trip."""
    counter = 0
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for wall-clock interval timing.
    start = time.perf_counter()
    conn = mysql.connector.Connect(host='localhost', user='root',
                                   password='root', database='P075400')
    cur = conn.cursor()
    query = '''SELECT A.DOCUMENT_ID, A.YEAR, A.AREA, SUM(AMOUNT)
               FROM DOC_HEADER AS A INNER JOIN DOC_DETAIL AS B
               WHERE A.DOCUMENT_ID = B.DOCUMENT_ID
               AND A.YEAR = B.YEAR
               AND A.AREA = B.AREA
               GROUP BY DOCUMENT_ID, YEAR, AREA'''
    cur.execute(query)
    # Collect fragments and join once at the end: string += per row over
    # thousands of rows is quadratic.
    parts = ["<DIV ALIGN='CENTER'><TABLE BORDER='1'>",
             "<TR BGCOLOR='#B9C9FE'>",
             "<TH>Document</TH><TH>Year</TH>",
             "<TH>Area</TH><TH>Amount</TH>",
             "</TR>"]
    for row in cur:
        counter += 1
        document_id = str(row[0])
        year = str(row[1])
        area = str(row[2])
        amount = str(row[3])
        parts.append("<TR BGCOLOR='#E8EDFF'>")
        # BUG FIX: the original split the expression after '%' onto the
        # next line with no continuation -- a SyntaxError. Parenthesized
        # operands make the expression well formed.
        parts.append(('''<TD>%s</TD><TD>%s</TD>
<TD>%s</TD><TD>%s</TD>''' % (document_id, year, area, amount)))
        parts.append("</TR>")
    parts.append("</TABLE>")
    end = time.perf_counter()
    time_taken = end - start
    # BUG FIX: the heading in the MySQL script wrongly said "SAP HANA".
    parts.append("<H1>MySQL - %s records in %s seconds</H1></DIV>"
                 % (counter, time_taken))
    cur.close()
    conn.close()
    return "".join(parts)


run(host='localhost', port=8080)
|
Deje que el script corriera...y despues de mas de una hora...simplemente me aburri y termine el proceso...
Asi que, continue con PostgreSQL...
| PostgreSQL_Bottle_Documents.py |
|---|
from bottle import get, run
import psycopg2
import time


@get('/show_query')
def show_form():
    """Join DOC_HEADER/DOC_DETAIL in PostgreSQL, aggregate AMOUNT per
    document, and render the result as an HTML table with timing."""
    counter = 0
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for wall-clock interval timing.
    start = time.perf_counter()
    conn = psycopg2.connect("dbname=P075400 user=postgres password=root")
    cur = conn.cursor()
    # Quoted identifiers because the tables were created with upper-case
    # names; PostgreSQL folds unquoted identifiers to lower case.
    query = '''SELECT "DOC_HEADER"."DOCUMENT_ID", "DOC_HEADER"."YEAR",
               "DOC_HEADER"."AREA", SUM("DOC_DETAIL"."AMOUNT") FROM
               public."DOC_HEADER", public."DOC_DETAIL" WHERE
               "DOC_HEADER"."DOCUMENT_ID" = "DOC_DETAIL"."DOCUMENT_ID"
               AND "DOC_HEADER"."YEAR" = "DOC_DETAIL"."YEAR"
               AND "DOC_HEADER"."AREA" = "DOC_DETAIL"."AREA"
               GROUP BY "DOC_HEADER"."DOCUMENT_ID", "DOC_HEADER"."YEAR",
               "DOC_HEADER"."AREA"'''
    cur.execute(query)
    # Collect fragments and join once at the end: string += per row over
    # thousands of rows is quadratic.
    parts = ["<DIV ALIGN='CENTER'><TABLE BORDER='1'>",
             "<TR BGCOLOR='#B9C9FE'>",
             "<TH>Document</TH><TH>Year</TH>",
             "<TH>Area</TH><TH>Amount</TH>",
             "</TR>"]
    for row in cur:
        counter += 1
        document_id = str(row[0])
        year = str(row[1])
        area = str(row[2])
        amount = str(row[3])
        parts.append("<TR BGCOLOR='#E8EDFF'>")
        # BUG FIX: the original split the expression after '%' onto the
        # next line with no continuation -- a SyntaxError. Parenthesized
        # operands make the expression well formed.
        parts.append(('''<TD>%s</TD><TD>%s</TD>
<TD>%s</TD><TD>%s</TD>''' % (document_id, year, area, amount)))
        parts.append("</TR>")
    parts.append("</TABLE>")
    end = time.perf_counter()
    time_taken = end - start
    parts.append("<H1>PostgreSQL - %s records in %s seconds</H1></DIV>"
                 % (counter, time_taken))
    cur.close()
    conn.close()
    return "".join(parts)


run(host='localhost', port=8080)
|
Esta vez...tuve suerte...
Con 2 millones de registros, PostgreSQL logro agregar el campo Amount y generar 7669 registros en 36 segundos...nada mal...
Para SAP HANA, decidi tomar total ventaja de los Calculation Views, asi que cree lo siguiente...
Hice un join a las dos tablas, use una proyeccion, aplique la agregacion y especifique el resultado...luego escribi este script en Python...
| SAPHANA_Bottle_Documents.py |
|---|
from bottle import get, run
import dbapi
import time


@get('/show_query')
def show_form():
    """(Re)create a SQLScript procedure wrapping the CV_DOCUMENTS
    calculation view on SAP HANA, query its result view, and render the
    aggregated rows as an HTML table with timing."""
    counter = 0
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for wall-clock interval timing.
    start = time.perf_counter()
    conn = dbapi.connect('hanasvr-02', 30015, 'P075400', '5pA5kb6i')
    cur = conn.cursor()
    # Best-effort cleanup: the type/procedure may not exist yet, so a
    # failing DROP is expected and deliberately ignored.
    try:
        ret = cur.execute("drop type test_out")
    except dbapi.Error:
        pass
    try:
        ret = cur.execute("drop procedure pyTest")
    except dbapi.Error:
        pass
    # SQL text preserved verbatim from the working script (including the
    # backslash line-continuations inside the literals).
    queries = ['''create type test_out as table (DOCUMENT_ID NVARCHAR(8),
YEAR VARCHAR(4), AREA VARCHAR(2), AMOUNT BIGINT)''',
               '''create procedure pyTest (OUT out_test TEST_OUT)\
language sqlscript reads sql data with result view ProcView as\
begin\
out_test = CE_CALC_VIEW("_SYS_BIC"."blag/CV_DOCUMENTS",
["DOCUMENT_ID","YEAR","AREA","AMOUNT"]);\
end''']
    for query in queries:
        cur.execute(query)
    conn.commit()
    query = '''select DOCUMENT_ID, YEAR, AREA, AMOUNT from ProcView'''
    cur.execute(query)
    ret = cur.fetchall()
    # Collect fragments and join once at the end: string += per row over
    # thousands of rows is quadratic.
    parts = ["<DIV ALIGN='CENTER'><TABLE BORDER='1'>",
             "<TR BGCOLOR='#B9C9FE'>",
             "<TH>Document</TH><TH>Year</TH>",
             "<TH>Area</TH><TH>Amount</TH>",
             "</TR>"]
    for row in ret:
        counter += 1
        document_id = str(row["DOCUMENT_ID"])
        year = str(row["YEAR"])
        area = str(row["AREA"])
        amount = str(row["AMOUNT"])
        parts.append("<TR BGCOLOR='#E8EDFF'>")
        # BUG FIX: the original split the expression after '%' onto the
        # next line with no continuation -- a SyntaxError. Parenthesized
        # operands make the expression well formed.
        parts.append(('''<TD>%s</TD><TD>%s</TD>
<TD>%s</TD><TD>%s</TD>''' % (document_id, year, area, amount)))
        parts.append("</TR>")
    parts.append("</TABLE>")
    end = time.perf_counter()
    time_taken = end - start
    parts.append("<H1>SAP HANA - %s records in %s seconds</H1></DIV>"
                 % (counter, time_taken))
    return "".join(parts)


run(host='localhost', port=8080)
|
Luego de la ejecucion...no podia estar mas feliz...aqui esta el resultado...
SAP HANA manejo los mismos 2 millones de registros...genero los mismos 7669 registros agregados en solo 18 segundos...eso es 50% mas rapido que PostgreSQL y...bueno...solo digamos que...bastante mas rapido que MySQL...
Ahora...diganme que SAP HANA no es la mejor y mas rapida Base de Datos del mercado...Los reto -:)
Luego de hacer un ligero cambio en mi script de Python para SAP HANA...el nuevo tiempo de procesamiento, sin generar la tabla en Bottle es...
Saludos,
Blag.




No comments:
Post a Comment