treasure-data / pandas-td Goto Github PK
View Code? Open in Web Editor NEW. Interactive data analysis with Pandas and Treasure Data.
Home Page: http://docs.treasuredata.com/articles/jupyter-pandas
License: Apache License 2.0
Interactive data analysis with Pandas and Treasure Data.
Home Page: http://docs.treasuredata.com/articles/jupyter-pandas
License: Apache License 2.0
The to_td function fails due to the following error.
--Error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~/miniconda3/envs/analysis/lib/python3.5/http/client.py in send(self, data)
907 try:
--> 908 self.sock.sendall(data)
909 except TypeError:
~/miniconda3/envs/analysis/lib/python3.5/site-packages/urllib3/contrib/pyopenssl.py in sendall(self, data)
312 while total_sent < len(data):
--> 313 sent = self._send_until_done(data[total_sent:total_sent + SSL_WRITE_BLOCKSIZE])
314 total_sent += sent
~/miniconda3/envs/analysis/lib/python3.5/site-packages/urllib3/contrib/pyopenssl.py in _send_until_done(self, data)
300 try:
--> 301 return self.connection.send(data)
302 except OpenSSL.SSL.WantWriteError:
~/miniconda3/envs/analysis/lib/python3.5/site-packages/OpenSSL/SSL.py in send(self, buf, flags)
1723 if not isinstance(buf, bytes):
-> 1724 raise TypeError("data must be a memoryview, buffer or byte string")
1725 if len(buf) > 2147483647:
TypeError: data must be a memoryview, buffer or byte string
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-6-fdf3eeff2cf7> in <module>()
----> 1 td.to_td(df, 'kammy.tab0828_1', con, if_exists='replace', index=False)
~/miniconda3/envs/analysis/lib/python3.5/site-packages/pandas_td/td.py in to_td(frame, name, con, if_exists, time_col, time_index, index, index_label, chunksize, date_format)
720
721 # upload
--> 722 uploader.upload_frame(frame, chunksize)
723 uploader.wait_for_import(len(frame))
724
~/miniconda3/envs/analysis/lib/python3.5/site-packages/pandas_td/td.py in upload_frame(self, frame, chunksize)
407 self._display_progress(0)
408 for i, chunk in enumerate(self._chunk_frame(frame, chunksize)):
--> 409 self._upload(self._gzip(self._pack(chunk)))
410 self._display_progress(min(i * chunksize + chunksize, self.frame_size))
411 self.uploaded_at = datetime.datetime.utcnow().replace(microsecond=0)
~/miniconda3/envs/analysis/lib/python3.5/site-packages/pandas_td/td.py in _upload(self, data)
397 data_size = len(data)
398 unique_id = uuid.uuid4()
--> 399 elapsed = self.client.import_data(self.database, self.table, 'msgpack.gz', data, data_size, unique_id)
400 logger.debug('imported %d bytes in %.3f secs', data_size, elapsed)
401
~/miniconda3/envs/analysis/lib/python3.5/site-packages/tdclient/client.py in import_data(self, db_name, table_name, format, bytes_or_stream, size, unique_id)
600 Returns: second in float represents elapsed time to import data
601 """
--> 602 return self.api.import_data(db_name, table_name, format, bytes_or_stream, size, unique_id=unique_id)
603
604 def import_file(self, db_name, table_name, format, file, unique_id=None):
~/miniconda3/envs/analysis/lib/python3.5/site-packages/tdclient/import_api.py in import_data(self, db, table, format, bytes_or_stream, size, unique_id)
37
38 kwargs = {}
---> 39 with self.put(path, bytes_or_stream, size, **kwargs) as res:
40 code, body = res.status, res.read()
41 if code / 100 != 2:
~/miniconda3/envs/analysis/lib/python3.5/site-packages/tdclient/api.py in put(self, path, bytes_or_stream, size, headers, **kwargs)
273 response = None
274 try:
--> 275 response = self.send_request("PUT", url, body=stream, headers=headers, decode_content=True, preload_content=False)
276 if response.status < 500:
277 pass
~/miniconda3/envs/analysis/lib/python3.5/site-packages/tdclient/api.py in send_request(self, method, url, fields, body, headers, **kwargs)
357 return self.http.request(method, url, fields=fields, headers=headers, **kwargs)
358 else:
--> 359 return self.http.urlopen(method, url, body=body, headers=headers, **kwargs)
360
361 def raise_error(self, msg, res, body):
~/miniconda3/envs/analysis/lib/python3.5/site-packages/urllib3/poolmanager.py in urlopen(self, method, url, redirect, **kw)
319 response = conn.urlopen(method, url, **kw)
320 else:
--> 321 response = conn.urlopen(method, u.request_uri, **kw)
322
323 redirect_location = redirect and response.get_redirect_location()
~/miniconda3/envs/analysis/lib/python3.5/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
598 timeout=timeout_obj,
599 body=body, headers=headers,
--> 600 chunked=chunked)
601
602 # If we're going to release the connection in ``finally:``, then
~/miniconda3/envs/analysis/lib/python3.5/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
354 conn.request_chunked(method, url, **httplib_request_kw)
355 else:
--> 356 conn.request(method, url, **httplib_request_kw)
357
358 # Reset the timeout for the recv() on the socket
~/miniconda3/envs/analysis/lib/python3.5/http/client.py in request(self, method, url, body, headers)
1105 def request(self, method, url, body=None, headers={}):
1106 """Send a complete request to the server."""
-> 1107 self._send_request(method, url, body, headers)
1108
1109 def _set_content_length(self, body, method):
~/miniconda3/envs/analysis/lib/python3.5/http/client.py in _send_request(self, method, url, body, headers)
1150 # default charset of iso-8859-1.
1151 body = _encode(body, 'body')
-> 1152 self.endheaders(body)
1153
1154 def getresponse(self):
~/miniconda3/envs/analysis/lib/python3.5/http/client.py in endheaders(self, message_body)
1101 else:
1102 raise CannotSendHeader()
-> 1103 self._send_output(message_body)
1104
1105 def request(self, method, url, body=None, headers={}):
~/miniconda3/envs/analysis/lib/python3.5/http/client.py in _send_output(self, message_body)
934 self.send(msg)
935 if message_body is not None:
--> 936 self.send(message_body)
937
938 def putrequest(self, method, url, skip_host=False,
~/miniconda3/envs/analysis/lib/python3.5/http/client.py in send(self, data)
910 if isinstance(data, collections.Iterable):
911 for d in data:
--> 912 self.sock.sendall(d)
913 else:
914 raise TypeError("data should be a bytes-like object "
~/miniconda3/envs/analysis/lib/python3.5/site-packages/urllib3/contrib/pyopenssl.py in sendall(self, data)
310 def sendall(self, data):
311 total_sent = 0
--> 312 while total_sent < len(data):
313 sent = self._send_until_done(data[total_sent:total_sent + SSL_WRITE_BLOCKSIZE])
314 total_sent += sent
TypeError: object of type 'int' has no len()
This error happens if "requests" library version is over 2.16.0.
The current "requests" version is v2.19.1, while v2.15.1 (the latest version that still works) is somewhat old.
It would be better to fix the error and support the latest "requests" version.
NG
2.19.1
2.18.4
2.17.3
2.17.2
2.17.1
2.17.0
2.16.5
2.16.0
OK
2.15.1
2.14.0
2.10.0
I have a dataframe to import, but the vast majority of rows in 2 of the columns are blank (the other columns don't have any blanks).
When I use 'dtypes' on the DataFrame i get a float64 for those columns.
After a successful import using "to_td", when I look at my table in TD those 2 columns are missing.
Is it because the first record uploaded for those columns is blank?
Is there a way to sort my DataFrame to avoid that or a parameter in the "to_td" method?
I need an api for getting executed job result easy to use like following.
df = td.read_td_job(engine, job_id)
I know I can get it using ResultProxy,
but it is tedious...
df = td.td.ResultProxy(engine, con.client.job(job_id)).to_dataframe()
Hi, dev team members!
Our team is using pandas-td
in the ETL batch.
Thank you very much for developing a useful tool!
By the way, the below error was raised when I execute pd_td.read_td()
method.
Traceback (most recent call last):
...
File "venv/lib/python3.6/site-packages/pandas_td/td.py", line 269, in __iter__
for record in msgpack.Unpacker(self, encoding='utf-8'):
File "msgpack/_unpacker.pyx", line 317, in msgpack._cmsgpack.Unpacker.__init__
TypeError: __init__() got an unexpected keyword argument 'encoding'
And when I ignored the encoding
argument from __iter__
function, it worked as expected.
This is just my guess, but this error is probably related to a msgpack library update.
I think there are a few approaches to avoid this, e.g., removing the encoding
argument from pandas_td.td.ResultProxy.__iter__().
How do you think?
to_td function is not able to upload numpy.datetime64 due to "can't serialize Timestamp".
The following is an example.
As a workaround, I need to convert it to str type.
import os
import sys
import pandas as pd
import pandas_td as td
import MySQLdb
from datetime import datetime
con = td.connect(apikey='test', endpoint='https://api.treasuredata.com')
db = MySQLdb.connect(host="test",
user="test", # your username
passwd="test", # your password
db="test")
info = pd.read_sql_query('desc ticket23235',db)
info
Field | Type | Null | Key | Default | Extra | |
---|---|---|---|---|---|---|
0 | id | int(11) | YES | None | ||
1 | t_timestamp | timestamp | YES | None | ||
2 | t_datetime | datetime | YES | None |
info = pd.read_sql_query('select id, t_timestamp from ticket23235',db)
info
id | t_timestamp | |
---|---|---|
0 | 0 | 2017-03-19 23:07:56 |
type(info.t_timestamp.values[0])
numpy.datetime64
td.to_td(info, 'db.test', con, if_exists = 'replace')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-83-925499af4e5e> in <module>()
----> 1 td.to_td(info, 'support.ticket23235', con, if_exists = 'replace')
/Users/takahashi/.pyenv/versions/2.7.12/lib/python2.7/site-packages/pandas_td/td.pyc in to_td(frame, name, con, if_exists, time_col, time_index, index, index_label, chunksize, date_format)
720
721 # upload
--> 722 uploader.upload_frame(frame, chunksize)
723 uploader.wait_for_import(len(frame))
724
/Users/takahashi/.pyenv/versions/2.7.12/lib/python2.7/site-packages/pandas_td/td.pyc in upload_frame(self, frame, chunksize)
407 self._display_progress(0)
408 for i, chunk in enumerate(self._chunk_frame(frame, chunksize)):
--> 409 self._upload(self._gzip(self._pack(chunk)))
410 self._display_progress(min(i * chunksize + chunksize, self.frame_size))
411 self.uploaded_at = datetime.datetime.utcnow().replace(microsecond=0)
/Users/takahashi/.pyenv/versions/2.7.12/lib/python2.7/site-packages/pandas_td/td.pyc in _pack(self, chunk)
385 row = row.astype('object')
386 row.dropna(inplace=True)
--> 387 packer.pack(dict(row))
388 return packer.bytes()
389
msgpack/_packer.pyx in msgpack._packer.Packer.pack (msgpack/_packer.cpp:3661)()
msgpack/_packer.pyx in msgpack._packer.Packer.pack (msgpack/_packer.cpp:3503)()
msgpack/_packer.pyx in msgpack._packer.Packer._pack (msgpack/_packer.cpp:2657)()
msgpack/_packer.pyx in msgpack._packer.Packer._pack (msgpack/_packer.cpp:3382)()
TypeError: can't serialize Timestamp('2017-03-19 23:07:56')
info.t_timestamp = info.t_timestamp.astype('string')
type(info.t_timestamp.values[0])
str
td.to_td(info, 'db.etst', con, if_exists = 'replace')
If you rename a column in pandas you can't seem to access it correctly in the time_col
parameter.
>>> df_renamed = pd.DataFrame(index=range(4), columns=('foo','to_be_time'))
>>> for i in range(4):
... df_renamed['foo'][i] = i
... df_renamed['to_be_time'][i]=10000*i
...
>>> print df_renamed
foo to_be_time
0 0 0
1 1 10000
2 2 20000
3 3 30000
>>> df_renamed.rename(columns={'to_be_time':'time'}, inplace=True)
>>> print df_renamed
foo time
0 0 0
1 1 10000
2 2 20000
3 3 30000
>>> pandas_td.to_td(df_renamed, 'kh_dev.test', td_conn, if_exists='replace', time_col='time', index=False)
>>> sleep(10)
>>> pandas_td.read_td_table('test', engine)
foo time
0 0 0
1 1 0
2 2 0
3 3 0
Other renamed columns seem to be fine.
Got an exception when downloading a large result:
# issued at 2015-12-04T07:16:53Z
URL: https://console.treasuredata.com/jobs/xxx
# started at 2015-12-04T07:16:54Z
2015-12-04 07:17:18: rows pending running done / total
Stage-0: 40 0 0 1 / 1
Stage-1: 751M 0 0 118 / 118
Result size: 143,064,012 bytes
Download: 121,634,816 / 143,064,012 bytes (85.02%)
---------------------------------------------------------------------------
TimeoutError Traceback (most recent call last)
<ipython-input-16-167c480c3e61> in <module>()
----> 1 get_ipython().run_cell_magic('td_presto', '-o d1', "select ...")
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2291 magic_arg_s = self.var_expand(line, stack_depth)
2292 with self.builtin_trap:
-> 2293 result = fn(magic_arg_s, cell)
2294 return result
2295
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/pandas_td/ipython.py in td_presto(self, line, cell)
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
191 # but it's overkill for just that one bit of state.
192 def magic_deco(arg):
--> 193 call = lambda f, *a, **k: f(*a, **k)
194
195 if callable(arg):
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/pandas_td/ipython.py in td_presto(self, line, cell)
307 @magic.cell_magic
308 def td_presto(self, line, cell):
--> 309 return self.run_query('presto', line, cell)
310
311 # extension
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/pandas_td/ipython.py in run_query(self, engine_type, line, cell)
289 if args.dry_run:
290 return self.display_code_block()
--> 291 d = td.read_td_query(query, engine)
292
293 # output
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/pandas_td/td.py in read_td_query(query, engine, index_col, parse_dates, distributed_join, params)
528 # execute
529 r = engine.execute(header + query, **params)
--> 530 return r.to_dataframe(index_col=index_col, parse_dates=parse_dates)
531
532 def read_td_job(job_id, engine, index_col=None, parse_dates=None):
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/pandas_td/td.py in to_dataframe(self, index_col, parse_dates)
283 def to_dataframe(self, index_col=None, parse_dates=None):
284 columns = [c[0] for c in self.description]
--> 285 frame = pd.DataFrame(iter(self), columns=columns)
286 if parse_dates is not None:
287 frame = self._parse_dates(frame, parse_dates)
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
258 elif isinstance(data, (list, types.GeneratorType)):
259 if isinstance(data, types.GeneratorType):
--> 260 data = list(data)
261 if len(data) > 0:
262 if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/pandas_td/td.py in __iter__(self)
267
268 def __iter__(self):
--> 269 for record in msgpack.Unpacker(self, encoding='utf-8'):
270 yield record
271
msgpack/_unpacker.pyx in msgpack._unpacker.Unpacker.__next__ (msgpack/_unpacker.cpp:459)()
msgpack/_unpacker.pyx in msgpack._unpacker.Unpacker._unpack (msgpack/_unpacker.cpp:400)()
msgpack/_unpacker.pyx in msgpack._unpacker.Unpacker.read_from_file (msgpack/_unpacker.cpp:367)()
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/pandas_td/td.py in read(self, size)
262 self._iter = self.engine.iter_content(self.job, size)
263 try:
--> 264 return next(self._iter)
265 except StopIteration:
266 return ''
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/pandas_td/td.py in iter_content(self, job, chunk_size)
229 d = zlib.decompressobj(16+zlib.MAX_WBITS)
230 with contextlib.closing(self._start_download(job)) as r:
--> 231 for chunk in r.iter_content(chunk_size):
232 curval += len(chunk)
233 self._display_progress(job, curval)
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/requests/models.py in generate()
655 if hasattr(self.raw, 'stream'):
656 try:
--> 657 for chunk in self.raw.stream(chunk_size, decode_content=True):
658 yield chunk
659 except ProtocolError as e:
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/requests/packages/urllib3/response.py in stream(self, amt, decode_content)
324 else:
325 while not is_fp_closed(self._fp):
--> 326 data = self.read(amt=amt, decode_content=decode_content)
327
328 if data:
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/site-packages/requests/packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
280 else:
281 cache_content = False
--> 282 data = self._fp.read(amt)
283 if amt != 0 and not data: # Platform-specific: Buggy versions of Python.
284 # Close the connection when no data is returned
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/http/client.py in read(self, amt)
431 # Amount is given, implement using readinto
432 b = bytearray(amt)
--> 433 n = self.readinto(b)
434 return memoryview(b)[:n].tobytes()
435 else:
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/http/client.py in readinto(self, b)
471 # connection, and the user is reading more bytes than will be provided
472 # (for example, reading in 1k chunks)
--> 473 n = self.fp.readinto(b)
474 if not n and b:
475 # Ideally, we would raise IncompleteRead if the content-length
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/socket.py in readinto(self, b)
569 while True:
570 try:
--> 571 return self._sock.recv_into(b)
572 except timeout:
573 self._timeout_occurred = True
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/ssl.py in recv_into(self, buffer, nbytes, flags)
922 "non-zero flags not allowed in calls to recv_into() on %s" %
923 self.__class__)
--> 924 return self.read(nbytes, buffer)
925 else:
926 return socket.recv_into(self, buffer, nbytes, flags)
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/ssl.py in read(self, len, buffer)
784 raise ValueError("Read on closed or unwrapped SSL socket.")
785 try:
--> 786 return self._sslobj.read(len, buffer)
787 except SSLError as x:
788 if x.args[0] == SSL_ERROR_EOF and self.suppress_ragged_eofs:
/Users/knishida/miniconda3/envs/workflow/lib/python3.5/ssl.py in read(self, len, buffer)
568 """
569 if buffer is not None:
--> 570 v = self._sslobj.read(len, buffer)
571 else:
572 v = self._sslobj.read(len or 1024)
TimeoutError: [Errno 60] Operation timed out
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Something interesting about the web. A new door to the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Something interesting about visualization, using data as art.
Something interesting about games, making everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.