Section 4: Field Aliasing, Serialization and Deserialization(下)

說明:為了簡化,本文下方會把幾個簡短的輸出測試寫在一起,例如:

m = Model(FirstName="Isaac")
m

data = {"FirstName": "Isaac"}  # dict
m = Model.model_validate(data)
m
Model(first_name='Isaac')
Model(first_name='Isaac')

alias, validation_alias & serialization_alias

原則:只要設定 validation_alias 或 serialization_alias,優先程度就比純 alias 高。

將普通別名(alias)視為反序列化與序列化的預設值,並將驗證別名(validation_alias)與序列化別名(serialization_alias)視為覆蓋。

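| 情況 | set alias | set validation_alias | set serialization_alias | deserialization | serialization |
|---|---|---|---|---|---|
| :one: | ✓ | | | alias | alias |
| :two: | ✓ | ✓ | | validation_alias | alias |
| :three: | ✓ | | ✓ | alias | serialization_alias |
| :four: | ✓ | ✓ | ✓ | validation_alias | serialization_alias |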

▌僅設定 validation_alias,未設定 alias

from pydantic import BaseModel, Field, ConfigDict, ValidationError

# Demo model: `first_name` declares only a validation alias ("FirstName").
class Model(BaseModel):
    # populate_by_name=True: we can now deserialize either by alias, if any, or field name
    model_config = ConfigDict(populate_by_name=True)
    # Input key "FirstName" populates this field; dumps keep the field name
    # even with by_alias=True (see the outputs shown below the snippet).
    first_name: str = Field(validation_alias="FirstName")  # no plain `alias` is set

m = Model(FirstName="Isaac")
m

data = {"FirstName": "Isaac"}  # dict
m = Model.model_validate(data)
m
Model(first_name='Isaac')
Model(first_name='Isaac')
m.model_dump()
m.model_dump(by_alias=True)
{'first_name': 'Isaac'}
{'first_name': 'Isaac'}

The validation alias is only used for deserializing.

▌alias 與 validation_alias 皆設定

# Demo model: `first_name` sets both a plain alias and a validation alias.
class Model(BaseModel):
    model_config = ConfigDict(populate_by_name=True)
    # Validation reads the "FirstName" key; model_dump(by_alias=True) emits
    # "firstName" (the plain alias), as the outputs below the snippet show.
    first_name: str = Field(validation_alias="FirstName", alias="firstName")

m = Model.model_validate(data)
m
Model(first_name='Isaac')
m.model_dump()
m.model_dump(by_alias=True)  # 指定 by alias
{'first_name': 'Isaac'} # 仍是 field name
{'firstName': 'Isaac'}  # 因為有指定 by alias

▌alias、validation_alias 與 serialization_alias 皆設定

# Demo model: `first_name` sets all three alias kinds at once.
class Model(BaseModel):
    model_config = ConfigDict(populate_by_name=True)
    
    # Validation reads "FirstName"; model_dump(by_alias=True) emits "givenName",
    # i.e. the serialization alias overrides the plain alias on output.
    first_name: str = Field(
        validation_alias="FirstName", 
        alias="firstName", 
        serialization_alias="givenName"
    )

m = Model.model_validate(data)
m
Model(first_name='Isaac')
m.model_dump()
m.model_dump(by_alias=True)  # 指定 by alias
{'first_name': 'Isaac'} # 仍是 field name
{'givenName': 'Isaac'}  # 因為 by alias 被 serialization_alias 取代(覆寫)了

▌為什麼需要這三種不同的別名?

範例:使用自動產生的別名,但其中一個欄位需要特殊的別名來進行驗證與序列化

這種情況下,first_name 與 last_name 將分別自動產生別名 firstName 與 lastName

from pydantic.alias_generators import to_camel

# Demo model: every field gets an auto-generated camelCase alias via `to_camel`.
class Model(BaseModel):
    # alias_generator=to_camel derives the aliases (first_name -> "firstName",
    # last_name -> "lastName"); populate_by_name=True also allows field names.
    model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
    first_name: str
    last_name: str

data = {
    "firstName": "Isaac",
    "lastName": "Newton"
}
m = Model.model_validate(data)
m.model_dump()
{'first_name': 'Isaac', 'last_name': 'Newton'}
# Demo model: generated camelCase aliases, with explicit overrides on one field.
class Model(BaseModel):
    model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
    # Explicit aliases take the place of the generated one for this field:
    # input key "FirstName", by-alias output key "givenName".
    first_name: str = Field(
        validation_alias="FirstName", serialization_alias="givenName"
    )
    # No explicit alias: uses the generated "lastName".
    last_name: str

data = {
    "FirstName": "Isaac",
    "lastName": "Newton"
}
m = Model.model_validate(data)
m
Model(first_name='Isaac', last_name='Newton')
m.model_dump()
m.model_dump(by_alias=True)
{'first_name': 'Isaac', 'last_name': 'Newton'}
{'givenName': 'Isaac', 'lastName': 'Newton'}

AliasChoices

指定 驗證別名 時,實際上可以定義許多個。

示範在反序列化資料時,同時處理 FirstName 與 GivenName 兩種欄位名稱

from pydantic import BaseModel, Field, ConfigDict, ValidationError
from pydantic import AliasChoices

# Demo model: `first_name` accepts several alternative input keys via AliasChoices.
class Model(BaseModel):
    model_config = ConfigDict(populate_by_name=True, alias_generator=to_camel)
    
    # Either "FirstName" or "GivenName" populates this field. When both keys are
    # present, the example output further below shows the "FirstName" value wins.
    first_name: str = Field(
        validation_alias=AliasChoices("FirstName", "GivenName"),  ## the point of this example
        serialization_alias="givenName"
    )
    last_name: str

data = {
    "FirstName": "Isaac",
    "lastName": "Newton"
}
m = Model.model_validate(data)
m
Model(first_name='Isaac', last_name='Newton')
m.model_dump()
m.model_dump(by_alias=True)
{'first_name': 'Isaac', 'last_name': 'Newton'}
{'givenName': 'Isaac', 'lastName': 'Newton'}
data = {
    "GivenName": "Isaac",
    "lastName": "Newton"
}
m = Model.model_validate(data)
m
Model(first_name='Isaac', last_name='Newton')
m.model_dump()
m.model_dump(by_alias=True)
{'first_name': 'Isaac', 'last_name': 'Newton'}
{'givenName': 'Isaac', 'lastName': 'Newton'}

如果同時包含這兩種情況會怎麼樣?

data = {
    "GivenName": "Isaac",
    "FirstName": "Isaac2",
    "lastName": "Newton"
}
m = Model.model_validate(data)
m
Model(first_name='Isaac2', last_name='Newton')

問題是:什麼情況下,我們會這樣使用?

處理設定檔時。可能具有定義各種資源的連接字串的設定,但不同資源對該設定使用不同的名稱。

例如:

data = {
    "databases": {
        "redis": {
            "name": "Local Redis",
            "redis_conn": "redis://secret@localhost:9000/1"
        },
        "pgsql": {
            "name": "Local Postgres",
            "pgsql_conn": "postgresql://user:secret@localhost"
        },
        "nosql": {
            "name": "Local MongoDB",
            "mongo_conn": "mongodb://USERNAME:PASSWORD@HOST/DATABASE"
        }
    }
}

我們想要一個適用於上述三種資料庫的 Model,因此讓同一個欄位接受多個不同的連接欄位名稱(AliasChoices),是我們可以使用的方法。

# One database entry: a display name plus a connection string.
class Database(BaseModel):
    name: str
    # The connection string may arrive under any of these source-specific keys;
    # whichever is present is stored in `connection`.
    connection: str = Field(
        validation_alias=AliasChoices("redis_conn", "pgsql_conn", "mongo_conn")
    )

databases = {}

for key, value in data["databases"].items():
    m = Database.model_validate(value)
    databases[key] = m

databases
{'redis': Database(name='Local Redis', connection='redis://secret@localhost:9000/1'),
 'pgsql': Database(name='Local Postgres', connection='postgresql://user:secret@localhost'),
 'nosql': Database(name='Local MongoDB', connection='mongodb://USERNAME:PASSWORD@HOST/DATABASE')}

Database

之後會介紹如何製作更複雜的 Model,來處理類似的情形,而不用我們手動處理。先看看簡單的示範:

# Container model: validates the whole "databases" mapping in one call,
# replacing the manual per-entry loop shown earlier.
class Databases(BaseModel):
    # Maps a resource key (e.g. "redis", "pgsql", "nosql") to its Database entry.
    databases: dict[str, Database]

databases = Databases.model_validate(data)
databases
Databases(databases={'redis': Database(name='Local Redis', connection='redis://secret@localhost:9000/1'), 'pgsql': Database(name='Local Postgres', connection='postgresql://user:secret@localhost'), 'nosql': Database(name='Local MongoDB', connection='mongodb://USERNAME:PASSWORD@HOST/DATABASE')})
print(databases.model_dump_json(indent=2))
{
  "databases": {
    "redis": {
      "name": "Local Redis",
      "connection": "redis://secret@localhost:9000/1"
    },
    "pgsql": {
      "name": "Local Postgres",
      "connection": "postgresql://user:secret@localhost"
    },
    "nosql": {
      "name": "Local MongoDB",
      "connection": "mongodb://USERNAME:PASSWORD@HOST/DATABASE"
    }
  }
}

@field_serializer

這節內容講欄位值客製化,最常見的兩個類型:

  1. 序列化 日期(date)& 日期時間(datetime) 物件。因為世界各地大家的習慣都不一樣。

  2. 浮點的小數位數

Pydantic 提供了四種方式來做 客製序列化

老師使用 @field_serializer 裝飾器來實作上述功能(日期時間、浮點的小數位數)。

參數

| Name | Type | Description | Default |
|---|---|---|---|
| fields | str | Which field(s) the method should be called on. | () |
| return_type | Any | Optional return type for the function, if omitted it will be inferred from the type annotation. | PydanticUndefined |
| mode | Literal['plain', 'wrap'] | The serialization mode. `plain` means the function will be called instead of the default serialization logic; `wrap` means the function will be called with an argument to optionally call the default serialization logic. | 'plain' |
| when_used | Literal['always', 'unless-none', 'json', 'json-unless-none'] | Determines when the serializer will be used for serialization. | 'always' |
| check_fields | bool \| None | Whether to check that the fields actually exist on the model. | None |

when_used

  • always :預設值。序列化成為 dict 或 JSON 時使用

  • unless-none :值為 None 時不使用

  • json :僅在序列化為 JSON 時使用

  • json-unless-none :序列化為 JSON 時使用,除非值為 None(即前兩者相加:json
    + unless-none)

回傳值

Type Description
Callable[[Any], Any] The decorator function.

:one: always + datetime

總是做序列化(一):值不為 None

from pydantic import BaseModel, field_serializer
from datetime import datetime

# Demo model: the field serializer runs on every dump (when_used="always").
class Model(BaseModel):
    dt: datetime | None = None

    @field_serializer("dt", when_used="always")
    def serialize_name(self, value):
        # Pass-through serializer: prints the received type so the demo shows
        # when it is called (for dict and JSON dumps, including None values).
        print(f"type = {type(value)}")
        return value

m = Model(dt="2020-01-01T12:00:00")
m

m.model_dump()

m.model_dump_json()
Model(dt=datetime.datetime(2020, 1, 1, 12, 0))

type = <class 'datetime.datetime'>
{'dt': datetime.datetime(2020, 1, 1, 12, 0)}

type = <class 'datetime.datetime'>
'{"dt":"2020-01-01T12:00:00"}'

:one: always + None

總是做序列化(一):值為 None

m = Model()
m

m.model_dump()

m.model_dump_json()
Model(dt=None)

type = <class 'NoneType'>
{'dt': None}

type = <class 'NoneType'>
'{"dt":null}'

:two: unless-none

值為 None 時不做序列化

from datetime import datetime

# Demo model: the field serializer is skipped when the value is None.
class Model(BaseModel):
    dt: datetime | None = None

    @field_serializer("dt", when_used="unless-none")
    def serialize_name(self, value):
        # Pass-through serializer: the print makes it visible that this is not
        # called for dt=None (no "type = ..." line in the outputs below).
        print(f"type = {type(value)}")
        return value

m = Model(dt="2020-01-01T12:00:00")
m

m.model_dump()

m = Model()
m

m.model_dump()

m.model_dump_json()
Model(dt=datetime.datetime(2020, 1, 1, 12, 0))

type = <class 'datetime.datetime'>
{'dt': datetime.datetime(2020, 1, 1, 12, 0)}

Model(dt=None)

{'dt': None}

'{"dt":null}'

:four: json-unless-none:

只處理 JSON,不處理 dict :heavy_plus_sign: 值為 None 時不處理

from datetime import datetime

# Demo model: the serializer runs only for JSON output and only for non-None values.
class Model(BaseModel):
    dt: datetime | None = None

    @field_serializer("dt", when_used="json-unless-none")
    def serialize_name(self, value):
        """Format `dt` as e.g. "2020/1/1 12:00 PM" for JSON output."""
        print(f"type = {type(value)}")
        # The `%-m` / `%-d` (unpadded) strftime codes are platform-specific
        # extensions and raise on Windows, so the unpadded month/day come from
        # the datetime attributes; only portable codes go through strftime.
        return f"{value.year}/{value.month}/{value.day} {value.strftime('%I:%M %p')}"

m = Model(dt="2020-01-01T12:00:00")  ## serializer 不處理 dict
m

m.model_dump()

m.model_dump_json()  ## serializer 處理 JSON

m = Model()  ## serializer 不處理 None
m

m.model_dump_json()  ## serializer 不處理 None
Model(dt=datetime.datetime(2020, 1, 1, 12, 0))

{'dt': datetime.datetime(2020, 1, 1, 12, 0)}

type = <class 'datetime.datetime'>
'{"dt":"2020/1/1 12:00 PM"}'

Model(dt=None)

'{"dt":null}'

FieldSerializationInfo - 1

如果我們想對 dict 和 JSON 做不同的序列化處理。

可能這個功能太常被使用,乾脆寫一個 method(FieldSerializationInfo 的 mode_is_json),連判斷式都不用寫。

from pydantic import FieldSerializationInfo

# Demo model: the serializer also receives the serialization `info` object.
class Model(BaseModel):
    dt: datetime | None = None

    @field_serializer("dt", when_used="unless-none")
    def dt_serializer(self, value, info: FieldSerializationInfo):
        # Print `info` to inspect it; the outputs below show mode='python' for
        # model_dump() and mode='json' for model_dump_json().
        print(f"info={info}")
        return value

m = Model(dt=datetime(2020, 1, 1))
m

m.model_dump()

m.model_dump_json()
Model(dt=datetime.datetime(2020, 1, 1, 0, 0))

## mode='python'
info=SerializationInfo(include=None, exclude=None, mode='python', by_alias=False, exclude_unset=False, exclude_defaults=False, exclude_none=False, round_trip=False)
{'dt': datetime.datetime(2020, 1, 1, 0, 0)}

## mode='json'
info=SerializationInfo(include=None, exclude=None, mode='json', by_alias=False, exclude_unset=False, exclude_defaults=False, exclude_none=False, round_trip=False)
'{"dt":"2020-01-01T00:00:00"}'
# Demo model: uses info.mode_is_json() to tell dict dumps from JSON dumps.
class Model(BaseModel):
    dt: datetime | None = None

    @field_serializer("dt", when_used="unless-none")
    def dt_serializer(self, value, info: FieldSerializationInfo):
        # Prints False for model_dump() and True for model_dump_json()
        # (see the outputs below); the value itself is returned unchanged.
        print(f"mode_is_json={info.mode_is_json()}")
        return value

m = Model(dt=datetime(2020, 1, 1))
m.model_dump()

m.model_dump_json()
mode_is_json=False
{'dt': datetime.datetime(2020, 1, 1, 0, 0)}

mode_is_json=True
'{"dt":"2020-01-01T00:00:00"}'

FieldSerializationInfo - 2

另一個案例示範(日期時間格式用 UTC)。告訴我們有很多彈性應用。

先寫一個函式備用:

這段程式是將一個 datetime 物件轉換為 UTC 時區格式。它會檢查輸入的 datetime 物件是否已經包含時區資訊,並根據情況進行適當的處理。

  • 如果 datetime 是 naive(不含時區資訊),就把它轉成 aware,並假設這個 naive datetime 本來就是 UTC 時間

  • 如果 datetime 已經是 aware(含時區資訊),就將它轉換為 UTC

import pytz

def make_utc(dt: datetime) -> datetime:
    """Return `dt` as a timezone-aware datetime in UTC.

    An aware datetime is converted to UTC. A naive datetime is assumed to
    already represent UTC and simply gets the UTC tzinfo attached.
    """
    if dt.tzinfo is not None:
        # Already aware: shift to the UTC zone.
        return dt.astimezone(pytz.utc)
    # Naive: label it as UTC without changing the wall-clock values.
    return pytz.utc.localize(dt)
dt = make_utc(datetime.now())
dt

dt.isoformat()
datetime.datetime(2024, 5, 23, 12, 31, 30, 733881, tzinfo=<UTC>)

2024-05-23T12:31:30.733881+00:00

更改此日期時間的序列化格式:

dt.strftime("%Y-%m-%dT%H:%M:%SZ")
2024-05-23T12:31:30Z
def dt_utc_json_serializer(dt: datetime) -> str:
    """Render `dt` in UTC as a string like "2020-01-01T00:00:00Z"."""
    # Normalise to UTC first so the literal trailing "Z" is accurate.
    return make_utc(dt).strftime("%Y-%m-%dT%H:%M:%SZ")
# Demo model: `dt` is always serialized in UTC, with a custom string form for JSON.
class Model(BaseModel):
    dt: datetime | None = None

    @field_serializer("dt", when_used="unless-none")
    def dt_serializer(self, dt, info: FieldSerializationInfo):
        # JSON output: UTC string built by dt_utc_json_serializer.
        if info.mode_is_json():
            return dt_utc_json_serializer(dt)
        # Python (dict) output: keep a datetime object, converted to UTC.
        return make_utc(dt)

m = Model(dt=datetime(2020, 1, 1))
m

m.model_dump()

m.model_dump_json()
Model(dt=datetime.datetime(2020, 1, 1, 0, 0))

{'dt': datetime.datetime(2020, 1, 1, 0, 0, tzinfo=<UTC>)}

{"dt":"2020-01-01T00:00:00Z"}

不是 UTC 格式的日期時間:

eastern = pytz.timezone('US/Eastern')
dt = eastern.localize(datetime(2020, 1, 1))
dt
datetime.datetime(2020, 1, 1, 0, 0, tzinfo=<DstTzInfo 'US/Eastern' EST-1 day, 19:00:00 STD>)
m = Model(dt=dt)
m

m.model_dump()

m.model_dump_json()
Model(dt=datetime.datetime(2020, 1, 1, 0, 0, tzinfo=<DstTzInfo 'US/Eastern' EST-1 day, 19:00:00 STD>))

{'dt': datetime.datetime(2020, 1, 1, 5, 0, tzinfo=<UTC>)}

{"dt":"2020-01-01T05:00:00Z"}

延伸閱讀 datetime 中的 naive time 和 aware time 是什麼意思?


project

本節專案要求:

  • 自動產生駝峰式大小寫別名。

  • 我們模型中的欄位 type_,在來源資料中是以 type 提供的,序列化時也應該輸出為 type

  • 我們收到的資料包含以下需要對應到我們自己的模型的欄位名稱,但希望我們的駝峰欄位名稱用於序列化。考慮到這一點(不重命名欄位名稱):

    • number_of_doors 提供為 doors

    • manufactured_date 提供為 completionDate

  • 欄位 base_msrp_usd 在來源資料中提供為 msrpUSD ,我們希望它的序列化名稱為 baseMSRPUSD

  • 我們希望 manufactured_date 在 JSON 序列化時輸出為以下格式:YYYY/MM/DD(例如 2020/01/01),但序列化為 Python 字典時應保留為 date 物件

from pydantic import Field, field_serializer
from pydantic.alias_generators import to_camel

# 汽車資料模型
# Automobile data model for the section project: camelCase aliases are generated
# automatically, with per-field overrides where the source data uses other names.
class Automobile(BaseModel):
    # Model configuration
    model_config = ConfigDict(
        extra="forbid",  # reject extra fields
        str_strip_whitespace=True,  # strip leading/trailing whitespace from strings
        validate_default=True,  # validate default values
        validate_assignment=True,  # validate on attribute assignment
        alias_generator=to_camel,  # generate camelCase aliases # ===> 1
    )
    
    # Field definitions
    manufacturer: str  # manufacturer
    series_name: str  # series name
    type_: AutomobileType = Field(alias="type")  # body type; uses `type` as its alias # ===> 2
    is_electric: bool = False  # whether the car is electric
    manufactured_date: date = Field(validation_alias="completionDate")  # manufacture date;
        # uses `completionDate` as the validation alias # ===> 3-2
    base_msrp_usd: float = Field(
        validation_alias="msrpUSD",  # ===> 4
        serialization_alias="baseMSRPUSD" # ===> 4
    )  # base MSRP in USD; `msrpUSD` is the validation alias, `baseMSRPUSD` the serialization alias
    vin: str  # vehicle identification number
    number_of_doors: int = Field(default=4, validation_alias="doors")  # number of doors;
        # defaults to 4, uses `doors` as the validation alias # ===> 3-1
    registration_country: str | None = None  # country of registration
    license_plate: str | None = None  # license plate number

    # Custom field serializer applied to `manufactured_date` when serializing to JSON
    # (when_used="json-unless-none"); dict dumps are not affected by it.
    # ===> 5
    @field_serializer("manufactured_date", when_used="json-unless-none")
    def serialize_date(self, value: date) -> str:
        return value.strftime("%Y/%m/%d")  # format the date as "YYYY/MM/DD"
car = Automobile.model_validate_json(data_json)
car
Automobile(manufacturer='BMW', series_name='M4', type_=<AutomobileType.convertible: 'Convertible'>, is_electric=False, manufactured_date=datetime.date(2023, 1, 1), base_msrp_usd=93300.0, vin='1234567890', number_of_doors=2, registration_country='France', license_plate='AAA-BBB')
assert car.model_dump() == expected_serialized_dict
assert car.model_dump(by_alias=True) == expected_serialized_dict_by_alias
assert car.model_dump_json(by_alias=True) == expected_serialized_json_by_alias