Dimension reduction and statistical learning #

Get packages and JPMaQS data #

Packages #

# >>> Define constants <<< #
import os

# Minimum Macrosynergy package version required for this notebook
MIN_REQUIRED_VERSION: str = "1.0.0"

# DataQuery credentials: read from environment variables (set these to your own client ID and secret)
DQ_CLIENT_ID: str = os.getenv("DQ_CLIENT_ID")
DQ_CLIENT_SECRET: str = os.getenv("DQ_CLIENT_SECRET")

# Define any Proxy settings required (http/https)
PROXY = {}

# Start date for the data (argument passed to the JPMaQSDownloader class)
START_DATE: str = "2000-01-01"
# Standard library and scientific stack imports
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from functools import partial
import itertools

# Scikit-learn imports
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import make_scorer, root_mean_squared_error

# Macrosynergy package imports
import macrosynergy.management as msm
import macrosynergy.panel as msp
import macrosynergy.pnl as msn
import macrosynergy.signal as mss
import macrosynergy.learning as msl
import macrosynergy.visuals as msv
from macrosynergy.download import JPMaQSDownload

warnings.simplefilter("ignore")
# Check installed Macrosynergy package meets version requirement
import macrosynergy as msy

msy.check_package_version(required_version=MIN_REQUIRED_VERSION)

Data #

# IRS cross-section lists

cids_dm = [
    "AUD",
    "CAD",
    "CHF",
    "EUR",
    "GBP",
    "JPY",
    "NOK",
    "NZD",
    "SEK",
    "USD",
]  # DM currency areas

cids_iliq = [
    "NOK",
    "NZD",
]  # less liquid markets, excluded from the analysis

cids = sorted(set(cids_dm) - set(cids_iliq))
# Category tickers

cpi = [
    # CPI inflation
    "CPIH_SA_P1M1ML12",
    "CPIH_SJA_P6M6ML6AR",
    "CPIC_SA_P1M1ML12",
    "CPIC_SJA_P6M6ML6AR",
    "INFE2Y_JA",
]
ppi = [
    # PPI inflation
    "PPIH_NSA_P1M1ML12_3MMA",
    "PPIH_SA_P6M6ML6AR",
    "PGDPTECH_SA_P1M1ML12_3MMA",
    "PGDP_SA_P1Q1QL4",
]
inf = cpi + ppi

dem = [
    # Domestic demand growth
    "RRSALES_SA_P1M1ML12_3MMA",
    "RRSALES_SA_P1Q1QL4",
    "RPCONS_SA_P1Q1QL4",
    "RPCONS_SA_P1M1ML12_3MMA",
    "IMPORTS_SA_P1M1ML12_3MMA",
]
out = [
    # Output growth
    "INTRGDP_NSA_P1M1ML12_3MMA",
    "RGDPTECH_SA_P1M1ML12_3MMA",
    "IP_SA_P1M1ML12_3MMA",
]
lab = [
    # Labour market tightness
    "EMPL_NSA_P1M1ML12_3MMA",
    "EMPL_NSA_P1Q1QL4",
    "UNEMPLRATE_NSA_3MMA_D1M1ML12",
    "UNEMPLRATE_NSA_D1Q1QL4",
    "UNEMPLRATE_SA_3MMAv5YMA",
    "WAGES_NSA_P1M1ML12_3MMA",
    "WAGES_NSA_P1Q1QL4",
]
ecg = dem + out + lab

mon = [
    # Money and liquidity growth
    "MNARROW_SJA_P1M1ML12",
    "MBROAD_SJA_P1M1ML12",
    "MBASEGDP_SA_D1M1ML6",
    "INTLIQGDP_NSA_D1M1ML6",
]
crh = [
    # Credit and housing market
    "PCREDITBN_SJA_P1M1ML12",
    "PCREDITGDP_SJA_D1M1ML12",
    "HPI_SA_P1M1ML12_3MMA",
    "HPI_SA_P1Q1QL4",
]
mcr = mon + crh

main = inf + ecg + mcr

adds = [
    # Additional variables for benchmark calculation
    "RGDP_SA_P1Q1QL4_20QMM",
    "INFTEFF_NSA",
    "WFORCE_NSA_P1Y1YL1_5YMM",
    "WFORCE_NSA_P1Q1QL4_20QMM",
]

ecos = main + adds

rets = [
    # Target returns
    "DU05YXR_VT10",
    "DU05YXR_NSA",
]

xcats = ecos + rets

# Asset return tickers for benchmark correlation analysis
xtra = ["USD_EQXR_NSA", "USD_GB10YXR_NSA"]

tickers = [cid + "_" + xcat for cid in cids for xcat in xcats] + xtra
# Download series from J.P. Morgan DataQuery by tickers,
# using the credentials and start date defined at the top of the notebook

print(f"Maximum number of tickers is {len(tickers)}")

with JPMaQSDownload(client_id=DQ_CLIENT_ID, client_secret=DQ_CLIENT_SECRET) as dq:
    df = dq.download(
        tickers=tickers,
        start_date=START_DATE,
        suppress_warning=True,
        metrics=["value"],
        report_time_taken=True,
        show_progress=True,
    )
Maximum number of tickers is 306
Downloading data from JPMaQS.
Timestamp UTC:  2025-09-11 13:13:39
Connection successful!
Requesting data: 100%|██████████████████████████| 16/16 [00:03<00:00,  4.84it/s]
Downloading data: 100%|█████████████████████████| 16/16 [00:15<00:00,  1.05it/s]
Time taken to download data: 	19.78 seconds.
Some expressions are missing from the downloaded data. Check logger output for complete list.
56 out of 306 expressions are missing. To download the catalogue of all available expressions and filter the unavailable expressions, set `get_catalogue=True` in the call to `JPMaQSDownload.download()`.
dfx = df.copy()
dfx.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1657842 entries, 0 to 1657841
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   real_date  1657842 non-null  datetime64[ns]
 1   cid        1657842 non-null  object        
 2   xcat       1657842 non-null  object        
 3   value      1657842 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 50.6+ MB

Renaming and availability #

Renaming #

dict_repl = {
    # Wages
    "WAGES_NSA_P1Q1QL4": "WAGES_NSA_P1M1ML12_3MMA",
    # House prices
    "HPI_SA_P1Q1QL4": "HPI_SA_P1M1ML12_3MMA",
    # Labour market
    "EMPL_NSA_P1Q1QL4": "EMPL_NSA_P1M1ML12_3MMA",
    "UNEMPLRATE_NSA_D1Q1QL4": "UNEMPLRATE_NSA_3MMA_D1M1ML12",
    "UNEMPLRATE_SA_D2Q2QL2": "UNEMPLRATE_SA_D6M6ML6",
    # Other
    "RRSALES_SA_P1Q1QL4": "RRSALES_SA_P1M1ML12_3MMA",
    "RPCONS_SA_P1Q1QL4": "RPCONS_SA_P1M1ML12_3MMA",
    "WFORCE_NSA_P1Y1YL1_5YMM": "WFORCE_NSA_P1Q1QL4_20QMM",
}

for key, value in dict_repl.items():
    dfx["xcat"] = dfx["xcat"].str.replace(key, value)


# Remove replaced tickers from the economic concept lists
eco_lists = [inf, ecg, mcr, adds]
for i in range(len(eco_lists)):
    eco_lists[i][:] = [xc for xc in eco_lists[i] if xc in dfx["xcat"].unique()]
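
A quick sanity check (an illustrative addition, not part of the original workflow) confirms that none of the replaced quarterly category names remain in the panel:

# Illustrative check: every category in dict_repl should have been renamed
remaining = set(dict_repl) & set(dfx["xcat"].unique())
assert not remaining, f"Unrenamed categories: {remaining}"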

Check availability #

xcatx = inf
msm.check_availability(df=dfx, xcats=xcatx, cids=cids, missing_recent=False)
xcatx = ecg
msm.check_availability(df=dfx, xcats=xcatx, cids=cids, missing_recent=False)
xcatx = mcr
msm.check_availability(df=dfx, xcats=xcatx, cids=cids, missing_recent=False)
xcatx = adds + rets
msm.check_availability(df=dfx, xcats=xcatx, cids=cids, missing_recent=False)

Transformation and checks #

dict_labels = {}
dict_factorz = {}

Inflation shortfall #

# All excess inflation rates

xcatx = inf

dfa = msp.panel_calculator(
    df=dfx, calcs=[f"X{xcat}N = - {xcat} + INFTEFF_NSA" for xcat in xcatx], cids=cids
)
dfx = msm.update_df(dfx, dfa)

xinf = [f"X{xcat}N" for xcat in inf]
# Category-wise sequential normalization

xcatx = xinf

for xcat in xcatx:
    dfa = msp.make_zn_scores(
        df=dfx,
        xcat=xcat,
        cids=cids,
        neutral="zero",
        thresh=3,
        est_freq="M",
        pan_weight=1,
        postfix="_ZN",
    )
    dfx = msm.update_df(dfx, dfa)
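
For intuition, the sketch below is a much-simplified, single-series toy of sequential normalization around a zero neutral level (illustrative only; make_zn_scores estimates dispersion on the full panel, re-estimates monthly, and winsorizes at 3 standard deviations):

# Toy sequential zn-scoring for one series (simplified, illustrative):
# scale by an expanding dispersion proxy around zero, then winsorize at +/-3
s = pd.Series(np.random.default_rng(0).normal(size=120))
disp = s.abs().expanding(min_periods=12).mean()  # expanding dispersion proxy
zn_toy = (s / disp).clip(-3, 3)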

# Labels

dict_labels["XCPIH_SA_P1M1ML12N_ZN"] = "CPI, %oya, excess, negative"
dict_labels["XCPIH_SJA_P6M6ML6ARN_ZN"] = "CPI, %6m/6m, saar, excess, negative"
dict_labels["XCPIC_SA_P1M1ML12N_ZN"] = "Core CPI, %oya, excess, negative"
dict_labels["XCPIC_SJA_P6M6ML6ARN_ZN"] = "Core CPI, %6m/6m, saar, excess, negative"
dict_labels["XINFE2Y_JAN_ZN"] = "CPI inflation expectations, excess, negative"
dict_labels["XPPIH_NSA_P1M1ML12_3MMAN_ZN"] = "PPI, %oya, 3mma, excess, negative"
dict_labels["XPPIH_SA_P6M6ML6ARN_ZN"] = "PPI, %6m/6m, saar, excess, negative"
dict_labels["XPGDPTECH_SA_P1M1ML12_3MMAN_ZN"] = "GDP deflator nowcast, %oya, 3mma, excess, negative"
dict_labels["XPGDP_SA_P1Q1QL4N_ZN"] = "GDP deflator, %oya, excess, negative"

# Factors

dict_factorz["XCPINZ"] = [f"X{xcat}N_ZN" for xcat in cpi]
dict_factorz["XPPINZ"] = [f"X{xcat}N_ZN" for xcat in ppi]
dict_factorz["XINFNZ_BROAD"] = [f"X{xcat}N_ZN" for xcat in inf]
# Visual check of factor groups

factor = "XINFNZ_BROAD"  # XCPINZ, XPPINZ, XINFNZ_BROAD
xcatx = dict_factorz[factor]

msp.view_timelines(
    dfx,
    xcats=xcatx,
    cids=cids,
    ncol=3,
    aspect=2,
    height=1.8,
    start="2000-01-01",
    same_y=True,
    title="Inflation shortfall categories, normalized around neutral levels",
    title_fontsize=20,
    xcat_labels=dict_labels,
)

height = len(xcatx)
width = round(height * 1.5)

msp.correl_matrix(
    dfx,
    xcats=xcatx,
    cids=cids,
    freq="M",
    size=(width, height),
    cluster=True,
    title="Cross correlations of inflation shortfall categories, normalized around neutral level",
    title_fontsize=18,
    xcat_labels={xcat:dict_labels[xcat] for xcat in xcatx},
)

Economic activity shortfall #

# All excess activity growth rates

act = [xcat for xcat in ecg if xcat in dem + out]
xcatx = act

dfa = msp.panel_calculator(
    df=dfx,
    calcs=[f"X{xcat}N = - {xcat} + RGDP_SA_P1Q1QL4_20QMM" for xcat in xcatx],
    cids=cids,
)
dfx = msm.update_df(dfx, dfa)

xact = ["X" + xcat + "N" for xcat in act]
# Excess employment, excess wage growth and unemployment shortfalls

calcs = []
calcs += [
    "XEMPL_NSA_P1M1ML12_3MMAN = - EMPL_NSA_P1M1ML12_3MMA + WFORCE_NSA_P1Q1QL4_20QMM"
]
calcs += [
    "WAGEGROWTH_NEUTRAL = INFTEFF_NSA + RGDP_SA_P1Q1QL4_20QMM - WFORCE_NSA_P1Q1QL4_20QMM"
]
calcs += ["XWAGES_NSA_P1M1ML12_3MMAN = - WAGES_NSA_P1M1ML12_3MMA + WAGEGROWTH_NEUTRAL"]

dfa = msp.panel_calculator(df=dfx, calcs=calcs, cids=cids)
dfx = msm.update_df(dfx, dfa)

xoth = [xcat for xcat in list(dfa["xcat"].unique()) if xcat != "WAGEGROWTH_NEUTRAL"] + [
    "UNEMPLRATE_NSA_3MMA_D1M1ML12",
    "UNEMPLRATE_SA_3MMAv5YMA",
]
xecg = xact + xoth
# Category-wise sequential normalization

xcatx = xecg

for xcat in xcatx:
    dfa = msp.make_zn_scores(
        df=dfx,
        xcat=xcat,
        cids=cids,
        neutral="zero",
        thresh=3,
        est_freq="M",
        pan_weight=1,
        postfix="_ZN",
    )
    dfx = msm.update_df(dfx, dfa)

# Labels

dict_labels["XRRSALES_SA_P1M1ML12_3MMAN_ZN"] = "Retail sales, %oya, 3mma, excess, negative"
dict_labels["XRPCONS_SA_P1M1ML12_3MMAN_ZN"] = "Private consumption, %oya, 3mma, excess, negative"
dict_labels["XIMPORTS_SA_P1M1ML12_3MMAN_ZN"] = "Imports, %oya, 3mma, excess, negative"
dict_labels["XINTRGDP_NSA_P1M1ML12_3MMAN_ZN"] = "GDP, intuitive nowcast, %oya, 3mma, excess, negative"
dict_labels["XRGDPTECH_SA_P1M1ML12_3MMAN_ZN"] = "GDP, technical nowcast, %oya, 3mma, excess, negative"
dict_labels["XIP_SA_P1M1ML12_3MMAN_ZN"] = "Industrial production, %oya, 3mma, excess, negative"
dict_labels["XEMPL_NSA_P1M1ML12_3MMAN_ZN"] = "Employment, %oya, 3mma, excess, negative"
dict_labels["XWAGES_NSA_P1M1ML12_3MMAN_ZN"] = "Wages, %oya, 3mma, excess, negative"
dict_labels["UNEMPLRATE_NSA_3MMA_D1M1ML12_ZN"] = "Unemployment rate, diff oya, 3mma"
dict_labels["UNEMPLRATE_SA_3MMAv5YMA_ZN"] = "Unemployment rate, diff over 5yma"

# Factors

dict_factorz["XDEMNZ"] = [f"{xcat}_ZN" for xcat in xecg if any(s in xcat for s in dem)]
dict_factorz["XOUTNZ"] = [f"{xcat}_ZN" for xcat in xecg if any(s in xcat for s in out)]
dict_factorz["XLABNZ"] = [f"{xcat}_ZN" for xcat in xecg if any(s in xcat for s in lab)]
dict_factorz["XECGNZ_BROAD"] = [
    f"{xcat}_ZN" for xcat in xecg if any(s in xcat for s in ecg)
]
# Visual check of factor groups

factor = "XECGNZ_BROAD"  # XDEMNZ, XOUTNZ, XLABNZ, XECGNZ_BROAD
xcatx = dict_factorz[factor]

msp.view_timelines(
    dfx,
    xcats=xcatx,
    cids=cids,
    ncol=3,
    aspect=2,
    height=1.8,
    start="2000-01-01",
    same_y=True,
    title="Activity shortfall categories, normalized around neutral levels",
    title_fontsize=20,
    xcat_labels=dict_labels,
)

height = len(xcatx)
width = round(height * 1.5)

msp.correl_matrix(
    dfx,
    xcats=xcatx,
    cids=cids,
    freq="M",
    size=(width, height),
    cluster=True,
    title="Cross correlations of activity shortfall categories, normalized around neutral level",
    title_fontsize=18,
    xcat_labels={xcat:dict_labels[xcat] for xcat in xcatx},
)

Excess money and credit growth #

# Excess money and credit growth rates

xcatx = [
    "MNARROW_SJA_P1M1ML12",
    "MBROAD_SJA_P1M1ML12",
    "PCREDITBN_SJA_P1M1ML12",
    "PCREDITGDP_SJA_D1M1ML12",
]

dfa = msp.panel_calculator(
    df=dfx,
    calcs=[
        f"X{xcat}N = - {xcat} + ( RGDP_SA_P1Q1QL4_20QMM + INFTEFF_NSA )"
        for xcat in xcatx
    ],
    cids=cids,
)
dfx = msm.update_df(dfx, dfa)

xmcr1 = ["X" + xcat + "N" for xcat in xcatx]
# Excess house price growth & pseudo excess liquidity growth

calcs = []
calcs += ["XHPI_SA_P1M1ML12_3MMAN = - HPI_SA_P1M1ML12_3MMA + INFTEFF_NSA"]
calcs += ["INTLIQGDP_NSA_D1M1ML6N = - INTLIQGDP_NSA_D1M1ML6"]
calcs += ["MBASEGDP_SA_D1M1ML6N = - MBASEGDP_SA_D1M1ML6"]

dfa = msp.panel_calculator(df=dfx, calcs=calcs, cids=cids)
dfx = msm.update_df(dfx, dfa)

xmcr2 = list(dfa["xcat"].unique())
xmcr = xmcr1 + xmcr2
# Category-wise sequential normalization

xcatx = xmcr

for xcat in xcatx:
    dfa = msp.make_zn_scores(
        df=dfx,
        xcat=xcat,
        cids=cids,
        neutral="zero",
        thresh=3,
        est_freq="M",
        pan_weight=1,
        postfix="_ZN",
    )
    dfx = msm.update_df(dfx, dfa)

# Labels

dict_labels["XMNARROW_SJA_P1M1ML12N_ZN"] = "Narrow money, %oya, excess, negative"
dict_labels["XMBROAD_SJA_P1M1ML12N_ZN"] = "Broad money, %oya, excess, negative"
dict_labels["XPCREDITBN_SJA_P1M1ML12N_ZN"] = "Private credit, %oya, excess, negative"
dict_labels["XPCREDITGDP_SJA_D1M1ML12N_ZN"] = "Private credit, diff oya, % of GDP, excess, negative"
dict_labels["XHPI_SA_P1M1ML12_3MMAN_ZN"] = "House prices, %oya, 3mma, excess, negative"
dict_labels["MBASEGDP_SA_D1M1ML6N_ZN"] = "Monetary base, diff over 6 months, % of GDP, excess, negative"
dict_labels["INTLIQGDP_NSA_D1M1ML6N_ZN"] = "Liquidity, diff over 6 months, % of GDP, excess, negative"

# Factors

dict_factorz["XMONNZ"] = [f"{xcat}_ZN" for xcat in xmcr if any(s in xcat for s in mon)]
dict_factorz["XCRHNZ"] = [f"{xcat}_ZN" for xcat in xmcr if any(s in xcat for s in crh)]
dict_factorz["XMCRNZ_BROAD"] = [
    f"{xcat}_ZN" for xcat in xmcr if any(s in xcat for s in mcr)
]
# Visual check of factor groups

factor = "XMCRNZ_BROAD"  # XMONNZ, XCRHNZ, XMCRNZ_BROAD
xcatx = dict_factorz[factor]

msp.view_timelines(
    dfx,
    xcats=xcatx,
    cids=cids,
    ncol=3,
    aspect=2,
    height=1.8,
    start="2000-01-01",
    same_y=True,
    title="Money and credit shortfall categories, normalized around neutral levels",
    title_fontsize=20,
    xcat_labels=dict_labels,
)

height = len(xcatx)
width = round(height * 1.5)

msp.correl_matrix(
    dfx,
    xcats=xcatx,
    cids=cids,
    freq="M",
    size=(width, height),
    cluster=True,
    title="Cross correlations of money and credit shortfall categories, normalized around neutral level",
    title_fontsize=18,
    xcat_labels={xcat:dict_labels[xcat] for xcat in xcatx},
)

Equally-weighted conceptual scores #

dfa = pd.DataFrame(columns=dfx.columns)
for k, v in dict_factorz.items():
    dfaa = msp.linear_composite(
        df=dfx,
        xcats=v,
        cids=cids,
        new_xcat=k,
    )
    dfa = msm.update_df(dfa, dfaa)
dfx = msm.update_df(dfx, dfa)

narrow_factorz = [k for k in dict_factorz.keys() if "BROAD" not in k]
broad_factorz = [k for k in dict_factorz.keys() if "BROAD" in k]
dict_factlabels = {
    "XINFNZ_BROAD": "Broad inflation shortfall",
    "XECGNZ_BROAD": "Broad economic activity shortfall",
    "XMCRNZ_BROAD": "Broad money and credit growth shortfall",
    "XCPINZ": "CPI inflation shortfall",
    "XPPINZ": "PPI inflation shortfall",
    "XDEMNZ": "Domestic demand growth shortfall",
    "XOUTNZ": "Output growth shortfall",
    "XLABNZ": "Labour market slack",
    "XMONNZ": "Money growth shortfall",
    "XCRHNZ": "Credit and house price growth shortfall",
}
dict_labels.update(dict_factlabels)
# Visual check of factor groups

xcatx = broad_factorz  # narrow_factorz broad_factorz

msp.view_timelines(
    dfx,
    xcats=xcatx,
    cids=cids,
    ncol=3,
    aspect=2,
    height=1.8,
    start="2000-01-01",
    same_y=True,
    title="Broad factor scores",
    title_fontsize=20,
    xcat_labels={xcat:dict_labels[xcat] for xcat in xcatx},
)
xcatx = broad_factorz + narrow_factorz

height = len(xcatx)
width = round(height * 1.5)

msp.correl_matrix(
    dfx,
    xcats=xcatx,
    cids=cids,
    freq="M",
    size=(width, height),
    cluster=False,
    title="Cross-correlation of narrow and broad factor scores",
    title_fontsize=20,
    xcat_labels={xcat:dict_labels[xcat] for xcat in xcatx},
)

Target returns #

xcatx = ["DU05YXR_NSA", "DU05YXR_VT10"]

msp.view_timelines(
    dfx,
    xcats=xcatx,
    cumsum=True,
    cids=cids,
    ncol=3,
    aspect=1.8,
    start="2000-01-01",
    same_y=True,
    title="Cumulative excess returns on 5-year IRS fixed receiver positions, unadjusted and 10% vol-target",
    title_fontsize=26,
)

Signal generation #

Preparation #

scorer = {"RMSE": make_scorer(root_mean_squared_error, greater_is_better=False)}
splitter = {"Rolling": msl.RollingKFoldPanelSplit(5)}
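
Since the scorer is built with greater_is_better=False, scikit-learn negates the metric so that model selection can uniformly maximize scores. A quick illustrative check on synthetic data (not part of the signal pipeline):

# Illustrative: the scorer returns the negated RMSE, so "higher is better" holds
from sklearn.linear_model import LinearRegression

X_demo = np.arange(20, dtype=float).reshape(-1, 1)
y_demo = 2 * X_demo.ravel() + 1
est = LinearRegression().fit(X_demo, y_demo)
print(scorer["RMSE"](est, X_demo, y_demo))  # approximately -0.0 for a perfect fit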
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_decomposition import PLSRegression


class PLSTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=2):
        self.n_components = n_components

    def fit(self, X, y=None):
        # PLS needs y for its supervised decomposition; inside a pipeline,
        # y is passed through to each fit step. The model is instantiated
        # here (rather than in __init__) so that set_params(n_components=...)
        # from hyperparameter search takes effect on refits.
        self.model_ = PLSRegression(n_components=self.n_components)
        self.model_.fit(X, y)
        return self

    def transform(self, X):
        return self.model_.transform(X)
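
A minimal usage sketch on synthetic data (illustrative): unlike PCA, PLS requires the target at fit time, because components are chosen for covariance with y rather than for variance alone.

rng = np.random.default_rng(1)
X_demo = pd.DataFrame(rng.normal(size=(100, 6)), columns=[f"f{i}" for i in range(6)])
y_demo = X_demo @ rng.normal(size=6) + 0.1 * rng.normal(size=100)

pls_demo = PLSTransformer(n_components=2).fit(X_demo, y_demo)
print(pls_demo.transform(X_demo).shape)  # (100, 2)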
from sklearn.base import MetaEstimatorMixin


class DataFrameTransformer(BaseEstimator, TransformerMixin, MetaEstimatorMixin):
    """Wrap a transformer so that its output is returned as a pandas DataFrame,
    preserving the input's (panel) index."""

    def __init__(self, transformer, column_names=None):
        self.transformer = transformer
        self.column_names = column_names

    def fit(self, X, y=None):
        # Fit the wrapped transformer
        self.transformer.fit(X, y)

        return self

    def transform(self, X):
        # Transform the data
        transformation = self.transformer.transform(X)

        if isinstance(transformation, pd.DataFrame):
            return transformation
        else:
            # Most scikit-learn transformers return a numpy array;
            # convert it back to a DataFrame with the original index
            if self.column_names is None:
                columns = [f"Factor_{i}" for i in range(transformation.shape[1])]
            else:
                columns = self.column_names[: transformation.shape[1]]

            return pd.DataFrame(
                data=transformation,
                columns=columns,
                index=X.index,
            )
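
A quick check on synthetic panel-style data (illustrative): wrapping PCA restores a DataFrame with the original MultiIndex, which downstream pipeline steps operating on panels expect.

idx = pd.MultiIndex.from_product(
    [["EUR", "USD"], pd.bdate_range("2020-01-01", periods=50)],
    names=["cid", "real_date"],
)
X_demo = pd.DataFrame(
    np.random.default_rng(2).normal(size=(100, 4)), index=idx, columns=list("abcd")
)
dft = DataFrameTransformer(PCA(n_components=2), column_names=["PC1", "PC2"])
Xt = dft.fit(X_demo).transform(X_demo)
print(type(Xt).__name__, list(Xt.columns))  # DataFrame ['PC1', 'PC2']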

Kitchen-sink PCA and PLS #

xcatx = list(
    itertools.chain(*[dict_factorz[narrow_xcat] for narrow_xcat in narrow_factorz])
) + ["DU05YXR_VT10"]
cidx = cids

so_full = msl.SignalOptimizer(
    df=dfx,
    xcats=xcatx,
    cids=cidx,
    freq="W",
    lag=1,
    xcat_aggs=["last", "sum"],
)
so_full.calculate_predictions(
    name="KS",
    models={
        "PCR": Pipeline(
            [
                ("scaler", msl.PanelStandardScaler()),
                ("pca", msl.PanelPCA(n_components=3)),
                ("twlr", msl.TimeWeightedLinearRegression()),
            ]
        ),
        "PLS": Pipeline(
            [
                ("scaler", msl.PanelStandardScaler()),
                ("pls", DataFrameTransformer(PLSTransformer(n_components=3))),
                ("twlr", msl.TimeWeightedLinearRegression()),
            ]
        ),
    },
    scorers=scorer,
    hyperparameters={
        "PCR": {"twlr__fit_intercept": [True, False], "twlr__half_life": [1*52, 3*52, 5*52], "pca__n_components": [3, 0.95]},
        "PLS": {"twlr__fit_intercept": [True, False], "twlr__half_life": [1*52, 3*52, 5*52], "pls__transformer__n_components": [3, 5]},
    },
    inner_splitters=splitter,
    min_cids=3,
    test_size=4,
    store_correlations=True,
    min_periods=24,
)

so_full.models_heatmap(
    "KS",
    title="Model selection heatmap for kitchen sink approach",
    title_fontsize=18,
    figsize=(12, 4),
)

dfa = so_full.get_optimized_signals("KS")
dfx = msm.update_df(dfx, dfa)
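
A note on the PCA grid above: under scikit-learn's convention, an integer n_components keeps that many components, while a float in (0, 1) keeps the smallest number of components explaining at least that share of variance (msl.PanelPCA is assumed here to follow the same convention). An illustrative check with scikit-learn's PCA:

demo = PCA(n_components=0.95).fit(np.random.default_rng(3).normal(size=(200, 8)))
print(demo.n_components_)  # number of components retained for >= 95% variance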

Broad-factor PCA and PLS #

xcatx = list(
    itertools.chain(*[dict_factorz[narrow_xcat] for narrow_xcat in narrow_factorz])
) + ["DU05YXR_VT10"]
cidx = cids

so_broad = msl.SignalOptimizer(
    df=dfx,
    xcats=xcatx,
    cids=cidx,
    freq="W",
    lag=1,
    xcat_aggs=["last", "sum"],
)
so_broad.calculate_predictions(
    name="BROAD",
    models={
        "PCR_1EACH": Pipeline(
            [
                ("scaler", msl.PanelStandardScaler()),
                (
                    "ct",
                    DataFrameTransformer(
                        ColumnTransformer(
                            [
                                (
                                    "pca_inf",
                                    msl.PanelPCA(n_components=1),
                                    dict_factorz["XINFNZ_BROAD"],
                                ),
                                (
                                    "pca_grow",
                                    msl.PanelPCA(n_components=1),
                                    dict_factorz["XECGNZ_BROAD"],
                                ),
                                (
                                    "pca_lend",
                                    msl.PanelPCA(n_components=1),
                                    dict_factorz["XMCRNZ_BROAD"],
                                ),
                            ]
                        ),
                        column_names=["INF", "GROW", "LEND"],
                    ),
                ),
                ("twlr", msl.TimeWeightedLinearRegression()),
            ]
        ),
        "PCR_0.95EACH": Pipeline(
            [
                ("scaler", msl.PanelStandardScaler()),
                (
                    "ct",
                    DataFrameTransformer(
                        ColumnTransformer(
                            [
                                (
                                    "pca_inf",
                                    PCA(n_components=0.95),
                                    dict_factorz["XINFNZ_BROAD"],
                                ),
                                (
                                    "pca_grow",
                                    PCA(n_components=0.95),
                                    dict_factorz["XECGNZ_BROAD"],
                                ),
                                (
                                    "pca_lend",
                                    PCA(n_components=0.95),
                                    dict_factorz["XMCRNZ_BROAD"],
                                ),
                            ]
                        )
                    ),
                ),
                ("twlr", msl.TimeWeightedLinearRegression()),
            ]
        ),
        "PLS_1EACH": Pipeline(
            [
                ("scaler", msl.PanelStandardScaler()),
                (
                    "ct",
                    DataFrameTransformer(
                        ColumnTransformer(
                            [
                                (
                                    "pls_inf",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XINFNZ_BROAD"],
                                ),
                                (
                                    "pls_grow",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XECGNZ_BROAD"],
                                ),
                                (
                                    "pls_lend",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XMCRNZ_BROAD"],
                                ),
                            ]
                        ),
                        column_names=["INF", "GROW", "LEND"],
                    ),
                ),
                ("twlr", msl.TimeWeightedLinearRegression()),
            ]
        ),
    },
    scorers=scorer,
    hyperparameters={
        "PCR_1EACH": {"twlr__fit_intercept": [True, False], "twlr__half_life": [1*52, 3*52, 5*52]},
        "PCR_0.95EACH": {"twlr__fit_intercept": [True, False], "twlr__half_life": [1*52, 3*52, 5*52]},
        "PLS_1EACH": {"twlr__fit_intercept": [True, False], "twlr__half_life": [1*52, 3*52, 5*52]},
    },
    inner_splitters=splitter,
    min_cids=3,
    min_periods=24,
    test_size=4,
)


so_broad.models_heatmap(
    "BROAD",
    title="Model selection heatmap for broad factor generation approach",
    title_fontsize=18,
    figsize=(12, 4),
)

dfa = so_broad.get_optimized_signals("BROAD")
dfx = msm.update_df(dfx, dfa)

2-stage factor PCA/PLS #

xcatx = list(
    itertools.chain(*[dict_factorz[narrow_xcat] for narrow_xcat in narrow_factorz])
) + ["DU05YXR_VT10"]
cidx = cids

so_narrow = msl.SignalOptimizer(
    df=dfx,
    xcats=xcatx,
    cids=cidx,
    freq="W",
    lag=1,
    xcat_aggs=["last", "sum"],
)
so_narrow.calculate_predictions(
    name="TWOSTAGE",
    models={
        "PCR": Pipeline(
            [
                ("scaler", msl.PanelStandardScaler()),
                (
                    "ct",
                    DataFrameTransformer(
                        transformer=ColumnTransformer(
                            [
                                (
                                    "pca_cpi",
                                    PCA(n_components=1),
                                    dict_factorz["XCPINZ"],
                                ),
                                (
                                    "pca_ppi",
                                    PCA(n_components=1),
                                    dict_factorz["XPPINZ"],
                                ),
                                (
                                    "pca_dem",
                                    PCA(n_components=1),
                                    dict_factorz["XDEMNZ"],
                                ),
                                (
                                    "pca_out",
                                    PCA(n_components=1),
                                    dict_factorz["XOUTNZ"],
                                ),
                                (
                                    "pca_lab",
                                    PCA(n_components=1),
                                    dict_factorz["XLABNZ"],
                                ),
                                (
                                    "pca_mon",
                                    PCA(n_components=1),
                                    dict_factorz["XMONNZ"],
                                ),
                                (
                                    "pca_cre",
                                    PCA(n_components=1),
                                    dict_factorz["XCRHNZ"],
                                ),
                            ]
                        ),
                        column_names=["CPI", "PPI", "DEM", "OUT", "LAB", "MON", "CRE"],
                    ),
                ),
                ("scaler2", msl.PanelStandardScaler()),
                (
                    "ct2",
                    DataFrameTransformer(
                        ColumnTransformer(
                            [
                                ("pca_inf", PCA(n_components=1), ["CPI", "PPI"]),
                                (
                                    "pca_grow",
                                    PCA(n_components=1),
                                    ["DEM", "OUT", "LAB"],
                                ),
                                ("pca_lend", PCA(n_components=1), ["MON", "CRE"]),
                            ]
                        )
                    ),
                ),
                ("twlr", msl.TimeWeightedLinearRegression()),
            ]
        ),
        "PLS": Pipeline(
            [
                ("scaler", msl.PanelStandardScaler()),
                (
                    "ct",
                    DataFrameTransformer(
                        transformer=ColumnTransformer(
                            [
                                (
                                    "pls_cpi",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XCPINZ"],
                                ),
                                (
                                    "pls_ppi",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XPPINZ"],
                                ),
                                (
                                    "pls_dem",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XDEMNZ"],
                                ),
                                (
                                    "pls_out",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XOUTNZ"],
                                ),
                                (
                                    "pls_lab",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XLABNZ"],
                                ),
                                (
                                    "pls_mon",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XMONNZ"],
                                ),
                                (
                                    "pls_cre",
                                    PLSTransformer(n_components=1),
                                    dict_factorz["XCRHNZ"],
                                ),
                            ]
                        ),
                        column_names=["CPI", "PPI", "DEM", "OUT", "LAB", "MON", "CRE"],
                    ),
                ),
                ("scaler2", msl.PanelStandardScaler()),
                (
                    "ct2",
                    DataFrameTransformer(
                        ColumnTransformer(
                            [
                                (
                                    "pls_inf",
                                    PLSTransformer(n_components=1),
                                    ["CPI", "PPI"],
                                ),
                                (
                                    "pls_grow",
                                    PLSTransformer(n_components=1),
                                    ["DEM", "OUT", "LAB"],
                                ),
                                (
                                    "pls_lend",
                                    PLSTransformer(n_components=1),
                                    ["MON", "CRE"],
                                ),
                            ]
                        ),
                    ),
                ),
                ("twlr", msl.TimeWeightedLinearRegression()),
            ]
        ),
    },
    scorers=scorer,
    hyperparameters={
        "PCR": {"twlr__fit_intercept": [True, False], "twlr__half_life": [1*52, 3*52, 5*52]},
        "PLS": {"twlr__fit_intercept": [True, False], "twlr__half_life": [1*52, 3*52, 5*52]},
    },
    inner_splitters=splitter,
    min_cids=3,
    min_periods=24,
    test_size=4,
)


so_narrow.models_heatmap(
    "TWOSTAGE",
    title="Model selection heatmap for 2-stage factor generation approach",
    title_fontsize=18,
    figsize=(12, 4),
)

dfa = so_narrow.get_optimized_signals("TWOSTAGE")
dfx = msm.update_df(dfx, dfa)

Conceptual parity #

dfa = msp.linear_composite(
    df=dfx, xcats=broad_factorz, cids=cids, new_xcat="PARITY"
)

dfx = msm.update_df(dfx, dfa)
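
Since linear_composite defaults to equal weights, re-normalized across the categories available on each date, the parity signal is in essence the per-date average of the three broad scores. An illustrative pandas equivalent (not required for the pipeline):

wide = dfx[dfx["xcat"].isin(broad_factorz)].pivot_table(
    index=["real_date", "cid"], columns="xcat", values="value"
)
parity_check = wide.mean(axis=1)  # equal-weighted average, NaNs skipped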

Value checks #

Kitchen-sink component inference #

Over time, the kitchen-sink latent factors have converged to inflation, growth and lending factors, in that order, as the feature correlation heatmaps below show.

so_full.correlations_heatmap(name="KS", feature_name="PCA 1")
so_full.correlations_heatmap(name="KS", feature_name="PCA 2")
so_full.correlations_heatmap(name="KS", feature_name="PCA 3")

Forward correlation and accuracy #

dict_labels["BROAD"] = "Regression-based learning using broad factor generation approach"
dict_labels["TWOSTAGE"] = "Regression-based learning using 2-stage factor generation approach"
dict_labels["KS"] = "Regression-based learning using kitchen sink factor generation approach"
dict_labels["PARITY"] = "Signal generation with 2-stage conceptual parity"
xcatx = ["KS", "BROAD", "TWOSTAGE", "PARITY"]
titles = [dict_labels[k] for k in xcatx]
crs = []

for xcat in xcatx:
    cr = msp.CategoryRelations(
        df=dfx,
        xcats=[xcat, "DU05YXR_VT10"],
        cids=cids,
        freq="M",
        lag=1,
        xcat_aggs=["last", "sum"],
        slip=1,
        start="2002-01-11",
    )
    crs.append(cr)

msv.multiple_reg_scatter(
    cat_rels=crs,
    ncol=2,
    nrow=2,
    figsize=(14, 12),
    prob_est="map",
    coef_box="upper left",
    title="Signals and subsequent monthly 5-year IRS receiver returns, 8 developed markets since 2002",
    title_fontsize=20,
    subplot_titles=titles,
    xlab="Signal value at the end of month",
    ylab="Subsequent month 5-year IRS excess return on vol-targeted position (10% ar)",
    share_axes=False,
    label_fontsize=14,
)
xcatx = ["KS", "BROAD", "TWOSTAGE", "PARITY"]
dict_short_labels = {}
dict_short_labels["BROAD"] = "Broad factor approach"
dict_short_labels["TWOSTAGE"] = "2-stage factor approach"
dict_short_labels["KS"] = "Kitchen sink approach"
dict_short_labels["PARITY"] = "2-stage conceptual parity"


srr = mss.SignalReturnRelations(
    dfx,
    cids=cids,
    sigs=xcatx,
    rets=["DU05YXR_VT10"],
    freqs=["M", "W"],
    start="2002-02-28",
)

pd.set_option("display.precision", 3)
display(srr.multiple_relations_table().round(3))

srr.accuracy_bars(
    type="signals",
    title="Accuracy of weekly signals in predicting 5-year duration returns, 8 developed markets since 2002",
    freq="W",
    size=(13, 5),
    x_labels=dict_short_labels
)
                                              accuracy  bal_accuracy  pos_sigr  pos_retr  pos_prec  neg_prec  pearson  pearson_pval  kendall  kendall_pval    auc
Return       Signal   Frequency Aggregation
DU05YXR_VT10 BROAD    M         last             0.542         0.525     0.754     0.546     0.558     0.492    0.094           0.0    0.057         0.000  0.519
             KS       M         last             0.545         0.529     0.750     0.546     0.560     0.498    0.087           0.0    0.048         0.001  0.522
             PARITY   M         last             0.531         0.533     0.473     0.551     0.587     0.480    0.114           0.0    0.063         0.000  0.534
             TWOSTAGE M         last             0.536         0.519     0.732     0.546     0.556     0.483    0.093           0.0    0.049         0.001  0.515
             BROAD    W         last             0.538         0.527     0.758     0.535     0.548     0.507    0.049           0.0    0.033         0.000  0.520
             KS       W         last             0.535         0.524     0.751     0.535     0.547     0.500    0.038           0.0    0.027         0.000  0.518
             PARITY   W         last             0.513         0.515     0.470     0.538     0.554     0.476    0.059           0.0    0.034         0.000  0.515
             TWOSTAGE W         last             0.536         0.525     0.734     0.535     0.548     0.502    0.046           0.0    0.030         0.000  0.520

Naive PnLs #

xcatx = ["KS", "BROAD", "TWOSTAGE", "PARITY"]

naive_pnl = msn.NaivePnL(
    dfx,
    cids=cids,
    ret="DU05YXR_VT10",
    sigs=xcatx,
    start="2002-02-28",
    bms=["USD_GB10YXR_NSA", "USD_EQXR_NSA"],
)

for sig in xcatx:
    naive_pnl.make_pnl(
        sig,
        sig_neg=False,
        sig_op="zn_score_pan",
        thresh=2,
        rebal_freq="weekly",
        vol_scale=10,
        rebal_slip=1,
        pnl_name=sig,
    )

naive_pnl.plot_pnls(
    title="Cumulative combined naive PnLs for normalized unbiased signals in eight developed markets",
    title_fontsize=18,
    figsize=(14, 8),
    xcat_labels={xcat:dict_labels[xcat] for xcat in xcatx},
    legend_fontsize=14,
)

pd.set_option("display.precision", 2)
naive_pnl.evaluate_pnls(pnl_cats=naive_pnl.pnl_names, label_dict=dict_short_labels).round(2)
xcat                      Broad factor approach  2-stage factor approach  Kitchen sink approach  2-stage conceptual parity
Return %                                   6.81                     6.79                   6.02                        5.9
St. Dev. %                                 10.0                     10.0                   10.0                       10.0
Sharpe Ratio                               0.68                     0.68                    0.6                       0.59
Sortino Ratio                              0.99                     0.98                   0.86                       0.86
Max 21-Day Draw %                        -11.58                   -11.38                 -12.26                     -12.43
Max 6-Month Draw %                       -17.56                   -18.24                 -18.02                     -26.76
Peak to Trough Draw %                    -27.33                   -30.43                 -27.83                     -41.79
Top 5% Monthly PnL Share                   0.84                     0.89                   0.93                       1.09
USD_GB10YXR_NSA correl                      0.4                     0.36                   0.38                       -0.1
USD_EQXR_NSA correl                        -0.2                    -0.18                  -0.18                      -0.01
Traded Months                               284                      284                    284                        284
xcatx = ["KS", "BROAD", "TWOSTAGE", "PARITY"]

naive_pnl = msn.NaivePnL(
    dfx,
    cids=cids,
    ret="DU05YXR_VT10",
    sigs=xcatx,
    start="2002-02-28",
    bms=["USD_GB10YXR_NSA", "USD_EQXR_NSA"],
)

for sig in xcatx:
    naive_pnl.make_pnl(
        sig,
        sig_neg=False,
        sig_op="zn_score_pan",
        sig_add=1,
        thresh=2,
        rebal_freq="weekly",
        vol_scale=10,
        rebal_slip=1,
        pnl_name=sig,
    )

naive_pnl.plot_pnls(
    title="Cumulative combined naive PnLs for long-biased normalized signals in eight developed markets",
    title_fontsize=18,
    figsize=(14, 8),
    xcat_labels={xcat:dict_labels[xcat] for xcat in xcatx},
    legend_fontsize=14,
)

pd.set_option("display.precision", 2)
naive_pnl.evaluate_pnls(pnl_cats=naive_pnl.pnl_names, label_dict=dict_short_labels).round(2)
xcat                      Broad factor approach  2-stage factor approach  Kitchen sink approach  2-stage conceptual parity
Return %                                   5.43                     5.43                   4.85                       8.19
St. Dev. %                                 10.0                     10.0                   10.0                       10.0
Sharpe Ratio                               0.54                     0.54                   0.49                       0.82
Sortino Ratio                              0.77                     0.78                   0.69                       1.17
Max 21-Day Draw %                        -12.82                    -13.7                  -13.0                     -17.88
Max 6-Month Draw %                       -21.15                   -22.77                 -21.54                     -25.35
Peak to Trough Draw %                    -36.77                   -38.97                  -40.5                     -28.49
Top 5% Monthly PnL Share                    1.0                      1.0                   1.12                       0.69
USD_GB10YXR_NSA correl                      0.6                     0.59                    0.6                       0.37
USD_EQXR_NSA correl                       -0.22                    -0.21                  -0.21                      -0.16
Traded Months                               284                      284                    284                        284