Panel regression with JPMaQS (R-code) #
This notebook replicates the results discussed in Macrosynergy’s research post of February 11th, “Testing macro trading factors”, which proposes a general method for testing the significance of macro trading factors. The post uses as its example the J.P. Morgan Macrosynergy Quantamental System (JPMaQS) set of macro factors available on Kaggle. The importance of using point-in-time series of macro information, which at any date reflect only what the market could have known on that date, is discussed here.
1. Loading relevant packages and the JPMaQS dataset, and exploring categories and cross-sections in the dataset #
The description of each JPMaQS category is available either under JPMorgan Markets (password protected, full set of 5000+ indicators) or on Kaggle (just for the tickers used in this notebook, no password required).
suppressPackageStartupMessages({
# Zelig has been archived on CRAN, so it is installed from the source archive
install.packages("https://cran.r-project.org/src/contrib/Archive/Zelig/Zelig_4.2-1.tar.gz", repos=NULL, type="source")
library(plm)
library(car)
library(lme4)
library(gplots)
library(lubridate) # for aggregation, grouping by
library(dplyr) # for aggregation, grouping by
library(zoo)
library(tidyr) # pivot table
library(texreg)
library(Zelig)
library(tidyverse) # Modern data science library
library(lmtest) # for heteroskedasticity analysis and coefficient tests
library(xts)
library(huxtable)
})
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
df <- read.csv('../input/fixed-income-returns-and-macro-trends/JPMaQS_Quantamental_Indicators.csv',header=TRUE, stringsAsFactors=FALSE)
df <- as.data.frame(df)[,c("cid","xcat", "real_date", "value")]
rownames(df) <- NULL
str(df)
unique(df$xcat)
unique(df$cid)
'data.frame': 3072750 obs. of 4 variables:
$ cid : chr "AUD" "AUD" "AUD" "AUD" ...
$ xcat : chr "CPIC_SA_P1M1ML12" "CPIC_SA_P1M1ML12" "CPIC_SA_P1M1ML12" "CPIC_SA_P1M1ML12" ...
$ real_date: chr "2000-01-03" "2000-01-04" "2000-01-05" "2000-01-06" ...
$ value : num 1.24 1.24 1.24 1.24 1.24 ...
- 'CPIC_SA_P1M1ML12'
- 'CPIC_SJA_P3M3ML3AR'
- 'CPIC_SJA_P6M6ML6AR'
- 'CPIH_SA_P1M1ML12'
- 'CPIH_SJA_P3M3ML3AR'
- 'CPIH_SJA_P6M6ML6AR'
- 'DU02YXR_NSA'
- 'DU02YXR_VT10'
- 'DU05YXR_NSA'
- 'DU05YXR_VT10'
- 'EQXR_NSA'
- 'EQXR_VT10'
- 'FXXR_NSA'
- 'FXXR_VT10'
- 'INFTEFF_NSA'
- 'INTRGDP_NSA_P1M1ML12_3MMA'
- 'INTRGDPv5Y_NSA_P1M1ML12_3MMA'
- 'PCREDITBN_SJA_P1M1ML12'
- 'PCREDITGDP_SJA_D1M1ML12'
- 'RGDP_SA_P1Q1QL4_20QMA'
- 'RYLDIRS02Y_NSA'
- 'RYLDIRS05Y_NSA'
- 'AUD'
- 'CAD'
- 'CHF'
- 'CLP'
- 'COP'
- 'CZK'
- 'EUR'
- 'GBP'
- 'HUF'
- 'IDR'
- 'ILS'
- 'INR'
- 'JPY'
- 'KRW'
- 'MXN'
- 'NOK'
- 'NZD'
- 'PLN'
- 'SEK'
- 'THB'
- 'TRY'
- 'TWD'
- 'USD'
- 'ZAR'
2. Prepare data for panel regression analysis #
2.1. Specify analysis. Select features, targets, cross-sections and start date #
In the examples below we use various types of quantamental macroeconomic information at a monthly frequency (month-end status) and relate it to the next month’s 5-year interest rate swap fixed receiver returns for the period 2000 to 2022 and for 6 major developed markets: the U.S. (USD), the euro area (EUR), Japan (JPY), the UK (GBP), Canada (CAD), and Australia (AUD). The description of each JPMaQS category is available on the Macro Quantamental Academy, under JPMorgan Markets (password protected), or on Kaggle (just for the tickers used in this notebook).
feats <- c('CPIC_SJA_P6M6ML6AR', 'RYLDIRS02Y_NSA', 'INTRGDPv5Y_NSA_P1M1ML12_3MMA')
targs <- c('DU05YXR_VT10', 'DU05YXR_NSA')
feat1 <- feats[1] # 'CPIC_SJA_P6M6ML6AR' as covariate
feat2 <- feats[2] # 'RYLDIRS02Y_NSA' as covariate
feat3 <- feats[3] # 'INTRGDPv5Y_NSA_P1M1ML12_3MMA' as covariate
targ1 <- targs[1] # 'DU05YXR_VT10' as target
targ2 <- targs[2] # 'DU05YXR_NSA' as target
start_date <- "2000-01-01"
cids_sel <- c('EUR', 'GBP', 'USD', 'JPY', 'CAD', 'AUD')
2.2. Filter dataset for desired start date and cross-sections #
filt_date <- df$real_date > start_date # keep observations after the start date
filt_cid <- df$cid %in% cids_sel # keep selected cross-sections only
dfx <- df[filt_date & filt_cid, ] # filtered dataset
2.3. Create lagged features’ dataframes #
The section below selects the relevant features, pivots the dataframes to wide format, aggregates the data to a monthly (month-end) basis, and lags the features by one period, so that the current month’s data is used to predict next month’s returns.
feats = c(feat1, feat2, feat3)
for (i in 1:length(feats)) {
feat = feats[i]
dfx_feat <- dfx[dfx$xcat == feat, ] # select covariate (feature)
tbd_feat <- dfx_feat %>% pivot_wider(names_from = cid, values_from = value) # pivot dataframe to wide format
xdd_feat <- xts(tbd_feat[cids_sel], as.Date(tbd_feat$real_date)) # use real_date as index
xdm_feat <- period.apply(xdd_feat, endpoints(xdd_feat, on="months"), last) # convert to end-of-month values
xdm_feat <- stats::lag(xdm_feat, k=1) # lag by 1 period, i.e. delayed use
real_date <- index(xdm_feat)
tbm_feat <- cbind(real_date, data.frame(coredata(xdm_feat))) %>% pivot_longer(cols=any_of(cids_sel)) %>% rename(cid = name) # convert back to long data frame
tbm_feat <- tbm_feat[order(tbm_feat$cid),] # order by cid
tbm_feat$xcat = paste0(feat, "_L1M") # add _L1M to feature name and add the name to created df
assign(paste0("tbm_feat", i), tbm_feat)
}
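The lag convention deserves a note: for plain ts objects, stats::lag(x, k = 1) shifts the series forward in time (a lead), whereas for xts objects, as used here, it produces the conventional one-period lag. A minimal toy illustration of the behaviour relied on above (the toy series is hypothetical, not from the dataset):
# Toy illustration (not part of the notebook's data pipeline): for xts objects,
# stats::lag(x, k = 1) makes the row dated t carry the value observed at t-1.
toy <- xts(1:3, order.by = as.Date(c("2020-01-31", "2020-02-29", "2020-03-31")))
toy_lag <- stats::lag(toy, k = 1)
print(toy_lag) # NA, 1, 2: February's row now holds January's value, and so on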
2.4. Create targets’ dataframes #
targs = c(targ1, targ2)
for (i in 1:length(targs)) {
targ = targs[i]
dfx_targ = dfx[dfx$xcat == targ, ] # select target
tbd_targ = dfx_targ %>% pivot_wider(names_from = cid, values_from = value) # pivot dataframe to wider format
xdd_targ = xts(tbd_targ[cids_sel], as.Date(tbd_targ$real_date)) # use real_date as index
xdm_targ = xts(apply(xdd_targ, 2, apply.monthly, sum), index(xdd_targ)[endpoints(xdd_targ, on="months")]) # convert to monthly sums
real_date <- index(xdm_targ)
tbm_targ <- cbind(real_date, data.frame(coredata(xdm_targ))) %>% pivot_longer(cols=any_of(cids_sel)) %>% rename(cid = name) # transform back to long dataframe
tbm_targ <- tbm_targ[order(tbm_targ$cid),] # order by cid
tbm_targ$xcat = targ
assign(paste0("tbm_targ", i), tbm_targ)
}
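As an aside, the monthly return sums can also be taken directly on the multi-column xts object, which keeps the date index intact without the extra re-wrapping step. A sketch of this alternative (it assumes xdd_targ from the last loop iteration is still in scope and, unlike the loop above, drops missing days via na.rm = TRUE):
# Alternative sketch (not the notebook's original code): end-of-month return sums
# computed directly on the multi-column xts object.
xdm_alt <- period.apply(xdd_targ, endpoints(xdd_targ, on = "months"),
                        function(w) colSums(w, na.rm = TRUE))
tail(xdm_alt, 3)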
2.5. Create combined dataframes for the three regression examples #
tbm1 = rbind(tbm_feat1, tbm_targ1) # merge two dataframes. Feature 1, Target 1
tbx1 = tbm1 %>% pivot_wider(names_from = xcat, values_from = value) # pivot to wide df
dfx_pan1 <- pdata.frame(tbx1, index=c("cid","real_date"))
tbm2 = rbind(tbm_feat2, tbm_targ2) # merge two dataframes. Feature 2, Target 2
tbx2 = tbm2 %>% pivot_wider(names_from = xcat, values_from = value) # pivot to wide df
dfx_pan2 <- pdata.frame(tbx2, index=c("cid","real_date"))
tbm3 = rbind(tbm_feat3, tbm_targ2) # merge two dataframes. Feature 3, Target 2
tbx3 = tbm3 %>% pivot_wider(names_from = xcat, values_from = value) # pivot to wide df
dfx_pan3 <- pdata.frame(tbx3, index=c("cid","real_date"))
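Before estimating, it can help to verify the panel structure that pdata.frame produced. This quick optional check is not part of the original notebook; its output should match the "Unbalanced Panel: n = 6, T = ..., N = ..." headers reported by plm below.
# Optional check: panel dimensions and a peek at the combined data frame
# for the first regression example.
pdim(dfx_pan1) # cross-sections, time periods, (un)balancedness
head(dfx_pan1) # cid/real_date index plus lagged feature and target columns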
3. Compare standard pooled regression with time random effects #
3.1. Example 1: Deflating significance #
The simplistic way of checking significance would be to stack up all country data and run a pooled regression. Alternatively, one can check the significance of a correlation coefficient for the stacked data set. In both cases, the implicit assumption is that the experiences of the different markets are independent and unrelated. By contrast, the method suggested in “Testing macro trading factors” requires us to run a panel regression with period-specific random effects.
We run our analysis using the standard plm package in R on the quantamental data set. The JPMaQS ticker for the inflation indicator is CPIC_SJA_P6M6ML6AR and the target variable is DU05YXR_VT10 (description here).
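For reference, the correlation-based check mentioned above takes a single line; the sketch below is not the post's recommended method and, like the pooled regression, treats all country-months as independent observations.
# Naive benchmark (not part of the original notebook): significance of the plain
# correlation coefficient on the stacked data; cor.test drops incomplete cases itself.
cor.test(as.numeric(dfx_pan1$CPIC_SJA_P6M6ML6AR_L1M),
         as.numeric(dfx_pan1$DU05YXR_VT10))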
form1 = paste(targ1, "~1 +", paste0(feat1, "_L1M"))
pool1 <- plm(form1, model="pooling", data=dfx_pan1)
summary(pool1)
Pooling Model
Call:
plm(formula = form1, data = dfx_pan1, model = "pooling")
Unbalanced Panel: n = 6, T = 216-277, N = 1583
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-20.44023 -2.18271 0.16423 2.50893 15.04703
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
(Intercept) 0.648450 0.169995 3.8145 0.0001417 ***
CPIC_SJA_P6M6ML6AR_L1M -0.220925 0.081715 -2.7036 0.0069330 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 23603
Residual Sum of Squares: 23494
R-Squared: 0.004602
Adj. R-Squared: 0.0039724
F-statistic: 7.3094 on 1 and 1581 DF, p-value: 0.006933
The pooled regression diagnoses high significance of the 1-month lagged core CPI trend for duration returns. As should be expected, the relation between inflation and subsequent duration returns has been negative, and according to the pooled model the probability that the relationship is not accidental has been over 99% (indicated by a p-value below 0.01). The post argues that the chosen inflation indicator is particularly important in the U.S. and that, in fact, the statistic vastly exaggerates the significance of the indicator for the pool of selected countries. This is a classic example in which treating countries’ experiences as independent leads to pseudo-replication. The results are plausibly misleading with respect to the general importance of core inflation rates across countries. To demonstrate this point, we look at the period-specific random effects model:
rets1 <- plm(form1, model = "random", effect = "time", data=dfx_pan1)
summary(rets1)
Oneway (time) effect Random Effect Model
(Swamy-Arora's transformation)
Call:
plm(formula = form1, data = dfx_pan1, effect = "time", model = "random")
Unbalanced Panel: n = 6, T = 216-277, N = 1583
Effects:
var std.dev share
idiosyncratic 6.620 2.573 0.445
time 8.253 2.873 0.555
theta:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.5913 0.6566 0.6566 0.6498 0.6566 0.6566
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-12.4009 -1.3821 0.0638 -0.0072 1.5523 10.4376
Coefficients:
Estimate Std. Error z-value Pr(>|z|)
(Intercept) 0.490390 0.218245 2.2470 0.02464 *
CPIC_SJA_P6M6ML6AR_L1M -0.105520 0.068414 -1.5424 0.12298
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 10496
Residual Sum of Squares: 10478
R-Squared: 0.0017562
Adj. R-Squared: 0.0011248
Chisq: 2.3789 on 1 DF, p-value: 0.12298
The relation between the month-end core CPI trend and next month’s return still shows as negative, but the statistical probability of this relation is only 87-88%, below the level that many researchers would judge as significant. Also, theta (the degree of effective cross-sectional de-meaning of the data) has been near 2/3, indicating the importance of period-specific effects in explaining the returns in the panel and testifying to the risk of pseudo-replication in pools.
For illustration, see below the period-specific effects by month for the last two years:
print(tail(ranef(rets1, effect = "time"),24), 3)
2021-03-31 2021-04-30 2021-05-31 2021-06-30 2021-07-30 2021-08-31 2021-09-30
-0.1621 0.4628 -0.0991 -0.3644 2.8663 -0.9117 -5.7688
2021-10-29 2021-11-30 2021-12-31 2022-01-31 2022-02-28 2022-03-31 2022-04-29
-7.1200 0.4569 -0.9377 -4.4365 -2.0373 -4.7015 -2.4582
2022-05-31 2022-06-30 2022-07-29 2022-08-31 2022-09-30 2022-10-31 2022-11-30
-0.2491 -2.5410 2.5658 -4.0595 -4.2020 0.3827 1.7394
2022-12-30 2023-01-31 2023-02-03
-3.3615 2.8229 0.8966
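A complementary way to see the deflation of significance (an optional cross-check, not part of the post's method) is to keep the pooled point estimates but compute standard errors that are robust to cross-sectional and serial dependence, for example Driscoll-Kraay errors:
# Optional cross-check (not in the original notebook): pooled coefficients with
# Driscoll-Kraay (SCC) standard errors, robust to cross-sectional dependence.
coeftest(pool1, vcov. = function(x) vcovSCC(x, type = "HC0"))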
3.2. Example 2: Confirming significance #
Period-specific random effects regressions can not only help to reject the significance of particular indicators; they can also help to confirm it, as shown in the next example. Here we investigate the predictive significance for duration returns of the real 2-year IRS yield, measured as the difference between the IRS yield and formulaic inflation expectations based on recent CPI trends and effective inflation targets. The JPMaQS ticker is RYLDIRS02Y_NSA and we use DU05YXR_NSA as the target (description here). The pooled regression below shows very high significance of month-end real yields for subsequent returns. This relation has naturally been positive, with high yields predicting high returns, and with a statistical probability of more than 99.99%.
form2 = paste(targ2, "~1 +", paste0(feat2, "_L1M"))
pool2 <- plm(form2, model="pooling", data=dfx_pan2)
summary(pool2)
Pooling Model
Call:
plm(formula = form2, data = dfx_pan2, model = "pooling")
Unbalanced Panel: n = 6, T = 253-277, N = 1620
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-7.568227 -0.571400 0.015524 0.570489 5.303937
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
(Intercept) 0.071400 0.027757 2.5723 0.01019 *
RYLDIRS02Y_NSA_L1M 0.082863 0.015989 5.1824 2.465e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 1981.5
Residual Sum of Squares: 1949.2
R-Squared: 0.016328
Adj. R-Squared: 0.01572
F-statistic: 26.8577 on 1 and 1618 DF, p-value: 2.4651e-07
We run the period-specific random effects regression to check the significance of the relationship:
rets2 <- plm(form2, model = "random", effect = "time", data=dfx_pan2)
summary(rets2)
Oneway (time) effect Random Effect Model
(Swamy-Arora's transformation)
Call:
plm(formula = form2, data = dfx_pan2, effect = "time", model = "random")
Unbalanced Panel: n = 6, T = 253-277, N = 1620
Effects:
var std.dev share
idiosyncratic 0.5815 0.7626 0.481
time 0.6266 0.7916 0.519
theta:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.5660 0.6340 0.6340 0.6304 0.6340 0.6340
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-5.9147 -0.4006 -0.0095 -0.0004 0.4146 3.8556
Coefficients:
Estimate Std. Error z-value Pr(>|z|)
(Intercept) 0.077428 0.051608 1.5003 0.1335
RYLDIRS02Y_NSA_L1M 0.070252 0.017300 4.0607 4.892e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 949.22
Residual Sum of Squares: 939.5
R-Squared: 0.01024
Adj. R-Squared: 0.0096283
Chisq: 16.4895 on 1 DF, p-value: 4.8919e-05
In this case, the period-specific random effects regression confirms the high significance of the predictor: the positive coefficient is similar to that of the pooled regression, and the statistical probability remains above 99.99%.
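As an optional diagnostic (not shown in the post), one can also test formally whether time effects are present in the pooled model, for example with a Breusch-Pagan Lagrange multiplier test from plm:
# Optional diagnostic (not in the original notebook): LM test for the presence
# of time effects in the pooled model of example 2.
plmtest(pool2, effect = "time", type = "bp")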
3.3. Example 3: Enhancing significance #
Sometimes, the significance test based on random effects regression even reinforces the evidence of significance provided by the simplistic pooled analysis. This would plausibly be the case if relative (de-meaned) values of the features and targets have a close relation. We look at the predictive power of estimated GDP growth trends for duration returns. The JPMaQS ticker is INTRGDPv5Y_NSA_P1M1ML12_3MMA and the target is the same as in the previous example, DU05YXR_NSA (description here).
form3 = paste(targ2, "~1 +", paste0(feat3, "_L1M"))
pool3 <- plm(form3, model="pooling", data=dfx_pan3)
summary(pool3)
Pooling Model
Call:
plm(formula = form3, data = dfx_pan3, model = "pooling")
Unbalanced Panel: n = 6, T = 259-277, N = 1631
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-7.6702975 -0.5635270 -0.0053684 0.5804448 5.2240057
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
(Intercept) 0.090940 0.027681 3.2852 0.001041 **
INTRGDPv5Y_NSA_P1M1ML12_3MMA_L1M -0.020007 0.010958 -1.8258 0.068065 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 1989.6
Residual Sum of Squares: 1985.5
R-Squared: 0.0020422
Adj. R-Squared: 0.0014295
F-statistic: 3.3335 on 1 and 1629 DF, p-value: 0.068065
Running the period-specific random effects regression in fact strengthens the evidence for the significance of growth trends:
rets3 <- plm(form3, model="random", effect="time", data=dfx_pan3)
summary(rets3)
Oneway (time) effect Random Effect Model
(Swamy-Arora's transformation)
Call:
plm(formula = form3, data = dfx_pan3, effect = "time", model = "random")
Unbalanced Panel: n = 6, T = 259-277, N = 1631
Effects:
var std.dev share
idiosyncratic 0.5834 0.7638 0.477
time 0.6396 0.7998 0.523
theta:
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.5691 0.6368 0.6368 0.6341 0.6368 0.6368
Residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-5.9494 -0.3883 -0.0118 -0.0012 0.4157 3.9001
Coefficients:
Estimate Std. Error z-value Pr(>|z|)
(Intercept) 0.095441 0.051829 1.8415 0.06555 .
INTRGDPv5Y_NSA_P1M1ML12_3MMA_L1M -0.024132 0.012348 -1.9543 0.05066 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Total Sum of Squares: 950.93
Residual Sum of Squares: 948.56
R-Squared: 0.0024934
Adj. R-Squared: 0.001881
Chisq: 3.81943 on 1 DF, p-value: 0.050662
Growth trends are often better indicators of relative country performance, because they are comparable across countries and fairly exact, as opposed to survey data and early indicators, which are timelier and better suited to tracking changes in the absolute growth performance of a particular country. Hence, the consideration of period-specific effects, which effectively relates relative country performance to relative returns, could plausibly reinforce their importance.
Indeed, the period-specific random effects regression here shows a slightly higher significance of the predictor, with a statistical probability close to 95%.
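To make the relative-performance intuition concrete, one can also estimate the time fixed effects (within) version of the model, which relates period-demeaned growth trends to period-demeaned returns. The sketch below is optional and not part of the post:
# Optional sketch (not in the original notebook): time fixed-effects (within) model,
# i.e. feature and target are demeaned by period, isolating the relative
# cross-country relation.
wi3 <- plm(form3, model = "within", effect = "time", data = dfx_pan3)
summary(wi3)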
4. Summarizing the regressions #
R offers various packages for displaying regression results in a table, for example huxreg from the huxtable package.
4.1. Example 1: Deflating significance with random time effects #
huxreg("Pool" = pool1, "Time Random" = rets1, error_pos = "below", stars = c(`*` = 0.1, `**` = 0.05, `***` = 0.01), statistics = c('P-value' = 'p.value'), number_format = "%.3f", error_format = "[{statistic}]",
note = "{stars}. T statistics in brackets.")
| | Pool | Time Random |
|---|---|---|
| (Intercept) | 0.648 *** | 0.490 ** |
| | [3.815] | [2.247] |
| CPIC_SJA_P6M6ML6AR_L1M | -0.221 *** | -0.106 |
| | [-2.704] | [-1.542] |
| P-value | 0.007 | 0.123 |

*** p < 0.01; ** p < 0.05; * p < 0.1. T statistics in brackets.
4.2. Example 2: Confirming significance with random time effects #
huxreg("Pool" = pool2, "Time Random" = rets2, error_pos = "below", stars = c(`*` = 0.1, `**` = 0.05, `***` = 0.01), statistics = c('P-value' = 'p.value'), number_format = "%.3f")
| | Pool | Time Random |
|---|---|---|
| (Intercept) | 0.071 ** | 0.077 |
| | (0.028) | (0.052) |
| RYLDIRS02Y_NSA_L1M | 0.083 *** | 0.070 *** |
| | (0.016) | (0.017) |
| P-value | 2.465e-07 | 4.892e-05 |

*** p < 0.01; ** p < 0.05; * p < 0.1.
4.3. Example 3: Enhancing significance with random time effects #
huxreg("Pool" = pool3, "Time Random" = rets3, error_pos = "below", stars = c(`*` = 0.1, `**` = 0.05, `***` = 0.01), statistics = c('P-value' = 'p.value'), number_format = "%.3f")
| | Pool | Time Random |
|---|---|---|
| (Intercept) | 0.091 *** | 0.095 * |
| | (0.028) | (0.052) |
| INTRGDPv5Y_NSA_P1M1ML12_3MMA_L1M | -0.020 * | -0.024 * |
| | (0.011) | (0.012) |
| P-value | 0.068 | 0.051 |

*** p < 0.01; ** p < 0.05; * p < 0.1.
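The same huxreg tables can also be written to a standalone file, for example with huxtable's quick_html. A minimal sketch (the output file name is hypothetical and write permission in the working directory is assumed):
# Optional export (not in the original notebook): save the example 3 summary table
# as a standalone HTML file.
tab3 <- huxreg("Pool" = pool3, "Time Random" = rets3,
               stars = c(`*` = 0.1, `**` = 0.05, `***` = 0.01))
quick_html(tab3, file = "panel_regression_summary.html", open = FALSE)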
The END