## This script is a companion to Appendix A.1 of Modern ART -- Using R. ## Load this file as an R-script. ## Or, copy its contents to the clipboard and paste it in a New script. ## Select some lines (until the next blank line), and feed them to R using Ctrl-R. ## Study what happens. ## This script lets the student get a feel of the concepts and methods behind R. ## While executing it, simultaneously read the text in Appendix A.1. ## Legenda: ## means explanation, #! is an important remark, #? is a question/exercise. ## Other non-empty lines are R-statements to be executed. ## Everything after # on a line fed to R is treated as comment. const <- 1/sqrt(2*pi) str(const) ## structure of the 'object' const: numeric, with value 0.399 numbers <- c(0, 3:5, 20, 0) numbers ## 0 3 4 5 20 0 vec <- 1:20 ## This is short for: vec <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20) ## Now make a matrix containing the elements in vec, stored row-wise: mat <- matrix(vec, ncol=5, byrow=TRUE) mat ## print the object mat matrix(vec, nrow=4) ## 1:20 arranged in R's standard *column major order* (that is, by column) (words <- c("Testing", "testing", "one", "two", "three")) ## putting brackets () around the statement induces R to print the result a.matrix <- rbind(numbers, 1:6) a.matrix ## this matrix arises by 'binding' two row vectors ## For-loops and vector operations a <- numeric(20) ## initialize a vector of 20 reals (all zero) for (i in 1:20) a[i] <- i^2 a ## the first 20 squares, filled in 'Pascal' way b <- (1:20)^2 ## needs no initialization b ## this vector operation is a lot simpler than a for-loop ## Matrix and vector operations a.matrix %*% 2:7 ## a.matrix multiplied by (2,3,4,5,6,7)' 1:3 * 1:3 ## elementwise product of two vectors (1:3)^2 ## gives the same result t(1:3)*1:3 ## elementwise product (note: now as a matrix) 1:3 %*% 1:3 ## inner product (as a matrix) t(1:3)%*%1:3 ## same as above crossprod(1:3) ## again the same, now using the cross-products function 1:3 %*% t(1:3) ## outer product 1:3%o%1:3 ## same as above crossprod(t(1:3))## again the same cross-products as above words[c(3:5, 1)] ## words[3], words[4], words[5], words[1] numbers[-4] ## the fourth element of vector numbers is omitted numbers[numbers < 10] ## only the small elements sqrt(2)^2 == 2 ## should be TRUE sqrt(2)^2 - 2 ## but this is the actual difference (machine precision) numbers2 <- numbers numbers2[4:5] <- c(17,19) numbers2 ## partial replacement ## Indexation of matrices goes separately for rows and columns: a.matrix ## is the original matrix a.matrix[2:1, numbers>4] ## rows 2 and 1; those columns that have numbers>4 a.matrix[, c(1,3,5)] ## all rows in natural order; columns 1,3,5 ## A little about lists (list1 <- list(num=numbers, char=words)) list1$char ## The item in list1 named char ## Put brackets where needed: 9*3 ^-2; (9*3)^-2 ## not equal; power precedes multiplication -2^-.5 ## is not equal to: (-2)^-.5 ## because 'unary minus' has lower priority than exponentiation; ?Syntax ## gives details 2^3^2 ## 2^9, not 8^2; exponentiation is from right to left ## Adding two vectors of different length c(1,0)+(-1:4) ## see book p. 330--331 for what happens ## Numbers can be NA (not applicable), NaN (not a number), Inf c(2, 4, NA, 8, 10) == NA ## comparisons with NA all result in NA, not in TRUE/FALSE is.na(c(2, 4, NA, 8, 10)) ## use is.na to check on missing values; the 3rd element is NA ######################################################################### ## This script is a companion to Appendix A.2 of Modern ART -- Using R. ## Load this file as an R-script. ## Or, copy its contents to the clipboard and paste it in a New script. ## Select some lines (until the next blank line), and feed them to R using Ctrl-R. ## Study what happens, and answer any questions given (after #?). ## While executing this script, read Appendix A.1 of Modern ART -- Using R. ## Legenda: ## means explanation, #! is an important remark, #? is a question/exercise. ## Other non-empty lines are R-statements to be executed. ## Everything after # on a line fed to R is treated as comment. ## We want to predict the future behavior of a portfolio consisting ## of a mix of two stocks: ## Reading the weekly stock prices of "Tyres" over a period of 3 years: Tyres <- scan(n=157) 307 315 316 314 324 310 311 295 278 295 318 343 342 323 328 303 309 307 315 296 313 316 317 306 307 326 330 330 333 341 337 353 356 359 349 351 359 360 363 342 337 334 352 357 360 368 363 366 366 365 381 401 401 421 422 425 417 427 436 440 432 406 401 420 420 424 416 403 400 392 391 390 406 415 429 420 415 420 417 445 447 449 447 450 460 470 495 507 518 516 522 524 484 497 490 500 464 458 446 450 471 485 486 501 506 502 494 497 465 478 490 496 517 506 497 483 474 495 499 483 477 481 474 479 431 438 431 436 434 453 442 445 461 463 481 490 470 480 497 507 503 508 485 492 490 519 506 539 542 553 558 562 532 510 512 504 474 ## Reading the stock prices of "Liquor" in the same period: Liquor <- scan(n=157) 781 784 757 741 728 726 743 746 768 752 758 754 779 777 780 815 791 779 802 797 800 860 873 854 855 846 824 833 838 851 827 847 859 853 926 935 952 962 958 943 938 949 1000 1003 1018 1022 1026 1019 1037 1026 999 1011 994 1036 1030 1028 1005 1006 1005 970 983 984 980 996 975 976 987 1008 1057 1054 1040 1045 1057 1101 1096 1094 1108 1102 1104 1105 1092 1098 1113 1076 1060 1054 1054 1057 1078 1077 1091 1093 1079 1086 1064 1134 1167 1217 1155 1171 1174 1213 1218 1250 1329 1350 1334 1318 1315 1310 1368 1370 1389 1420 1377 1366 1424 1455 1475 1478 1458 1430 1409 1407 1414 1409 1402 1386 1392 1391 1445 1448 1462 1474 1482 1495 1525 1547 1538 1436 1465 1454 1460 1476 1521 1588 1581 1587 1539 1574 1537 1518 1530 1500 1554 1532 1498 ## Plot the two series: par(mfrow=c(1,2)) ## this gives you two adjacent plots plot(Tyres, xlab="Week", type="l") plot(Liquor, xlab="Week", type="l") ## See Fig A.1; you can resize the plot with your mouse ## Generate the log-returns: Tyres.lr <- log(Tyres[-1] / ## Tyres[2:157] / Tyres[-length(Tyres)]) ## Tyres[1:156] Liquor.lr <- log(Liquor[-1]/Liquor[-length(Liquor)]) ## Make some plots to visualize the association and the behavior ## of the two series, see Fig A.2: par(mfrow=c(1,3)) plot(Tyres.lr, Liquor.lr, main="Scatterplot log-returns") plot(Tyres.lr, xlab="Week", type="l") plot(Liquor.lr, xlab="Week", type="l") ## To judge normality visually, see Fig. A.3 ## 1) produce the normal Q-Q plots: par(mfrow=c(1,3)) qqnorm(Tyres.lr, main="Normal Q-Q plot Tyres") qqline(Tyres.lr, col="red") qqnorm(Liquor.lr, main="Normal Q-Q plot Liquor") qqline(Liquor.lr, col="red") ## See the comments p. 334 ## 2) produce a histogram and compare ## it with a fitted normal density: hist(Tyres.lr, prob=T, breaks=21) curve(dnorm(x, mean=mean(Tyres.lr), sd=sd(Tyres.lr)), add=T, col="red") lines(density(Tyres.lr), col="blue") ## See the comments preceding (A.1) ## Test if the log-returns follow a normal distribution by computing the ## Jarque-Bera test statistic (A.1): ## For the Tyres data: x <- Tyres.lr - mean(Tyres.lr) m2 <- mean(x^2); m3 <- mean(x^3); m4 <- mean(x^4) S2 <- m3^2/m2^3; K <- m4/m2^2 - 3 (JB <- length(x)/6 * (S2 + K^2/4)) (p.value <- 1-pchisq(JB, df=2)) #? Compute the same quantities for the Liquor stock. ## Estimate the parameters of the underlying bivariate normal distribution: Tyres.lr.mean <- mean(Tyres.lr) Tyres.lr.sd <- sd(Tyres.lr) Liquor.lr.mean <- mean(Liquor.lr) Liquor.lr.sd <- sd(Liquor.lr) Tyres.Liquor.lr.corr <- cor(Tyres.lr, Liquor.lr) ## Now look at the future performance of a portfolio of equal ## parts of these stocks, based on the normal bivariate distribution ## with parameters as computed above: Periods <- 104 ## two year period (actually 104 weeks) mean.X <- Periods * Tyres.lr.mean mean.Y <- Periods * Liquor.lr.mean sd.X <- sqrt(Periods * Tyres.lr.sd^2) sd.Y <- sqrt(Periods * Liquor.lr.sd^2) cov.XY <- Periods * Tyres.Liquor.lr.corr * Tyres.lr.sd * Liquor.lr.sd r.XY <- cov.XY / sd.X / sd.Y ## Simulate outcomes of S: ## In the library MASS there is an R function to do this: library(MASS) ?mvrnorm ## But we will follow the Cholesky decomposition method for n=2 ## outlined in (A.5)-(A.7) to generate a random sample: set.seed(2525) ## initialize the random number generator U <- rnorm(1000); V <- rnorm(1000) ## generate 2x1000 iid N(0,1) observations alpha <- sqrt(1/r.XY^2-1) X <- mean.X + sd.X * U Y <- mean.Y + sd.Y * (U + alpha * V) * r.XY S <- Tyres[length(Tyres)]*exp(X) + Liquor[length(Liquor)]*exp(Y) #? Print sample means, variances and correlations of (X,Y) to see if #? the procedure followed was correct. ## Draw the corresponding histogram and fitted normal density, Fig. A.4: par(mfrow=c(1,1)); S1000 <- S/1000 hist(S1000, breaks=21, prob=T) lines(density(S1000), col="blue") curve(dnorm(x, mean=mean(S1000), sd=sd(S1000)), add=T, col="red") ## Compute the sample quantiles of S: pr <- c(2.5, 5, 10, 25, 50, 75, 90, 95, 97.5, 99, 99.5)/100 S <- S/(Tyres[length(Tyres)] + Liquor[length(Liquor)]) round(100*quantile(S, pr)) ## See comments about the performance of the portfolio at the end of App. A.2 ############################################################################# ## This script follows Appendix A.3 of Modern ART -- Using R. ## Load this file as an R-script, through File-Open script-MART-UR.A.3.R.txt . ## Or, copy its contents to the clipboard and paste it in a New script. ## Select some lines (until the next blank line), and feed them to R using Ctrl-R. ## Study what happens, and answer any questions given (after #?). ## For more explanation, read the text of App A.3. ## Legenda: ## means explanation, #! is an important remark, #? is a question/exercise. ## Other non-empty lines are R-statements to be executed. ## Everything after # on a line fed to R is treated as comment. ## Our aim is to generate a portfolio of automobile policies, ## resembling a real-life portfolio. ## See also Section 9.5, in which a similar portfolio is studied. n.obs <- 10000; set.seed(4) ## 10000 observations; random seed initialized to 4 ## Think of the pseudo-random numbers generated by a computer as a very long vector. ## The function set.seed determines where in this array we want to start. ## Of the policy holders, we want about 60% to be of gender 1, 40% of gender 2. ## We do this by calling the sample()-function. ## Its first argument is a vector with the possible values. ## The second parameter is the number of observations. ## The repl= argument (with default F, i.e., FALSE) induces sampling with (T) ## or without (F) replacement. ## The prob= argument gives the (relative) probabilities of the values 1 and 2. sx <- sample(1:2, n.obs, repl=T, prob=c(6,4)) ## Note that prob=c(6,4) means that 60% is to fall in category 1, 40% in 2. ## We want the gender numbers 1 and 2 to act as class labels, so we make ## sx into a classification (factor), as follows: sx <- as.factor(sx) ## Analogously, we assign random job classes to our policies: jb <- as.factor(sample(1:3, n.obs, repl=T, prob=c(3,2,1))) ## Note that sx and jb were drawn independently, so for instance the probability ## of sx=1 and jb=3 is equal to 6/10 * 1/6. ## Between the factors re (region) and tp (type of, e.g., car) there is interaction, ## which means that the conditional probability of re=r, given tp=t, depends on t. ## We want the joint probabilities of combinations (re,tp) to be as follows: ## ## tp 1 2 3 ## re 1 .1 .05 .15 ## 2 .15 .1 .05 ## 3 .1 .1 .2 ## ## To achieve this, we first draw combinations of these factors, with probabilities ## as given in the table. re.tp <- sample(1:9, n.obs, repl=T, prob=c(2,1,3,3,2,1,2,2,4)) ## Next deduce the corresponding values of re and tp from the 'level' 1:9 of re.tp. ## The levels 1, 4 and 7 give tp=1, 2,5,8 give tp=2, 3,6,9 give tp=3. ## Analogously for region re. tp <- as.factor(c(1,2,3,1,2,3,1,2,3)[re.tp]) re <- as.factor(c(1,1,1,2,2,2,3,3,3)[re.tp]) ## The first ten elements of the vector re.tp are 3 8 2 4 4 9 7 8 6 9 ## This means that the first ten elements of tp will be 3 2 2 1 1 3 1 2 3 3 ## The first ten elements of vector re get the values 1 3 1 2 2 3 3 3 2 3 #? Look at the help with the table() function to interpret the following command; #? also, inspect the output to see if we did it right: table(list(region=re, type=tp)) ## We get a table with the numbers of policies for each (region,type) combination: ## type ## region 1 2 3 ## 1 960 527 1475 ## 2 1496 1038 465 ## 3 991 1002 2046 ## For instance, 465 policies had region=2 and type=3. ## These numbers are quite close to the expected numbers: ## type ## region 1 2 3 ## 1 1000 500 1500 ## 2 1500 1000 500 ## 3 1000 1000 2000 ## For efficiency reasons, R aims to store its data in memory. ## So it pays to be parsimonious sometimes, especially for really large datasets. ## Therefore we remove the vector re.tp that is no longer needed. ## To see how many bytes were occupied by this vector, do: object.size(re.tp) ## Apparently this vector occupies 10000*4 bytes storage space, plus 24 bytes overhead. ## To release these 40024 bytes, do rm(re.tp) ## We do some basic tests on memory usage by R: hh <- 1:2^20; object.size(hh)/length(hh) hh <- rep(0,2^20); object.size(hh)/length(hh) hh <- as.integer(hh); object.size(hh)/length(hh) ## Apparently integers occupy only 4 bytes each. ## Reals take 8 bytes; sometimes obvious integers (the 2^20 zeroes) are stored as reals. ## Note that in Pascal (with extended precision), reals take 10 bytes apiece. ## The policies had different exposure times. Each was in force for either 3 (10%), ## 6 (10%) or 12 months (80%), independent of the other characteristics of the policy: mo <- 3 * sample(1:4, n.obs, repl=T, prob=c(1,1,0,8)) ## The mean annual number of claims on each policy is a function of its characteristics. ## An increase of 1 in each level of sx, re and tp leads to 20% more claims; ## jb level is irrelevant for the claim frequency: mu <- 0.05 * c(1,1.2)[sx] * c(1,1,1)[jb] * c(1,1.2,1.44)[re] * c(1,1.2,1.44)[tp] ## The number 0.05 is the base claim frequency. ## This is the claim frequency for someone in the 'standard' cell (1,1,1,1). ## We let the drivers produce claims in a one-year period according to a Poisson process. ## The Poisson parameters to be fed to rpois are proportional to mo. y <- rpois(n.obs, mu * mo/12) ## Now we are going to make some tables. First of the number of claims per policy: table(y) ## The output is ## y ## 0 1 2 3 ## 9276 702 20 2 ## So of the 10000 policies, 9276 were claim-free ## A cross-table of the numbers of accidents by gender is created by table(list(nCl=y,gender=sx)) ## gender ## nCl 1 2 ## 0 5578 3698 ## 1 400 302 ## 2 9 11 ## 3 1 1 ## So, e.g., 3698 persons of gender 2 were claim-free ## To tabulate the number of claims for each gender-region combination, use either ## the interaction() function, or the ":" interaction operator between factors: table(list(nCl=y,gender.region=sx:re)) ## The output looks like: ## gender.region ## nCl 1:1 1:2 1:3 2:1 2:2 2:3 ## 0 1683 1686 2209 1103 1119 1476 ## 1 102 105 193 70 84 148 ## 2 2 0 7 2 4 5 ## 3 0 0 1 0 1 0 ## As you can see, the factor levels of gender and region are separated by ":" ## and the (gender,region) combinations appear lexically #? With the interaction() function, which arguments sep= and lex.order= should #? be given to produce the same output? [see ?interaction] ## In the help files of Version 2.7.0, you can find: ## sep string to construct the new level labels by joining the constituent ones. ## lex.order logical indicating if the order of factor concatenation should be ## lexically ordered ## It will be clear that we must give arguments sep=":", lex.order=TRUE. ## Compare the results of table(list(nCl=y,gender.region=interaction(sx,re))) table(list(nCl=y,gender.region=interaction(sx,re,sep=":", lex.order=TRUE))) ## To reduce time and memory needed, without actually losing (much) information, ## it often makes sense to use an aggregated version of a portfolio. ## For n=10000, the effect is small, but for a million policies, it shows. ## For every combination of risk factors, we need the total numbers of claims, ## the total exposure times and the numbers of policies. ## This is achieved by calling the aggregate() function. aggr <- aggregate(list(Expo=mo/12,nCl=y,nPol=1), list(Jb=jb,Tp=tp,Re=re,Sx=sx), sum) ## Its first argument is a list of the quantities we need totaled, ## the second argument is a list of factors to split up the portfolio. ## We list a selection of rows from the newly created aggr dataframe: aggr[sort(sample(1:54,10)),] #? How much memory is used by the aggr dataframe, how much by the #? vectors on which it is based? ## The easiest way to answer this is just to ask R: object.size(aggr) object.size(mo);object.size(y) object.size(jb);object.size(tp);object.size(re);object.size(sx) ## Note that mo and y store reals, not integers.