# Restore the objects saved in BANK1.Rdata into the workspace, then display
# the BANK1 data frame in full.
load(file = "BANK1.Rdata")
print(BANK1)
## X Employee EducLev JobGrade YrHired YrBorn Gender YrsPrior PCJob Salary
## 1 1 1 3 1 92 69 Male 1 No 32.00
## 2 2 2 1 1 81 57 Female 1 No 39.10
## 3 3 3 1 1 83 60 Female 0 No 33.20
## 4 4 4 2 1 87 55 Female 7 No 30.60
## 5 5 5 3 1 92 67 Male 0 No 29.00
## 6 6 6 3 1 92 71 Female 0 No 30.50
## 7 7 7 3 1 91 68 Female 0 No 30.00
## 8 8 8 3 1 87 62 Male 2 No 27.00
## 9 9 9 1 1 91 33 Female 0 No 34.00
## 10 10 10 3 1 86 64 Female 0 No 29.50
## 11 11 11 3 1 86 61 Female 2 No 26.80
## 12 12 12 2 1 87 58 Female 8 No 31.30
## 13 13 13 2 1 86 58 Female 0 No 31.20
## 14 14 14 2 1 85 37 Female 6 No 34.70
## 15 15 15 3 1 91 62 Female 0 No 30.00
## 16 16 16 3 1 92 68 Female 0 No 31.00
## 17 17 17 3 1 89 65 Female 0 No 27.00
## 18 18 18 2 1 87 58 Female 9 No 29.60
## 19 19 19 3 1 90 51 Female 6 No 32.60
## 20 20 20 2 1 91 66 Female 3 No 29.60
## 21 21 21 3 1 91 59 Female 2 No 29.50
## 22 22 22 2 1 92 67 Male 3 No 31.00
## 23 23 23 1 1 90 50 Female 0 No 28.50
## 24 24 24 2 1 92 62 Male 4 No 26.70
## 25 25 25 3 1 92 71 Male 1 No 30.75
## 26 26 26 3 1 92 68 Male 1 No 29.50
## 27 27 27 2 1 79 35 Female 6 No 42.20
## 28 28 28 1 1 82 47 Female 0 No 37.60
## 29 29 29 1 1 83 55 Female 6 No 34.00
## 30 30 30 2 1 91 62 Female 7 No 33.00
## 31 31 31 1 1 88 60 Female 4 No 28.76
## 32 32 32 1 1 84 51 Female 0 No 35.40
## 33 33 33 3 1 92 52 Male 8 No 31.00
## 34 34 34 2 1 77 49 Female 2 No 38.80
## 35 35 35 2 1 81 53 Female 0 No 34.30
## 36 36 36 1 1 76 48 Female 0 No 35.00
## 37 37 37 3 1 92 70 Female 2 Yes 34.60
## 38 38 38 2 1 93 65 Female 4 No 28.50
## 39 39 39 1 1 84 55 Female 0 No 29.50
## 40 40 40 3 1 92 69 Male 2 No 30.50
## 41 41 41 3 1 90 63 Female 1 No 34.20
## 42 42 42 1 1 80 44 Female 0 No 43.60
## 43 43 43 5 1 88 60 Female 0 Yes 33.50
## 44 44 44 3 1 83 58 Female 1 No 33.00
## 45 45 45 1 1 77 51 Female 0 No 45.30
## 46 46 46 1 1 78 42 Male 3 No 38.80
## 47 47 47 1 1 85 55 Female 0 No 29.90
## 48 48 48 3 1 90 44 Male 10 No 31.20
## 49 49 49 1 1 80 53 Female 0 No 34.00
## 50 50 50 2 1 93 42 Female 0 No 30.45
## 51 51 51 1 1 92 37 Male 3 No 35.50
## 52 52 52 1 1 91 51 Female 10 Yes 34.00
## 53 53 53 2 1 88 64 Female 0 No 29.10
## 54 54 54 1 1 87 31 Female 0 No 29.65
## 55 55 55 3 1 80 48 Female 1 No 29.20
## 56 56 56 3 1 86 58 Female 0 Yes 29.80
## 57 57 57 2 1 79 49 Female 0 No 33.50
## 58 58 58 1 1 87 40 Female 0 No 34.00
## 59 59 59 1 1 86 56 Female 0 No 29.60
## 60 60 60 3 1 77 44 Female 0 No 34.00
## 61 61 61 2 2 92 58 Female 8 No 37.25
## 62 62 62 2 2 89 65 Male 3 No 33.00
## 63 63 63 3 2 91 69 Female 0 No 28.60
## 64 64 64 5 2 90 54 Female 1 Yes 36.00
## 65 65 65 3 2 91 61 Female 4 Yes 37.30
## 66 66 66 2 2 88 38 Male 4 No 29.90
## 67 67 67 1 2 84 42 Female 8 No 31.50
## 68 68 68 3 2 90 63 Female 4 Yes 41.40
## 69 69 69 1 2 78 51 Female 5 No 32.74
## 70 70 70 3 2 92 70 Male 1 No 33.50
## 71 71 71 1 2 90 64 Female 9 No 32.00
## 72 72 72 1 2 86 45 Female 0 No 30.80
## 73 73 73 5 2 92 48 Female 3 Yes 42.00
## 74 74 74 3 2 91 60 Male 0 No 34.00
## 75 75 75 2 2 79 52 Female 0 No 32.50
## 76 76 76 2 2 86 49 Female 10 No 31.70
## 77 77 77 5 2 92 60 Male 0 No 36.50
## 78 78 78 3 2 91 73 Male 0 No 33.00
## 79 79 79 2 2 87 37 Female 0 No 31.20
## 80 80 80 5 2 87 55 Female 0 No 34.00
## 81 81 81 3 2 89 65 Female 0 No 33.00
## 82 82 82 5 2 91 66 Female 4 No 33.90
## 83 83 83 1 2 92 64 Female 9 Yes 39.00
## 84 84 84 2 2 83 43 Male 18 No 34.92
## 85 85 85 5 2 92 62 Male 5 No 39.00
## 86 86 86 1 2 87 46 Female 0 No 34.00
## 87 87 87 2 2 89 61 Female 7 No 31.90
## 88 88 88 5 2 92 69 Male 1 No 37.00
## 89 89 89 5 2 91 67 Male 0 No 34.00
## 90 90 90 5 2 92 60 Female 2 No 36.40
## 91 91 91 1 2 80 48 Female 1 Yes 38.20
## 92 92 92 1 2 80 44 Female 0 No 35.30
## 93 93 93 3 2 92 69 Male 2 No 34.50
## 94 94 94 3 2 83 62 Female 0 No 30.50
## 95 95 95 4 2 93 68 Male 2 No 30.00
## 96 96 96 5 2 87 61 Female 0 Yes 37.30
## 97 97 97 4 2 90 66 Female 0 No 40.20
## 98 98 98 3 2 90 68 Male 0 No 35.50
## 99 99 99 1 2 84 52 Female 0 No 35.00
## 100 100 100 3 2 91 59 Female 3 No 38.00
## 101 101 101 1 2 86 57 Female 0 No 35.30
## 102 102 102 2 2 81 35 Female 0 No 34.10
## 103 103 103 3 3 91 52 Female 5 Yes 43.20
## 104 104 104 2 3 80 47 Female 5 No 36.10
## 105 105 105 5 3 88 63 Female 3 No 34.60
## 106 106 106 3 3 90 64 Male 0 No 36.00
## 107 107 107 5 3 88 66 Female 2 No 36.20
## 108 108 108 3 3 88 60 Female 0 No 37.50
## 109 109 109 3 3 91 58 Female 12 No 41.00
## 110 110 110 2 3 85 52 Female 0 No 35.60
## 111 111 111 3 3 90 62 Female 5 No 39.80
## 112 112 112 4 3 84 37 Female 4 Yes 41.30
## 113 113 113 3 3 86 51 Female 7 No 42.50
## 114 114 114 3 3 91 58 Female 8 Yes 45.80
## 115 115 115 5 3 90 47 Female 6 No 34.90
## 116 116 116 5 3 91 69 Male 0 No 41.50
## 117 117 117 3 3 90 70 Female 0 No 38.00
## 118 118 118 4 3 89 57 Female 0 No 35.00
## 119 119 119 3 3 89 54 Female 0 No 40.00
## 120 120 120 3 3 90 66 Male 0 No 36.00
## 121 121 121 2 3 86 36 Female 0 No 33.70
## 122 122 122 2 3 90 66 Male 4 No 36.30
## 123 123 123 3 3 92 68 Female 2 Yes 38.00
## 124 124 124 5 3 91 65 Female 0 No 39.50
## 125 125 125 2 3 88 61 Female 5 No 36.30
## 126 126 126 3 3 87 60 Female 2 No 32.50
## 127 127 127 2 3 83 45 Female 6 No 37.00
## 128 128 128 5 3 92 62 Female 1 No 32.60
## 129 129 129 3 3 91 69 Female 0 No 36.00
## 130 130 130 5 3 92 59 Female 0 No 35.00
## 131 131 131 5 3 92 62 Female 5 Yes 43.60
## 132 132 132 3 3 87 48 Female 0 No 33.80
## 133 133 133 1 3 74 44 Female 0 No 35.30
## 134 134 134 1 3 79 53 Female 6 No 42.40
## 135 135 135 5 3 90 64 Male 0 No 39.50
## 136 136 136 2 3 70 33 Female 10 No 43.50
## 137 137 137 5 3 89 49 Male 1 No 42.00
## 138 138 138 3 3 74 35 Female 9 No 40.30
## 139 139 139 4 3 89 52 Male 5 No 44.00
## 140 140 140 1 3 70 42 Female 2 No 40.66
## 141 141 141 3 3 82 57 Female 1 No 39.70
## 142 142 142 5 3 89 56 Female 5 No 45.00
## 143 143 143 5 3 88 60 Female 0 No 43.90
## 144 144 144 4 3 87 55 Female 3 No 38.00
## 145 145 145 5 3 90 63 Female 3 No 39.02
## 146 146 146 5 4 90 62 Male 3 No 44.50
## 147 147 147 5 4 91 65 Male 1 No 41.00
## 148 148 148 5 4 89 58 Male 3 No 44.00
## 149 149 149 5 4 89 65 Male 0 No 44.00
## 150 150 150 5 4 90 63 Female 4 No 42.50
## 151 151 151 5 4 88 58 Female 3 No 40.26
## 152 152 152 5 4 90 66 Male 1 No 44.50
## 153 153 153 1 4 82 45 Female 9 No 35.50
## 154 154 154 5 4 89 66 Male 0 No 42.50
## 155 155 155 5 4 88 63 Female 0 No 44.00
## 156 156 156 5 4 89 64 Male 2 No 45.00
## 157 157 157 2 4 80 48 Female 4 No 44.40
## 158 158 158 3 4 78 51 Female 0 No 38.00
## 159 159 159 5 4 91 68 Male 0 No 41.80
## 160 160 160 1 4 72 40 Male 0 No 45.50
## 161 161 161 3 4 90 43 Male 4 No 42.50
## 162 162 162 5 4 92 45 Female 12 No 44.00
## 163 163 163 3 4 76 36 Female 8 Yes 54.30
## 164 164 164 3 4 69 48 Female 0 No 44.80
## 165 165 165 3 4 89 52 Male 4 No 47.00
## 166 166 166 5 4 80 54 Female 0 No 43.80
## 167 167 167 1 4 83 56 Female 4 Yes 48.00
## 168 168 168 5 4 86 56 Female 0 No 42.70
## 169 169 169 3 4 81 55 Female 1 Yes 48.50
## 170 170 170 3 4 79 46 Female 0 No 42.00
## 171 171 171 2 4 79 42 Female 1 No 45.50
## 172 172 172 3 4 84 58 Female 0 No 44.50
## 173 173 173 2 4 82 55 Female 2 No 51.20
## 174 174 174 5 5 88 61 Male 0 No 47.50
## 175 175 175 5 5 87 58 Female 0 No 44.50
## 176 176 176 5 5 87 64 Male 0 No 47.00
## 177 177 177 5 5 89 54 Male 10 No 47.00
## 178 178 178 3 5 78 49 Female 4 No 43.10
## 179 179 179 5 5 87 58 Male 2 No 49.00
## 180 180 180 5 5 87 62 Male 0 No 48.50
## 181 181 181 3 5 87 60 Female 5 No 45.00
## 182 182 182 5 5 79 46 Female 5 No 52.50
## 183 183 183 5 5 89 62 Male 2 No 47.50
## 184 184 184 5 5 88 64 Male 0 No 48.00
## 185 185 185 5 5 87 46 Male 4 No 46.50
## 186 186 186 5 5 83 55 Female 2 No 61.50
## 187 187 187 5 5 86 58 Female 2 No 50.00
## 188 188 188 5 5 83 49 Female 2 No 61.80
## 189 189 189 4 5 79 52 Female 0 No 43.00
## 190 190 190 5 5 84 59 Male 1 No 47.00
## 191 191 191 5 5 86 58 Female 6 No 58.50
## 192 192 192 5 5 79 55 Male 7 No 55.00
## 193 193 193 3 5 71 41 Male 3 No 57.00
## 194 194 194 5 5 78 38 Male 1 No 57.00
## 195 195 195 5 6 81 46 Male 0 No 60.00
## 196 196 196 3 6 82 54 Male 0 No 60.00
## 197 197 197 5 6 76 36 Male 4 No 59.00
## 198 198 198 5 6 83 44 Male 0 No 60.00
## 199 199 199 5 6 75 50 Male 0 No 65.00
## 200 200 200 5 6 75 39 Male 1 No 52.00
## 201 201 201 5 6 73 38 Male 0 No 58.00
## 202 202 202 4 6 74 42 Male 0 No 60.00
## 203 203 203 5 6 56 30 Male 0 No 74.00
## 204 204 204 3 6 61 35 Male 0 No 95.00
## 205 205 205 5 6 59 34 Male 0 No 97.00
## 206 206 206 5 6 63 33 Male 0 No 88.00
## 207 207 207 5 6 60 36 Male 0 No 94.00
## 208 208 208 5 6 62 33 Female 0 No 30.00
Categorical variable: a variable whose values fall into a limited number of distinct groups (categories).
Nominal variable: a categorical variable whose categories cannot be ordered.
Ordinal variable: a categorical variable whose possible values are ordered.
Qualitative variables or Categorical data
EducLev, JobGrade, Gender, PCJob
Nominal variables: Gender, PCJob, EducLev
Ordinal data: JobGrade
Dummy variables: a variable that takes only the value 0 or 1 to indicate the absence or presence of some categorical effect that may be expected to shift the outcome.
Quantitative variables: YrHired, YrBorn, Salary
Discrete variables: YrHired, YrBorn
Continuous variables: Salary
Functional data: Data that “provides information about curves, surfaces or anything else varying over a continuum.”
Aemet data: a series of daily summaries from 73 Spanish weather stations, selected for the period 1980-2009. The dataset contains geographic information for each station and the 1980-2009 averages of daily temperature, daily precipitation and daily wind speed. Source: Meteorological State Agency of Spain (AEMET), Government of Spain.
library(fda.usc)
## Warning: package 'fda.usc' was built under R version 4.0.5
## Loading required package: fda
## Warning: package 'fda' was built under R version 4.0.3
## Loading required package: splines
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.0.3
## Loading required package: fds
## Warning: package 'fds' was built under R version 4.0.3
## Loading required package: rainbow
## Warning: package 'rainbow' was built under R version 4.0.3
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.0.3
## Loading required package: pcaPP
## Warning: package 'pcaPP' was built under R version 4.0.3
## Loading required package: RCurl
##
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
##
## matplot
## Loading required package: mgcv
## Warning: package 'mgcv' was built under R version 4.0.3
## Loading required package: nlme
## Warning: package 'nlme' was built under R version 4.0.3
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
## ----------------------------------------------------------------------------------
## Functional Data Analysis and Utilities for Statistical Computing
## fda.usc version 2.0.2 (built on 2020-02-17) is now loaded
## fda.usc is running sequentially usign foreach package
## Please, execute ops.fda.usc() once to run in local parallel mode
## Deprecated functions: min.basis, min.np, anova.hetero, anova.onefactor, anova.RPm
## New functions: optim.basis, optim.np, fanova.hetero, fanova.onefactor, fanova.RPm
## ----------------------------------------------------------------------------------
# Load the AEMET example data made available by fda.usc and list its parts.
data(aemet)
names(aemet)
## [1] "df" "temp" "wind.speed" "logprec"
# The temperature component is stored as an "fdata" (functional data) object.
class(aemet$temp)
## [1] "fdata"
names(aemet$temp)
## [1] "data" "argvals" "rangeval" "names"
# Dimensions of the temperature data: 73 rows by 365 columns.
dim(aemet$temp)
## [1] 73 365
# First evaluation points, followed by the interval they are defined on.
head(aemet$temp$argvals)
## [1] 0.5 1.5 2.5 3.5 4.5 5.5
# Name the component in full: `aemet$tem` resolved only through the partial
# matching that `$` performs on lists, which silently breaks if another
# component starting with "tem" is ever added.
aemet$temp$rangeval
## [1] 0 365
# Overall range of the temperature values.
range(aemet$temp)
## [1] -1.613333 29.053846
# `df` is not an fdata object, whereas `temp` is.
is.fdata(aemet$df)
## [1] FALSE
is.fdata(aemet$temp)
## [1] TRUE
# Draw the station map and the temperature curves side by side. Stations with
# latitude below 31 are coloured red, all others blue, in both panels.
old_par <- par(mfrow = c(1, 2), oma = c(0, 0, 0, 0))
col1 <- ifelse(aemet$df$latitude < 31, "red", "blue")
plot(aemet$df[, c("longitude", "latitude")], col = col1, lwd = 2)
plot(aemet$temp, col = col1, lwd = 2)
# Put the previous graphics settings back so later plots are not forced into
# the 1 x 2 layout.
par(old_par)
Surveys are physical or digital questionnaires that gather both qualitative and quantitative data from subjects. One situation in which you might conduct a survey is gathering attendee feedback after an event. This can provide a sense of what attendees enjoyed, what they wish was different, and areas you can improve or save money on during your next event for a similar audience.
Because they can be sent out physically or digitally, surveys present the opportunity for distribution at scale. They can also be inexpensive; running a survey can cost nothing if you use a free tool. If you wish to target a specific group of people, partnering with a market research firm to get the survey in the hands of that demographic may be worth the money.
Something to watch out for when crafting and running surveys is the effect of bias, including:
Collection bias: It can be easy to accidentally write survey questions with a biased lean. Watch out for this when creating questions to ensure your subjects answer honestly and aren’t swayed by your wording.
Subject bias: Because your subjects know their responses will be read by you, their answers may be biased toward what seems socially acceptable. For this reason, consider pairing survey data with behavioral data from other collection methods to get the full picture.
Example: Each time your customers make a purchase, tracking that data can allow you to make decisions about targeted marketing efforts and understand your customer base better.
Interviews and focus groups consist of talking to subjects face-to-face about a specific topic or issue. Interviews tend to be one-on-one, and focus groups are typically made up of several people. You can use both to gather qualitative and quantitative data.
Through interviews and focus groups, you can gather feedback from people in your target audience about new product features. Seeing them interact with your product in real-time and recording their reactions and responses to questions can provide valuable data about which product features to pursue.
As is the case with surveys, these collection methods allow you to ask subjects anything you want about their opinions, motivations, and feelings regarding your product or brand. It also introduces the potential for bias. Aim to craft questions that don’t lead them in one particular direction.
One downside of interviewing and conducting focus groups is they can be time-consuming and expensive. If you plan to conduct them yourself, it can be a lengthy process. To avoid this, you can hire a market research facilitator to organize and conduct interviews on your behalf.
Observing people interacting with your website or product can be useful for data collection because of the candor it offers. If your user experience is confusing or difficult, you can witness it in real-time.
Yet, setting up observation sessions can be difficult. You can use a third-party tool to record users’ journeys through your site or observe a user’s interaction with a beta version of your site or product.
While less accessible than other data collection methods, observations enable you to see firsthand how users interact with your product or site. You can leverage the qualitative and quantitative data gleaned from this to make improvements and double down on points of success.
To gather behavioral data, you can implement pixels and cookies. These are both tools that track users’ online behavior across websites and provide insight into what content they’re interested in and typically engage with.
You can also track users’ behavior on your company’s website, including which parts are of the highest interest, whether users are confused when using it, and how long they spend on product pages. This can enable you to improve the website’s design and help users navigate to their destination.
Inserting a pixel is often free and relatively easy to set up. Implementing cookies may come with a fee but could be worth it for the quality of data you’ll receive. Once pixels and cookies are set, they gather data on their own and don’t need much maintenance, if any.
It’s important to note: Tracking online behavior can have legal and ethical privacy implications. Before tracking users’ online behavior, ensure you’re in compliance with local and industry data privacy standards.
Online forms are beneficial for gathering qualitative data about users, specifically demographic data or contact information. They’re relatively inexpensive and simple to set up, and you can use them to gate content or registrations, such as webinars and email newsletters.
You can then use this data to contact people who may be interested in your product, build out demographic profiles of existing customers, and in remarketing efforts, such as email workflows and content recommendations.
Monitoring your company’s social media channels for follower engagement is an accessible way to track data about your audience’s interests and motivations. Many social media platforms have analytics built in, but there are also third-party social platforms that give more detailed, organized insights pulled from multiple channels.
You can use data collected from social media to determine which issues are most important to your followers. For instance, you may notice that the number of engagements dramatically increases when your company posts about its sustainability efforts.
Everything in R is an object.
R has 6 basic (atomic) data types; the five most commonly used are listed below (the sixth, raw, is rarely needed):
character, “a”, “swc”
numeric (real or decimal), 2, 11.5
integer, 2L (the L tells R to store this as integer)
logical, TRUE, FALSE
complex, 1+4i
# Atomic vectors: create one vector of each of four basic types and print it.
# Plain numbers are stored as doubles.
x <- c(1,2,3)
x
## [1] 1 2 3
# The L suffix stores the values as integers instead.
x1 <- c(1L, 2L, 3L)
x1
## [1] 1 2 3
# A logical vector.
y <- c(TRUE, TRUE, FALSE, FALSE)
y
## [1] TRUE TRUE FALSE FALSE
# A character vector.
z <- c("Sarah", "Tracy", "Jon")
z
## [1] "Sarah" "Tracy" "Jon"
# typeof() reports how each vector is stored internally.
typeof(x)
## [1] "double"
typeof(x1)
## [1] "integer"
typeof(y)
## [1] "logical"
typeof(z)
## [1] "character"
# class() gives the same answers except that the double vector is "numeric".
class(x)
## [1] "numeric"
class(x1)
## [1] "integer"
class(y)
## [1] "logical"
class(z)
## [1] "character"
# length() returns the number of elements in each vector.
length(x)
## [1] 3
length(x1)
## [1] 3
length(y)
## [1] 4
length(z)
## [1] 3
# Build a 2 x 3 matrix; by default the values fill the columns first.
m <- matrix(1:6, nrow = 2, ncol = 3)
m
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
# A matrix can also be made by giving a plain vector a dim attribute.
m <- 1:10
dim(m) <- c(2, 5)
m
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 3 5 7 9
## [2,] 2 4 6 8 10
# Bind two vectors together as columns, then as rows.
x <- 1:3
y <- 10:12
cbind(x, y)
## x y
## [1,] 1 10
## [2,] 2 11
## [3,] 3 12
rbind(x, y)
## [,1] [,2] [,3]
## x 1 2 3
## y 10 11 12
# byrow = TRUE fills the matrix one row at a time instead.
mdat <- matrix(
  c(1, 2, 3, 11, 12, 13),
  nrow = 2,
  ncol = 3,
  byrow = TRUE
)
dim(mdat)
## [1] 2 3
mdat
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 11 12 13
# Pick out the element in row 2, column 3.
mdat[2, 3]
## [1] 13
In R, lists act as containers. Unlike atomic vectors, the contents of a list are not restricted to a single mode and can encompass any mixture of data types. Lists are sometimes called generic vectors, because the elements of a list can be of any type of R object, even lists containing further lists. This property makes them fundamentally different from atomic vectors.
A list is a special type of vector. Each element can be a different type.
# A list whose four elements each have a different type.
x <- list(1, "a", TRUE, 1+4i)
x
## [[1]]
## [1] 1
##
## [[2]]
## [1] "a"
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] 1+4i
# List elements can be named and can hold whole objects: here a string, an
# integer sequence and the first rows of the mtcars data frame.
xlist <- list(a = "Karthik Ram", b = 1:10, data = head(mtcars))
xlist
## $a
## [1] "Karthik Ram"
##
## $b
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $data
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# names() returns the element names of the list.
names(xlist)
## [1] "a" "b" "data"
A data frame is a very important data type in R. It’s pretty much the de facto data structure for most tabular data and what we use for statistics.
A data frame is a special type of list where every element of the list has same length (i.e. data frame is a “rectangular” list).
Some additional information on data frames:
# Build a ten-row data frame with a letter label column and two integer
# columns.
dat <- data.frame(id = letters[1:10], x = 1:10, y = 11:20)
dat
## id x y
## 1 a 1 11
## 2 b 2 12
## 3 c 3 13
## 4 d 4 14
## 5 e 5 15
## 6 f 6 16
## 7 g 7 17
## 8 h 8 18
## 9 i 9 19
## 10 j 10 20
# A data frame is a list underneath, carrying the class "data.frame".
is.list(dat)
## [1] TRUE
class(dat)
## [1] "data.frame"
# Single cell by position: row 1, column 3.
dat[1, 3]
## [1] 11
# Two equivalent ways to extract the whole y column as a vector.
dat[["y"]]
## [1] 11 12 13 14 15 16 17 18 19 20
dat$y
## [1] 11 12 13 14 15 16 17 18 19 20
5. factors
Conceptually, factors are variables in R which take on a limited number of different values; such variables are often referred to as categorical variables. One of the most important uses of factors is in statistical modeling; since categorical variables enter into statistical models differently than continuous variables, storing data as factors ensures that the modeling functions will treat such data correctly.
Factors in R are stored as a vector of integer values with a corresponding set of character values to use when the factor is displayed. The factor function is used to create a factor. The only required argument to factor is a vector of values which will be returned as a vector of factor values. Both numeric and character variables can be made into factors, but a factor’s levels will always be character values. You can see the possible levels for a factor through the levels command.
# Turn a numeric vector into a factor; its levels are the distinct values.
# NOTE(review): `data` is also the name of the function called earlier in this
# file (`data(aemet)`); consider renaming the variable if this code is reused.
data <- c(1, 2, 2, 3, 1, 2, 3, 3, 1, 2, 3, 3, 1)
fdata <- factor(data)
fdata
## [1] 1 2 2 3 1 2 3 3 1 2 3 3 1
## Levels: 1 2 3
# Display labels can be supplied when the factor is created ...
rdata <- factor(data, labels = c("I", "II", "III"))
rdata
## [1] I II II III I II III III I II III III I
## Levels: I II III
# ... or an existing factor can be relabelled by assigning to its levels.
levels(fdata) <- c("I", "II", "III")
fdata
## [1] I II II III I II III III I II III III I
## Levels: I II III
# Month names in the order they were recorded.
mons <- c(
  "March", "April", "January", "November", "January", "September",
  "October", "September", "November", "August", "January", "November",
  "November", "February", "May", "August", "July", "December",
  "August", "August", "September", "November", "February", "April"
)
# With default levels the table is sorted alphabetically and months that never
# occur are left out.
mons <- factor(mons)
table(mons)
## mons
## April August December February January July March May
## 2 4 1 2 3 1 1 1
## November October September
## 5 1 3
# Use the built-in `month.name` constant (January ... December) as the level
# set so the table follows calendar order and includes the unused month;
# ordered = TRUE makes the result an ordered factor.
mons <- factor(mons, levels = month.name, ordered = TRUE)
table(mons)
## mons
## January February March April May June July August
## 3 2 1 2 1 0 1 4
## September October November December
## 3 1 5 1
https://towardsdatascience.com/text-representation-for-data-science-and-text-mining-719ce81f3c84
https://knowthecode.io/labs/basics-of-digitizing-data/episode-14