// change directory
// Paths must be wrapped in straight double quotes ("), not typographic quotes.
// for pc (Windows); replace ... with the rest of your path:
cd "C:\Users\..."
// for mac the direction of the slash is different and there is no drive letter
// (forward slashes are also accepted by Stata on Windows):
cd "/Users/..."
// import
// Stata dataset read directly from a URL; clear discards the data currently in memory
use "http://www.ats.ucla.edu/stat/data/hs0", clear
// Excel worksheet; firstrow treats the first row as variable names.
// clear added so the command also works when a dataset is already loaded.
import excel "C:\folder\class.xls", sheet("Sheet1") firstrow clear
// comma-separated text file
import delimited "C:\folder\class.csv", delimiter(comma) clear
// export
// write the data in memory to a delimited text file (filename is a placeholder)
export delimited using filename
// some basic commands
// The bare names below are a list of commands worth knowing, not a runnable
// sequence: several of them need arguments, as noted.
browse    // open the data in the read-only data browser
describe  // list the variables with their storage types and labels
codebook  // detailed summary of each variable
count     // number of observations
list      // print observations in the Results window
keep      // keep the named variables or matching observations (needs arguments)
drop      // drop the named variables or matching observations (needs arguments)
save      // write the data in memory to a .dta file
use       // load a .dta file into memory (needs a filename)
// list the chosen variables for observations 7 through 11;
// clean removes the table lines, noobs hides the observation numbers
// NOTE(review): "sum" is read as a variable name here - confirm such a variable exists
list id math science socst sum science2 in 7/11, clean noobs
// browse only the listed variables
browse id math science
// create id numbers 1,2,3,... for each observation
// _n is the current observation number
gen id = _n
// wildcard *
// describe every variable whose name starts with "micro"
describe micro*
// save data dictionary
// describe, replace overwrites the dataset in memory with one row per variable,
// so preserve the real data first and restore it afterwards.
preserve
describe, replace clear
// firstrow(variables) writes the column names as a header row
export excel using dictionaryName, firstrow(variables)
restore
// save output to a log file
log using output.log, replace text
// <enter commands here!>
log close
// install software
// replace <softwarename> with the package name from SSC
ssc install <softwarename>
———————————————
// label variables
label variable year "year of visit"
// label values
// string variable; don't change value labels
// encode creates a numeric variable whose value labels are the original strings,
// numbered in alphabetical order
encode sex, generate(sex_num)
// string variable; change value labels
// sex_num already exists from the encode above (a second encode into the same
// name stops with "variable sex_num already defined"), so only define the new
// label set and attach it. Check with "label list sex_num" beforehand that the
// codes 1/2/3 really correspond to these categories.
label define sexfmt 1 "Female" 2 "Male" 3 "Other", modify
label values sex_num sexfmt
// numeric categorical variables without labels
// one label set can be attached to several variables at once
label define timefmt 1 "short" 2 "long"
label values q1 q2 q3 timefmt
// numeric continuous variables
// cut age into groups; the labels state the ranges actually covered
// (group 1 runs up to, but not including, 20; group 4 includes 30)
recode age (min/19.99999 = 1 "under 20 years old") ///
           (20/24.99999  = 2 "20-24 years old")    ///
           (25/29.99999  = 3 "25-29 years old")    ///
           (30/max       = 4 "30 years old or over"), gen(age_cat)
// numeric categorical variables with labels
// collapse categories; codes 6 and 8 become the extended missing values .b and .c
recode race_num (1=1 "American Indian or Alaska Native") (2=2 Asian) (3 4=3 Black) ///
                (10 11 12=4 White) (5 7 9=5 "Other/Unknown") (6=.b) (8=.c), gen(race2)
numlabel, add // prefix numeric values to value labels
ta race2, m // show missing values when tabulate
// create variables
generate id1 = _n // Create a simple id variable
// region is 1 for the listed zip codes and missing otherwise
gen region = 1 if inlist(zip, 63116, 63110, 63119)
// create a variable and deal with missing values
// Stata treats missing (.) as larger than any number, so "math > 50" is also
// true when math is missing; missings must be excluded explicitly.
// A variable can only be generated once; later changes use replace.
gen mathhigh = 1 if math > 50 & !mi(math)
replace mathhigh = 0 if mi(mathhigh) & !mi(math)
tabstat math, by(mathhigh) stat(mean sd min max) // check
// alternative: start from 0 and switch the high scores to 1
// (drop the version built above first so that gen does not fail)
capture drop mathhigh
gen mathhigh = 0 if math != .
replace mathhigh = 1 if math > 50 & math != .
// gen sum2=A+B+C
// if A or B or C has missing values, sum will be missing
// use egen to ignore missing
// (rowtotal treats missing values as 0 when adding)
egen sum2 = rowtotal(math science socst)
————————————————
// missing values and numeric codes
// recode the sentinel value 9999 to the extended missing value .a
replace weight = .a if weight == 9999
// convert multiple variables: numeric codes to missing
// 99 becomes .a and 88 becomes .b in each listed variable
mvdecode race income read write , mv(99=.a \ 88=.b)
// convert all missing values to numeric codes
// every missing value in the listed variables becomes 999
mvencode race income read write , mv(999)
// graphs
// histogram of read with a normal density curve overlaid
histogram read, normal
// one box plot of science per category of ses
graph box science, over(ses)
graph twoway scatter math science // scatter plot (math on the y axis, science on the x axis)
twoway lfit math science // linear model (fitted line only)
twoway (scatter math science) (lfit math science ) // overlay linear model over scatter plot
// correlation
// pairwise correlations; st() is the abbreviation of the star() option
pwcorr v1-v6, st(.05)
/*
v1-v6 is a variable range: v1, v6 and every variable stored between them
(v2 v3 v4 v5 if the dataset is ordered that way - check with describe);
attach * to correlations significant at the 95% confidence level
*/
// Bonferroni-Adjusted
// print(.05) shows only the correlations significant at the 5% level
pwcorr v1-v6, print(.05) bonferroni
// t test
// sdtest is to test equal variances
// two-sample t test between groups
sdtest v, by(group)
ttest v, by(group) // if equal variances
ttest v, by(group) unequal // if unequal variances between groups
// two-sample t test between two variables
sdtest mathScore == englishScore
// unpaired: treat the two variables as independent samples
// (add the unequal option if sdtest rejects equal variances)
ttest mathScore == englishScore, unpaired
// one-sample t test
// Note: "sdtest score == 100" tests whether the standard deviation of score
// equals 100; it is a separate variance test, not a prerequisite for the
// one-sample t test below, which tests whether the mean of score equals 100.
sdtest score == 100
ttest score == 100
// chi-square
// two-way table of v1 by v2 with Pearson's chi-squared test
tabulate v1 v2, chi2
// linear regression
// outcome math regressed on science plus categorical predictors;
// i. marks a factor variable, ib2.race uses race level 2 as the base category
regress math science i.gender ib2.race
// logistic regression
// DV, IV and IV2 are placeholders: binary outcome, predictor, categorical predictor
logistic DV IV i.IV2
———————————————–
// merge two data files with matched ids
// step1: check duplicates on the merge key
// (merge 1:1 requires id to identify observations uniquely in both files)
duplicates report id
duplicates tag id, gen(dups)
ta dups
// step2: sort both datasets by id
// clear is required on use: the dups variable created above changed the data
// in memory, so a plain "use" would refuse to replace it
use data1, clear
sort id
describe
save data1, replace
use data2, clear
sort id
describe
save data2, replace
// step3: merge, then inspect how the observations matched
use data1, clear
merge 1:1 id using data2
// _merge: 1 = only in data1 (master), 2 = only in data2 (using), 3 = matched
ta _merge
describe
save data3, replace
// save Stata file to be read in an older version
Use the saveold command in place of save (for example: saveold data3_old, replace). Details:
http://www.stata.com/support/faqs/data-management/save-for-previous-version/
This post is a summary of what I learned in the Stata Skill Lab. Special thanks to Ben Cooper, the instructor of the course; some of the commands in this post come from his course materials.